LLVM 20.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
73#include "llvm/Config/llvm-config.h"
74#include "llvm/IR/Attributes.h"
75#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
79#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
84#include <algorithm>
85#include <cassert>
86#include <climits>
87#include <cstdint>
88#include <deque>
89#include <functional>
90#include <iomanip>
91#include <iterator>
92#include <map>
93#include <memory>
94#include <sstream>
95#include <tuple>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
101#define DEBUG_TYPE "pipeliner"
102
103STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
104STATISTIC(NumPipelined, "Number of loops software pipelined");
105STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
106STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
107STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
108STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
109STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
110STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
111STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
112STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
113STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197namespace llvm {
198
199// A command line option to enable the CopyToPhi DAG mutation.
201 cl::init(true),
202 cl::desc("Enable CopyToPhi DAG Mutation"));
203
204/// A command line argument to force pipeliner to use specified issue
205/// width.
207 "pipeliner-force-issue-width",
208 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
209 cl::init(-1));
210
211/// A command line argument to set the window scheduling option.
214 cl::desc("Set how to use window scheduling algorithm."),
216 "Turn off window algorithm."),
218 "Use window algorithm after SMS algorithm fails."),
220 "Use window algorithm instead of SMS algorithm.")));
221
222} // end namespace llvm
223
224unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
225char MachinePipeliner::ID = 0;
226#ifndef NDEBUG
228#endif
230
232 "Modulo Software Pipelining", false, false)
238 "Modulo Software Pipelining", false, false)
239
240/// The "main" function for implementing Swing Modulo Scheduling.
241bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
242 if (skipFunction(mf.getFunction()))
243 return false;
244
245 if (!EnableSWP)
246 return false;
247
248 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
250 return false;
251
252 if (!mf.getSubtarget().enableMachinePipeliner())
253 return false;
254
255 // Cannot pipeline loops without instruction itineraries if we are using
256 // DFA for the pipeliner.
257 if (mf.getSubtarget().useDFAforSMS() &&
258 (!mf.getSubtarget().getInstrItineraryData() ||
259 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
260 return false;
261
262 MF = &mf;
263 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
264 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
265 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
266 TII = MF->getSubtarget().getInstrInfo();
267 RegClassInfo.runOnMachineFunction(*MF);
268
269 for (const auto &L : *MLI)
270 scheduleLoop(*L);
271
272 return false;
273}
274
275/// Attempt to perform the SMS algorithm on the specified loop. This function is
276/// the main entry point for the algorithm. The function identifies candidate
277/// loops, calculates the minimum initiation interval, and attempts to schedule
278/// the loop.
279bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
280 bool Changed = false;
281 for (const auto &InnerLoop : L)
282 Changed |= scheduleLoop(*InnerLoop);
283
284#ifndef NDEBUG
285 // Stop trying after reaching the limit (if any).
286 int Limit = SwpLoopLimit;
287 if (Limit >= 0) {
288 if (NumTries >= SwpLoopLimit)
289 return Changed;
290 NumTries++;
291 }
292#endif
293
294 setPragmaPipelineOptions(L);
295 if (!canPipelineLoop(L)) {
296 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
297 ORE->emit([&]() {
298 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
299 L.getStartLoc(), L.getHeader())
300 << "Failed to pipeline loop";
301 });
302
303 LI.LoopPipelinerInfo.reset();
304 return Changed;
305 }
306
307 ++NumTrytoPipeline;
308 if (useSwingModuloScheduler())
309 Changed = swingModuloScheduler(L);
310
311 if (useWindowScheduler(Changed))
312 Changed = runWindowScheduler(L);
313
314 LI.LoopPipelinerInfo.reset();
315 return Changed;
316}
317
318void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
319 // Reset the pragma for the next loop in iteration.
320 disabledByPragma = false;
321 II_setByPragma = 0;
322
323 MachineBasicBlock *LBLK = L.getTopBlock();
324
325 if (LBLK == nullptr)
326 return;
327
328 const BasicBlock *BBLK = LBLK->getBasicBlock();
329 if (BBLK == nullptr)
330 return;
331
332 const Instruction *TI = BBLK->getTerminator();
333 if (TI == nullptr)
334 return;
335
336 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
337 if (LoopID == nullptr)
338 return;
339
340 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
341 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
342
343 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
344 MDNode *MD = dyn_cast<MDNode>(MDO);
345
346 if (MD == nullptr)
347 continue;
348
349 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
350
351 if (S == nullptr)
352 continue;
353
354 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
355 assert(MD->getNumOperands() == 2 &&
356 "Pipeline initiation interval hint metadata should have two operands.");
358 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
359 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
360 } else if (S->getString() == "llvm.loop.pipeline.disable") {
361 disabledByPragma = true;
362 }
363 }
364}
365
366/// Return true if the loop can be software pipelined. The algorithm is
367/// restricted to loops with a single basic block. Make sure that the
368/// branch in the loop can be analyzed.
369bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
370 if (L.getNumBlocks() != 1) {
371 ORE->emit([&]() {
372 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
373 L.getStartLoc(), L.getHeader())
374 << "Not a single basic block: "
375 << ore::NV("NumBlocks", L.getNumBlocks());
376 });
377 return false;
378 }
379
380 if (disabledByPragma) {
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "Disabled by Pragma.";
385 });
386 return false;
387 }
388
389 // Check if the branch can't be understood because we can't do pipelining
390 // if that's the case.
391 LI.TBB = nullptr;
392 LI.FBB = nullptr;
393 LI.BrCond.clear();
394 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
395 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
396 NumFailBranch++;
397 ORE->emit([&]() {
398 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
399 L.getStartLoc(), L.getHeader())
400 << "The branch can't be understood";
401 });
402 return false;
403 }
404
405 LI.LoopInductionVar = nullptr;
406 LI.LoopCompare = nullptr;
408 if (!LI.LoopPipelinerInfo) {
409 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
410 NumFailLoop++;
411 ORE->emit([&]() {
412 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
413 L.getStartLoc(), L.getHeader())
414 << "The loop structure is not supported";
415 });
416 return false;
417 }
418
419 if (!L.getLoopPreheader()) {
420 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
421 NumFailPreheader++;
422 ORE->emit([&]() {
423 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
424 L.getStartLoc(), L.getHeader())
425 << "No loop preheader found";
426 });
427 return false;
428 }
429
430 // Remove any subregisters from inputs to phi nodes.
431 preprocessPhiNodes(*L.getHeader());
432 return true;
433}
434
435void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
437 SlotIndexes &Slots =
438 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
439
440 for (MachineInstr &PI : B.phis()) {
441 MachineOperand &DefOp = PI.getOperand(0);
442 assert(DefOp.getSubReg() == 0);
443 auto *RC = MRI.getRegClass(DefOp.getReg());
444
445 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
446 MachineOperand &RegOp = PI.getOperand(i);
447 if (RegOp.getSubReg() == 0)
448 continue;
449
450 // If the operand uses a subregister, replace it with a new register
451 // without subregisters, and generate a copy to the new register.
452 Register NewReg = MRI.createVirtualRegister(RC);
453 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
455 const DebugLoc &DL = PredB.findDebugLoc(At);
456 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
457 .addReg(RegOp.getReg(), getRegState(RegOp),
458 RegOp.getSubReg());
459 Slots.insertMachineInstrInMaps(*Copy);
460 RegOp.setReg(NewReg);
461 RegOp.setSubReg(0);
462 }
463 }
464}
465
466/// The SMS algorithm consists of the following main steps:
467/// 1. Computation and analysis of the dependence graph.
468/// 2. Ordering of the nodes (instructions).
469/// 3. Attempt to Schedule the loop.
470bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
471 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
472
474 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
476
477 MachineBasicBlock *MBB = L.getHeader();
478 // The kernel should not include any terminator instructions. These
479 // will be added back later.
480 SMS.startBlock(MBB);
481
482 // Compute the number of 'real' instructions in the basic block by
483 // ignoring terminators.
484 unsigned size = MBB->size();
486 E = MBB->instr_end();
487 I != E; ++I, --size)
488 ;
489
490 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
491 SMS.schedule();
492 SMS.exitRegion();
493
494 SMS.finishBlock();
495 return SMS.hasNewSchedule();
496}
497
507}
508
509bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
510 MachineSchedContext Context;
511 Context.MF = MF;
512 Context.MLI = MLI;
513 Context.MDT = MDT;
514 Context.PassConfig = &getAnalysis<TargetPassConfig>();
515 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
516 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
518 WindowScheduler WS(&Context, L);
519 return WS.run();
520}
521
522bool MachinePipeliner::useSwingModuloScheduler() {
523 // SwingModuloScheduler does not work when WindowScheduler is forced.
525}
526
527bool MachinePipeliner::useWindowScheduler(bool Changed) {
528 // WindowScheduler does not work for following cases:
529 // 1. when it is off.
530 // 2. when SwingModuloScheduler is successfully scheduled.
531 // 3. when pragma II is enabled.
532 if (II_setByPragma) {
533 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
534 "llvm.loop.pipeline.initiationinterval is set.\n");
535 return false;
536 }
537
540}
541
542void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
543 if (SwpForceII > 0)
544 MII = SwpForceII;
545 else if (II_setByPragma > 0)
546 MII = II_setByPragma;
547 else
548 MII = std::max(ResMII, RecMII);
549}
550
551void SwingSchedulerDAG::setMAX_II() {
552 if (SwpForceII > 0)
553 MAX_II = SwpForceII;
554 else if (II_setByPragma > 0)
555 MAX_II = II_setByPragma;
556 else
557 MAX_II = MII + SwpIISearchRange;
558}
559
560/// We override the schedule function in ScheduleDAGInstrs to implement the
561/// scheduling part of the Swing Modulo Scheduling algorithm.
563 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
564 buildSchedGraph(AA);
565 addLoopCarriedDependences(AA);
566 updatePhiDependences();
568 changeDependences();
569 postProcessDAG();
570 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
571 LLVM_DEBUG(dump());
572
573 NodeSetType NodeSets;
574 findCircuits(NodeSets);
575 NodeSetType Circuits = NodeSets;
576
577 // Calculate the MII.
578 unsigned ResMII = calculateResMII();
579 unsigned RecMII = calculateRecMII(NodeSets);
580
581 fuseRecs(NodeSets);
582
583 // This flag is used for testing and can cause correctness problems.
584 if (SwpIgnoreRecMII)
585 RecMII = 0;
586
587 setMII(ResMII, RecMII);
588 setMAX_II();
589
590 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
591 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
592
593 // Can't schedule a loop without a valid MII.
594 if (MII == 0) {
595 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
596 NumFailZeroMII++;
597 Pass.ORE->emit([&]() {
599 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
600 << "Invalid Minimal Initiation Interval: 0";
601 });
602 return;
603 }
604
605 // Don't pipeline large loops.
606 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
607 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
608 << ", we don't pipeline large loops\n");
609 NumFailLargeMaxMII++;
610 Pass.ORE->emit([&]() {
612 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
613 << "Minimal Initiation Interval too large: "
614 << ore::NV("MII", (int)MII) << " > "
615 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
616 << "Refer to -pipeliner-max-mii.";
617 });
618 return;
619 }
620
621 computeNodeFunctions(NodeSets);
622
623 registerPressureFilter(NodeSets);
624
625 colocateNodeSets(NodeSets);
626
627 checkNodeSets(NodeSets);
628
629 LLVM_DEBUG({
630 for (auto &I : NodeSets) {
631 dbgs() << " Rec NodeSet ";
632 I.dump();
633 }
634 });
635
636 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
637
638 groupRemainingNodes(NodeSets);
639
640 removeDuplicateNodes(NodeSets);
641
642 LLVM_DEBUG({
643 for (auto &I : NodeSets) {
644 dbgs() << " NodeSet ";
645 I.dump();
646 }
647 });
648
649 computeNodeOrder(NodeSets);
650
651 // check for node order issues
652 checkValidNodeOrder(Circuits);
653
654 SMSchedule Schedule(Pass.MF, this);
655 Scheduled = schedulePipeline(Schedule);
656
657 if (!Scheduled){
658 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
659 NumFailNoSchedule++;
660 Pass.ORE->emit([&]() {
662 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
663 << "Unable to find schedule";
664 });
665 return;
666 }
667
668 unsigned numStages = Schedule.getMaxStageCount();
669 // No need to generate pipeline if there are no overlapped iterations.
670 if (numStages == 0) {
671 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
672 NumFailZeroStage++;
673 Pass.ORE->emit([&]() {
675 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
676 << "No need to pipeline - no overlapped iterations in schedule.";
677 });
678 return;
679 }
680 // Check that the maximum stage count is less than user-defined limit.
681 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
682 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
683 << " : too many stages, abort\n");
684 NumFailLargeMaxStage++;
685 Pass.ORE->emit([&]() {
687 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
688 << "Too many stages in schedule: "
689 << ore::NV("numStages", (int)numStages) << " > "
690 << ore::NV("SwpMaxStages", SwpMaxStages)
691 << ". Refer to -pipeliner-max-stages.";
692 });
693 return;
694 }
695
696 Pass.ORE->emit([&]() {
698 Loop.getHeader())
699 << "Pipelined succesfully!";
700 });
701
702 // Generate the schedule as a ModuloSchedule.
703 DenseMap<MachineInstr *, int> Cycles, Stages;
704 std::vector<MachineInstr *> OrderedInsts;
705 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
706 ++Cycle) {
707 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
708 OrderedInsts.push_back(SU->getInstr());
709 Cycles[SU->getInstr()] = Cycle;
710 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
711 }
712 }
714 for (auto &KV : NewMIs) {
715 Cycles[KV.first] = Cycles[KV.second];
716 Stages[KV.first] = Stages[KV.second];
717 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
718 }
719
720 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
721 std::move(Stages));
723 assert(NewInstrChanges.empty() &&
724 "Cannot serialize a schedule with InstrChanges!");
726 MSTI.annotate();
727 return;
728 }
729 // The experimental code generator can't work if there are InstChanges.
730 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
731 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
732 MSE.expand();
733 } else if (MVECodeGen && NewInstrChanges.empty() &&
734 LoopPipelinerInfo->isMVEExpanderSupported() &&
736 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
737 MSE.expand();
738 } else {
739 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
740 MSE.expand();
741 MSE.cleanup();
742 }
743 ++NumPipelined;
744}
745
746/// Clean up after the software pipeliner runs.
748 for (auto &KV : NewMIs)
749 MF.deleteMachineInstr(KV.second);
750 NewMIs.clear();
751
752 // Call the superclass.
754}
755
756/// Return the register values for the operands of a Phi instruction.
757/// This function assume the instruction is a Phi.
759 unsigned &InitVal, unsigned &LoopVal) {
760 assert(Phi.isPHI() && "Expecting a Phi.");
761
762 InitVal = 0;
763 LoopVal = 0;
764 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
765 if (Phi.getOperand(i + 1).getMBB() != Loop)
766 InitVal = Phi.getOperand(i).getReg();
767 else
768 LoopVal = Phi.getOperand(i).getReg();
769
770 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
771}
772
773/// Return the Phi register value that comes the loop block.
774static unsigned getLoopPhiReg(const MachineInstr &Phi,
775 const MachineBasicBlock *LoopBB) {
776 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
777 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
778 return Phi.getOperand(i).getReg();
779 return 0;
780}
781
782/// Return true if SUb can be reached from SUa following the chain edges.
783static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
786 Worklist.push_back(SUa);
787 while (!Worklist.empty()) {
788 const SUnit *SU = Worklist.pop_back_val();
789 for (const auto &SI : SU->Succs) {
790 SUnit *SuccSU = SI.getSUnit();
791 if (SI.getKind() == SDep::Order) {
792 if (Visited.count(SuccSU))
793 continue;
794 if (SuccSU == SUb)
795 return true;
796 Worklist.push_back(SuccSU);
797 Visited.insert(SuccSU);
798 }
799 }
800 }
801 return false;
802}
803
804/// Return true if the instruction causes a chain between memory
805/// references before and after it.
807 return MI.isCall() || MI.mayRaiseFPException() ||
808 MI.hasUnmodeledSideEffects() ||
809 (MI.hasOrderedMemoryRef() &&
810 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
811}
812
813/// Return the underlying objects for the memory references of an instruction.
814/// This function calls the code in ValueTracking, but first checks that the
815/// instruction has a memory operand.
818 if (!MI->hasOneMemOperand())
819 return;
820 MachineMemOperand *MM = *MI->memoperands_begin();
821 if (!MM->getValue())
822 return;
823 getUnderlyingObjects(MM->getValue(), Objs);
824 for (const Value *V : Objs) {
825 if (!isIdentifiedObject(V)) {
826 Objs.clear();
827 return;
828 }
829 }
830}
831
832/// Add a chain edge between a load and store if the store can be an
833/// alias of the load on a subsequent iteration, i.e., a loop carried
834/// dependence. This code is very similar to the code in ScheduleDAGInstrs
835/// but that code doesn't create loop carried dependences.
836void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
840 for (auto &SU : SUnits) {
841 MachineInstr &MI = *SU.getInstr();
843 PendingLoads.clear();
844 else if (MI.mayLoad()) {
847 if (Objs.empty())
849 for (const auto *V : Objs) {
850 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
851 SUs.push_back(&SU);
852 }
853 } else if (MI.mayStore()) {
856 if (Objs.empty())
858 for (const auto *V : Objs) {
860 PendingLoads.find(V);
861 if (I == PendingLoads.end())
862 continue;
863 for (auto *Load : I->second) {
864 if (isSuccOrder(Load, &SU))
865 continue;
866 MachineInstr &LdMI = *Load->getInstr();
867 // First, perform the cheaper check that compares the base register.
868 // If they are the same and the load offset is less than the store
869 // offset, then mark the dependence as loop carried potentially.
870 const MachineOperand *BaseOp1, *BaseOp2;
871 int64_t Offset1, Offset2;
872 bool Offset1IsScalable, Offset2IsScalable;
873 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
874 Offset1IsScalable, TRI) &&
875 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
876 Offset2IsScalable, TRI)) {
877 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
878 Offset1IsScalable == Offset2IsScalable &&
879 (int)Offset1 < (int)Offset2) {
881 "What happened to the chain edge?");
882 SDep Dep(Load, SDep::Barrier);
883 Dep.setLatency(1);
884 SU.addPred(Dep);
885 continue;
886 }
887 }
888 // Second, the more expensive check that uses alias analysis on the
889 // base registers. If they alias, and the load offset is less than
890 // the store offset, the mark the dependence as loop carried.
891 if (!AA) {
892 SDep Dep(Load, SDep::Barrier);
893 Dep.setLatency(1);
894 SU.addPred(Dep);
895 continue;
896 }
897 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
898 MachineMemOperand *MMO2 = *MI.memoperands_begin();
899 if (!MMO1->getValue() || !MMO2->getValue()) {
900 SDep Dep(Load, SDep::Barrier);
901 Dep.setLatency(1);
902 SU.addPred(Dep);
903 continue;
904 }
905 if (MMO1->getValue() == MMO2->getValue() &&
906 MMO1->getOffset() <= MMO2->getOffset()) {
907 SDep Dep(Load, SDep::Barrier);
908 Dep.setLatency(1);
909 SU.addPred(Dep);
910 continue;
911 }
912 if (!AA->isNoAlias(
915 MMO2->getAAInfo()))) {
916 SDep Dep(Load, SDep::Barrier);
917 Dep.setLatency(1);
918 SU.addPred(Dep);
919 }
920 }
921 }
922 }
923 }
924}
925
926/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
927/// processes dependences for PHIs. This function adds true dependences
928/// from a PHI to a use, and a loop carried dependence from the use to the
929/// PHI. The loop carried dependence is represented as an anti dependence
930/// edge. This function also removes chain dependences between unrelated
931/// PHIs.
932void SwingSchedulerDAG::updatePhiDependences() {
933 SmallVector<SDep, 4> RemoveDeps;
935
936 // Iterate over each DAG node.
937 for (SUnit &I : SUnits) {
938 RemoveDeps.clear();
939 // Set to true if the instruction has an operand defined by a Phi.
940 unsigned HasPhiUse = 0;
941 unsigned HasPhiDef = 0;
942 MachineInstr *MI = I.getInstr();
943 // Iterate over each operand, and we process the definitions.
944 for (const MachineOperand &MO : MI->operands()) {
945 if (!MO.isReg())
946 continue;
947 Register Reg = MO.getReg();
948 if (MO.isDef()) {
949 // If the register is used by a Phi, then create an anti dependence.
951 UI = MRI.use_instr_begin(Reg),
952 UE = MRI.use_instr_end();
953 UI != UE; ++UI) {
954 MachineInstr *UseMI = &*UI;
955 SUnit *SU = getSUnit(UseMI);
956 if (SU != nullptr && UseMI->isPHI()) {
957 if (!MI->isPHI()) {
958 SDep Dep(SU, SDep::Anti, Reg);
959 Dep.setLatency(1);
960 I.addPred(Dep);
961 } else {
962 HasPhiDef = Reg;
963 // Add a chain edge to a dependent Phi that isn't an existing
964 // predecessor.
965 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
966 I.addPred(SDep(SU, SDep::Barrier));
967 }
968 }
969 }
970 } else if (MO.isUse()) {
971 // If the register is defined by a Phi, then create a true dependence.
973 if (DefMI == nullptr)
974 continue;
975 SUnit *SU = getSUnit(DefMI);
976 if (SU != nullptr && DefMI->isPHI()) {
977 if (!MI->isPHI()) {
978 SDep Dep(SU, SDep::Data, Reg);
979 Dep.setLatency(0);
980 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
981 &SchedModel);
982 I.addPred(Dep);
983 } else {
984 HasPhiUse = Reg;
985 // Add a chain edge to a dependent Phi that isn't an existing
986 // predecessor.
987 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
988 I.addPred(SDep(SU, SDep::Barrier));
989 }
990 }
991 }
992 }
993 // Remove order dependences from an unrelated Phi.
994 if (!SwpPruneDeps)
995 continue;
996 for (auto &PI : I.Preds) {
997 MachineInstr *PMI = PI.getSUnit()->getInstr();
998 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
999 if (I.getInstr()->isPHI()) {
1000 if (PMI->getOperand(0).getReg() == HasPhiUse)
1001 continue;
1002 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1003 continue;
1004 }
1005 RemoveDeps.push_back(PI);
1006 }
1007 }
1008 for (const SDep &D : RemoveDeps)
1009 I.removePred(D);
1010 }
1011}
1012
1013/// Iterate over each DAG node and see if we can change any dependences
1014/// in order to reduce the recurrence MII.
1015void SwingSchedulerDAG::changeDependences() {
1016 // See if an instruction can use a value from the previous iteration.
1017 // If so, we update the base and offset of the instruction and change
1018 // the dependences.
1019 for (SUnit &I : SUnits) {
1020 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1021 int64_t NewOffset = 0;
1022 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1023 NewOffset))
1024 continue;
1025
1026 // Get the MI and SUnit for the instruction that defines the original base.
1027 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1029 if (!DefMI)
1030 continue;
1031 SUnit *DefSU = getSUnit(DefMI);
1032 if (!DefSU)
1033 continue;
1034 // Get the MI and SUnit for the instruction that defins the new base.
1035 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1036 if (!LastMI)
1037 continue;
1038 SUnit *LastSU = getSUnit(LastMI);
1039 if (!LastSU)
1040 continue;
1041
1042 if (Topo.IsReachable(&I, LastSU))
1043 continue;
1044
1045 // Remove the dependence. The value now depends on a prior iteration.
1047 for (const SDep &P : I.Preds)
1048 if (P.getSUnit() == DefSU)
1049 Deps.push_back(P);
1050 for (const SDep &D : Deps) {
1051 Topo.RemovePred(&I, D.getSUnit());
1052 I.removePred(D);
1053 }
1054 // Remove the chain dependence between the instructions.
1055 Deps.clear();
1056 for (auto &P : LastSU->Preds)
1057 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1058 Deps.push_back(P);
1059 for (const SDep &D : Deps) {
1060 Topo.RemovePred(LastSU, D.getSUnit());
1061 LastSU->removePred(D);
1062 }
1063
1064 // Add a dependence between the new instruction and the instruction
1065 // that defines the new base.
1066 SDep Dep(&I, SDep::Anti, NewBase);
1067 Topo.AddPred(LastSU, &I);
1068 LastSU->addPred(Dep);
1069
1070 // Remember the base and offset information so that we can update the
1071 // instruction during code generation.
1072 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1073 }
1074}
1075
1076/// Create an instruction stream that represents a single iteration and stage of
1077/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1078/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1079/// function is an approximation of SMSchedule::finalizeSchedule with all
1080/// non-const operations removed.
1082 SMSchedule &Schedule,
1083 std::vector<MachineInstr *> &OrderedInsts,
1086
1087 // Move all instructions to the first stage from the later stages.
1088 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1089 ++Cycle) {
1090 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1091 Stage <= LastStage; ++Stage) {
1092 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1093 Cycle + Stage * Schedule.getInitiationInterval()))) {
1094 Instrs[Cycle].push_front(SU);
1095 }
1096 }
1097 }
1098
1099 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1100 ++Cycle) {
1101 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1102 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1103 for (SUnit *SU : CycleInstrs) {
1104 MachineInstr *MI = SU->getInstr();
1105 OrderedInsts.push_back(MI);
1106 Stages[MI] = Schedule.stageScheduled(SU);
1107 }
1108 }
1109}
1110
1111namespace {
1112
1113// FuncUnitSorter - Comparison operator used to sort instructions by
1114// the number of functional unit choices.
1115struct FuncUnitSorter {
1116 const InstrItineraryData *InstrItins;
1117 const MCSubtargetInfo *STI;
1119
1120 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1121 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1122
1123 // Compute the number of functional unit alternatives needed
1124 // at each stage, and take the minimum value. We prioritize the
1125 // instructions by the least number of choices first.
1126 unsigned minFuncUnits(const MachineInstr *Inst,
1127 InstrStage::FuncUnits &F) const {
1128 unsigned SchedClass = Inst->getDesc().getSchedClass();
1129 unsigned min = UINT_MAX;
1130 if (InstrItins && !InstrItins->isEmpty()) {
1131 for (const InstrStage &IS :
1132 make_range(InstrItins->beginStage(SchedClass),
1133 InstrItins->endStage(SchedClass))) {
1134 InstrStage::FuncUnits funcUnits = IS.getUnits();
1135 unsigned numAlternatives = llvm::popcount(funcUnits);
1136 if (numAlternatives < min) {
1137 min = numAlternatives;
1138 F = funcUnits;
1139 }
1140 }
1141 return min;
1142 }
1143 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1144 const MCSchedClassDesc *SCDesc =
1145 STI->getSchedModel().getSchedClassDesc(SchedClass);
1146 if (!SCDesc->isValid())
1147 // No valid Schedule Class Desc for schedClass, should be
1148 // Pseudo/PostRAPseudo
1149 return min;
1150
1151 for (const MCWriteProcResEntry &PRE :
1152 make_range(STI->getWriteProcResBegin(SCDesc),
1153 STI->getWriteProcResEnd(SCDesc))) {
1154 if (!PRE.ReleaseAtCycle)
1155 continue;
1156 const MCProcResourceDesc *ProcResource =
1157 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1158 unsigned NumUnits = ProcResource->NumUnits;
1159 if (NumUnits < min) {
1160 min = NumUnits;
1161 F = PRE.ProcResourceIdx;
1162 }
1163 }
1164 return min;
1165 }
1166 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1167 }
1168
1169 // Compute the critical resources needed by the instruction. This
1170 // function records the functional units needed by instructions that
1171 // must use only one functional unit. We use this as a tie breaker
1172 // for computing the resource MII. The instrutions that require
1173 // the same, highly used, functional unit have high priority.
1174 void calcCriticalResources(MachineInstr &MI) {
1175 unsigned SchedClass = MI.getDesc().getSchedClass();
1176 if (InstrItins && !InstrItins->isEmpty()) {
1177 for (const InstrStage &IS :
1178 make_range(InstrItins->beginStage(SchedClass),
1179 InstrItins->endStage(SchedClass))) {
1180 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1181 if (llvm::popcount(FuncUnits) == 1)
1182 Resources[FuncUnits]++;
1183 }
1184 return;
1185 }
1186 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1187 const MCSchedClassDesc *SCDesc =
1188 STI->getSchedModel().getSchedClassDesc(SchedClass);
1189 if (!SCDesc->isValid())
1190 // No valid Schedule Class Desc for schedClass, should be
1191 // Pseudo/PostRAPseudo
1192 return;
1193
1194 for (const MCWriteProcResEntry &PRE :
1195 make_range(STI->getWriteProcResBegin(SCDesc),
1196 STI->getWriteProcResEnd(SCDesc))) {
1197 if (!PRE.ReleaseAtCycle)
1198 continue;
1199 Resources[PRE.ProcResourceIdx]++;
1200 }
1201 return;
1202 }
1203 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1204 }
1205
1206 /// Return true if IS1 has less priority than IS2.
1207 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1208 InstrStage::FuncUnits F1 = 0, F2 = 0;
1209 unsigned MFUs1 = minFuncUnits(IS1, F1);
1210 unsigned MFUs2 = minFuncUnits(IS2, F2);
1211 if (MFUs1 == MFUs2)
1212 return Resources.lookup(F1) < Resources.lookup(F2);
1213 return MFUs1 > MFUs2;
1214 }
1215};
1216
1217/// Calculate the maximum register pressure of the scheduled instructions stream
1218class HighRegisterPressureDetector {
1219 MachineBasicBlock *OrigMBB;
1220 const MachineRegisterInfo &MRI;
1221 const TargetRegisterInfo *TRI;
1222
1223 const unsigned PSetNum;
1224
1225 // Indexed by PSet ID
1226 // InitSetPressure takes into account the register pressure of live-in
1227 // registers. It's not depend on how the loop is scheduled, so it's enough to
1228 // calculate them once at the beginning.
1229 std::vector<unsigned> InitSetPressure;
1230
1231 // Indexed by PSet ID
1232 // Upper limit for each register pressure set
1233 std::vector<unsigned> PressureSetLimit;
1234
1236
1238
1239public:
1240 using OrderedInstsTy = std::vector<MachineInstr *>;
1241 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1242
1243private:
1244 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1245 if (Pressures.size() == 0) {
1246 dbgs() << "[]";
1247 } else {
1248 char Prefix = '[';
1249 for (unsigned P : Pressures) {
1250 dbgs() << Prefix << P;
1251 Prefix = ' ';
1252 }
1253 dbgs() << ']';
1254 }
1255 }
1256
1257 void dumpPSet(Register Reg) const {
1258 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1259 for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1260 ++PSetIter) {
1261 dbgs() << *PSetIter << ' ';
1262 }
1263 dbgs() << '\n';
1264 }
1265
1266 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1267 Register Reg) const {
1268 auto PSetIter = MRI.getPressureSets(Reg);
1269 unsigned Weight = PSetIter.getWeight();
1270 for (; PSetIter.isValid(); ++PSetIter)
1271 Pressure[*PSetIter] += Weight;
1272 }
1273
1274 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1275 Register Reg) const {
1276 auto PSetIter = MRI.getPressureSets(Reg);
1277 unsigned Weight = PSetIter.getWeight();
1278 for (; PSetIter.isValid(); ++PSetIter) {
1279 auto &P = Pressure[*PSetIter];
1280 assert(P >= Weight &&
1281 "register pressure must be greater than or equal weight");
1282 P -= Weight;
1283 }
1284 }
1285
1286 // Return true if Reg is reserved one, for example, stack pointer
1287 bool isReservedRegister(Register Reg) const {
1288 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1289 }
1290
1291 bool isDefinedInThisLoop(Register Reg) const {
1292 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1293 }
1294
1295 // Search for live-in variables. They are factored into the register pressure
1296 // from the begining. Live-in variables used by every iteration should be
1297 // considered as alive throughout the loop. For example, the variable `c` in
1298 // following code. \code
1299 // int c = ...;
1300 // for (int i = 0; i < n; i++)
1301 // a[i] += b[i] + c;
1302 // \endcode
1303 void computeLiveIn() {
1305 for (auto &MI : *OrigMBB) {
1306 if (MI.isDebugInstr())
1307 continue;
1308 for (auto &Use : ROMap[&MI].Uses) {
1309 auto Reg = Use.RegUnit;
1310 // Ignore the variable that appears only on one side of phi instruction
1311 // because it's used only at the first iteration.
1312 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1313 continue;
1314 if (isReservedRegister(Reg))
1315 continue;
1316 if (isDefinedInThisLoop(Reg))
1317 continue;
1318 Used.insert(Reg);
1319 }
1320 }
1321
1322 for (auto LiveIn : Used)
1323 increaseRegisterPressure(InitSetPressure, LiveIn);
1324 }
1325
1326 // Calculate the upper limit of each pressure set
1327 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1328 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1329 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1330 }
1331
1332 // There are two patterns of last-use.
1333 // - by an instruction of the current iteration
1334 // - by a phi instruction of the next iteration (loop carried value)
1335 //
1336 // Furthermore, following two groups of instructions are executed
1337 // simultaneously
1338 // - next iteration's phi instructions in i-th stage
1339 // - current iteration's instructions in i+1-th stage
1340 //
1341 // This function calculates the last-use of each register while taking into
1342 // account the above two patterns.
1343 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1344 Instr2StageTy &Stages) const {
1345 // We treat virtual registers that are defined and used in this loop.
1346 // Following virtual register will be ignored
1347 // - live-in one
1348 // - defined but not used in the loop (potentially live-out)
1349 DenseSet<Register> TargetRegs;
1350 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1351 if (isDefinedInThisLoop(Reg))
1352 TargetRegs.insert(Reg);
1353 };
1354 for (MachineInstr *MI : OrderedInsts) {
1355 if (MI->isPHI()) {
1356 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1357 UpdateTargetRegs(Reg);
1358 } else {
1359 for (auto &Use : ROMap.find(MI)->getSecond().Uses)
1360 UpdateTargetRegs(Use.RegUnit);
1361 }
1362 }
1363
1364 const auto InstrScore = [&Stages](MachineInstr *MI) {
1365 return Stages[MI] + MI->isPHI();
1366 };
1367
1369 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1370 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1371 auto Reg = Use.RegUnit;
1372 if (!TargetRegs.contains(Reg))
1373 continue;
1374 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1375 if (!Inserted) {
1376 MachineInstr *Orig = Ite->second;
1377 MachineInstr *New = MI;
1378 if (InstrScore(Orig) < InstrScore(New))
1379 Ite->second = New;
1380 }
1381 }
1382 }
1383
1384 Instr2LastUsesTy LastUses;
1385 for (auto &Entry : LastUseMI)
1386 LastUses[Entry.second].insert(Entry.first);
1387 return LastUses;
1388 }
1389
1390 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1391 // iterations and check the register pressure at the point where all stages
1392 // overlapping.
1393 //
1394 // An example of unrolled loop where #Stage is 4..
1395 // Iter i+0 i+1 i+2 i+3
1396 // ------------------------
1397 // Stage 0
1398 // Stage 1 0
1399 // Stage 2 1 0
1400 // Stage 3 2 1 0 <- All stages overlap
1401 //
1402 std::vector<unsigned>
1403 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1404 Instr2StageTy &Stages,
1405 const unsigned StageCount) const {
1406 using RegSetTy = SmallDenseSet<Register, 16>;
1407
1408 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1409 // manage the liveness of the registers independently by iterations.
1410 SmallVector<RegSetTy> LiveRegSets(StageCount);
1411
1412 auto CurSetPressure = InitSetPressure;
1413 auto MaxSetPressure = InitSetPressure;
1414 auto LastUses = computeLastUses(OrderedInsts, Stages);
1415
1416 LLVM_DEBUG({
1417 dbgs() << "Ordered instructions:\n";
1418 for (MachineInstr *MI : OrderedInsts) {
1419 dbgs() << "Stage " << Stages[MI] << ": ";
1420 MI->dump();
1421 }
1422 });
1423
1424 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1425 Register Reg) {
1426 if (!Reg.isValid() || isReservedRegister(Reg))
1427 return;
1428
1429 bool Inserted = RegSet.insert(Reg).second;
1430 if (!Inserted)
1431 return;
1432
1433 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1434 increaseRegisterPressure(CurSetPressure, Reg);
1435 LLVM_DEBUG(dumpPSet(Reg));
1436 };
1437
1438 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1439 Register Reg) {
1440 if (!Reg.isValid() || isReservedRegister(Reg))
1441 return;
1442
1443 // live-in register
1444 if (!RegSet.contains(Reg))
1445 return;
1446
1447 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1448 RegSet.erase(Reg);
1449 decreaseRegisterPressure(CurSetPressure, Reg);
1450 LLVM_DEBUG(dumpPSet(Reg));
1451 };
1452
1453 for (unsigned I = 0; I < StageCount; I++) {
1454 for (MachineInstr *MI : OrderedInsts) {
1455 const auto Stage = Stages[MI];
1456 if (I < Stage)
1457 continue;
1458
1459 const unsigned Iter = I - Stage;
1460
1461 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1462 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1463
1464 for (auto LastUse : LastUses[MI]) {
1465 if (MI->isPHI()) {
1466 if (Iter != 0)
1467 EraseReg(LiveRegSets[Iter - 1], LastUse);
1468 } else {
1469 EraseReg(LiveRegSets[Iter], LastUse);
1470 }
1471 }
1472
1473 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1474 MaxSetPressure[PSet] =
1475 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1476
1477 LLVM_DEBUG({
1478 dbgs() << "CurSetPressure=";
1479 dumpRegisterPressures(CurSetPressure);
1480 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1481 MI->dump();
1482 });
1483 }
1484 }
1485
1486 return MaxSetPressure;
1487 }
1488
1489public:
1490 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1491 const MachineFunction &MF)
1492 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1493 TRI(MF.getSubtarget().getRegisterInfo()),
1494 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1495 PressureSetLimit(PSetNum, 0) {}
1496
1497 // Used to calculate register pressure, which is independent of loop
1498 // scheduling.
1499 void init(const RegisterClassInfo &RCI) {
1500 for (MachineInstr &MI : *OrigMBB) {
1501 if (MI.isDebugInstr())
1502 continue;
1503 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1504 }
1505
1506 computeLiveIn();
1507 computePressureSetLimit(RCI);
1508 }
1509
1510 // Calculate the maximum register pressures of the loop and check if they
1511 // exceed the limit
1512 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1513 const unsigned MaxStage) const {
1515 "the percentage of the margin must be between 0 to 100");
1516
1517 OrderedInstsTy OrderedInsts;
1518 Instr2StageTy Stages;
1519 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1520 const auto MaxSetPressure =
1521 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1522
1523 LLVM_DEBUG({
1524 dbgs() << "Dump MaxSetPressure:\n";
1525 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1526 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1527 }
1528 dbgs() << '\n';
1529 });
1530
1531 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1532 unsigned Limit = PressureSetLimit[PSet];
1533 unsigned Margin = Limit * RegPressureMargin / 100;
1534 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1535 << " Margin=" << Margin << "\n");
1536 if (Limit < MaxSetPressure[PSet] + Margin) {
1537 LLVM_DEBUG(
1538 dbgs()
1539 << "Rejected the schedule because of too high register pressure\n");
1540 return true;
1541 }
1542 }
1543 return false;
1544 }
1545};
1546
1547} // end anonymous namespace
1548
1549/// Calculate the resource constrained minimum initiation interval for the
1550/// specified loop. We use the DFA to model the resources needed for
1551/// each instruction, and we ignore dependences. A different DFA is created
1552/// for each cycle that is required. When adding a new instruction, we attempt
1553/// to add it to each existing DFA, until a legal space is found. If the
1554/// instruction cannot be reserved in an existing DFA, we create a new one.
1555unsigned SwingSchedulerDAG::calculateResMII() {
1556 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1558 return RM.calculateResMII();
1559}
1560
1561/// Calculate the recurrence-constrainted minimum initiation interval.
1562/// Iterate over each circuit. Compute the delay(c) and distance(c)
1563/// for each circuit. The II needs to satisfy the inequality
1564/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1565/// II that satisfies the inequality, and the RecMII is the maximum
1566/// of those values.
1567unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1568 unsigned RecMII = 0;
1569
1570 for (NodeSet &Nodes : NodeSets) {
1571 if (Nodes.empty())
1572 continue;
1573
1574 unsigned Delay = Nodes.getLatency();
1575 unsigned Distance = 1;
1576
1577 // ii = ceil(delay / distance)
1578 unsigned CurMII = (Delay + Distance - 1) / Distance;
1579 Nodes.setRecMII(CurMII);
1580 if (CurMII > RecMII)
1581 RecMII = CurMII;
1582 }
1583
1584 return RecMII;
1585}
1586
1587/// Create the adjacency structure of the nodes in the graph.
1588void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1589 SwingSchedulerDAG *DAG) {
1590 BitVector Added(SUnits.size());
1591 DenseMap<int, int> OutputDeps;
1592 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1593 Added.reset();
1594 // Add any successor to the adjacency matrix and exclude duplicates.
1595 for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
1596 // Only create a back-edge on the first and last nodes of a dependence
1597 // chain. This records any chains and adds them later.
1598 if (OE.isOutputDep()) {
1599 int N = OE.getDst()->NodeNum;
1600 int BackEdge = i;
1601 auto Dep = OutputDeps.find(BackEdge);
1602 if (Dep != OutputDeps.end()) {
1603 BackEdge = Dep->second;
1604 OutputDeps.erase(Dep);
1605 }
1606 OutputDeps[N] = BackEdge;
1607 }
1608 // Do not process a boundary node, an artificial node.
1609 if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
1610 continue;
1611
1612 // This code is retained o preserve previous behavior and prevent
1613 // regression. This condition means that anti-dependnecies within an
1614 // iteration are ignored when searching circuits. Therefore it's natural
1615 // to consider this dependence as well.
1616 // FIXME: Remove this code if it doesn't have significant impact on
1617 // performance.
1618 if (OE.isAntiDep())
1619 continue;
1620
1621 int N = OE.getDst()->NodeNum;
1622 if (!Added.test(N)) {
1623 AdjK[i].push_back(N);
1624 Added.set(N);
1625 }
1626 }
1627 // A chain edge between a store and a load is treated as a back-edge in the
1628 // adjacency matrix.
1629 for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
1630 SUnit *Src = IE.getSrc();
1631 SUnit *Dst = IE.getDst();
1632 if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
1633 continue;
1634 if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1635 int N = Src->NodeNum;
1636 if (!Added.test(N)) {
1637 AdjK[i].push_back(N);
1638 Added.set(N);
1639 }
1640 }
1641 }
1642 }
1643 // Add back-edges in the adjacency matrix for the output dependences.
1644 for (auto &OD : OutputDeps)
1645 if (!Added.test(OD.second)) {
1646 AdjK[OD.first].push_back(OD.second);
1647 Added.set(OD.second);
1648 }
1649}
1650
1651/// Identify an elementary circuit in the dependence graph starting at the
1652/// specified node.
1653bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1654 const SwingSchedulerDAG *DAG,
1655 bool HasBackedge) {
1656 SUnit *SV = &SUnits[V];
1657 bool F = false;
1658 Stack.insert(SV);
1659 Blocked.set(V);
1660
1661 for (auto W : AdjK[V]) {
1662 if (NumPaths > MaxPaths)
1663 break;
1664 if (W < S)
1665 continue;
1666 if (W == S) {
1667 if (!HasBackedge)
1668 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
1669 F = true;
1670 ++NumPaths;
1671 break;
1672 }
1673 if (!Blocked.test(W)) {
1674 if (circuit(W, S, NodeSets, DAG,
1675 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1676 F = true;
1677 }
1678 }
1679
1680 if (F)
1681 unblock(V);
1682 else {
1683 for (auto W : AdjK[V]) {
1684 if (W < S)
1685 continue;
1686 B[W].insert(SV);
1687 }
1688 }
1689 Stack.pop_back();
1690 return F;
1691}
1692
1693/// Unblock a node in the circuit finding algorithm.
1694void SwingSchedulerDAG::Circuits::unblock(int U) {
1695 Blocked.reset(U);
1697 while (!BU.empty()) {
1699 assert(SI != BU.end() && "Invalid B set.");
1700 SUnit *W = *SI;
1701 BU.erase(W);
1702 if (Blocked.test(W->NodeNum))
1703 unblock(W->NodeNum);
1704 }
1705}
1706
1707/// Identify all the elementary circuits in the dependence graph using
1708/// Johnson's circuit algorithm.
1709void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1710 Circuits Cir(SUnits, Topo);
1711 // Create the adjacency structure.
1712 Cir.createAdjacencyStructure(this);
1713 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1714 Cir.reset();
1715 Cir.circuit(I, I, NodeSets, this);
1716 }
1717}
1718
1719// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1720// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1721// additional copies that are needed across iterations. An artificial dependence
1722// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1723
1724// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1725// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1726// PHI-------True-Dep------> USEOfPhi
1727
1728// The mutation creates
1729// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1730
1731// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1732// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1733// late to avoid additional copies across iterations. The possible scheduling
1734// order would be
1735// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1736
1737void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1738 for (SUnit &SU : DAG->SUnits) {
1739 // Find the COPY/REG_SEQUENCE instruction.
1740 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1741 continue;
1742
1743 // Record the loop carried PHIs.
1745 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1747
1748 for (auto &Dep : SU.Preds) {
1749 SUnit *TmpSU = Dep.getSUnit();
1750 MachineInstr *TmpMI = TmpSU->getInstr();
1751 SDep::Kind DepKind = Dep.getKind();
1752 // Save the loop carried PHI.
1753 if (DepKind == SDep::Anti && TmpMI->isPHI())
1754 PHISUs.push_back(TmpSU);
1755 // Save the source of COPY/REG_SEQUENCE.
1756 // If the source has no pre-decessors, we will end up creating cycles.
1757 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1758 SrcSUs.push_back(TmpSU);
1759 }
1760
1761 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1762 continue;
1763
1764 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1765 // SUnit to the container.
1767 // Do not use iterator based loop here as we are updating the container.
1768 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1769 for (auto &Dep : PHISUs[Index]->Succs) {
1770 if (Dep.getKind() != SDep::Data)
1771 continue;
1772
1773 SUnit *TmpSU = Dep.getSUnit();
1774 MachineInstr *TmpMI = TmpSU->getInstr();
1775 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1776 PHISUs.push_back(TmpSU);
1777 continue;
1778 }
1779 UseSUs.push_back(TmpSU);
1780 }
1781 }
1782
1783 if (UseSUs.size() == 0)
1784 continue;
1785
1786 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1787 // Add the artificial dependencies if it does not form a cycle.
1788 for (auto *I : UseSUs) {
1789 for (auto *Src : SrcSUs) {
1790 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1791 Src->addPred(SDep(I, SDep::Artificial));
1792 SDAG->Topo.AddPred(Src, I);
1793 }
1794 }
1795 }
1796 }
1797}
1798
1799/// Compute several functions need to order the nodes for scheduling.
1800/// ASAP - Earliest time to schedule a node.
1801/// ALAP - Latest time to schedule a node.
1802/// MOV - Mobility function, difference between ALAP and ASAP.
1803/// D - Depth of each node.
1804/// H - Height of each node.
1805void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1806 ScheduleInfo.resize(SUnits.size());
1807
1808 LLVM_DEBUG({
1809 for (int I : Topo) {
1810 const SUnit &SU = SUnits[I];
1811 dumpNode(SU);
1812 }
1813 });
1814
1815 int maxASAP = 0;
1816 // Compute ASAP and ZeroLatencyDepth.
1817 for (int I : Topo) {
1818 int asap = 0;
1819 int zeroLatencyDepth = 0;
1820 SUnit *SU = &SUnits[I];
1821 for (const auto &IE : DDG->getInEdges(SU)) {
1822 SUnit *Pred = IE.getSrc();
1823 if (IE.getLatency() == 0)
1824 zeroLatencyDepth =
1825 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
1826 if (IE.ignoreDependence(true))
1827 continue;
1828 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
1829 IE.getDistance() * MII));
1830 }
1831 maxASAP = std::max(maxASAP, asap);
1832 ScheduleInfo[I].ASAP = asap;
1833 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1834 }
1835
1836 // Compute ALAP, ZeroLatencyHeight, and MOV.
1837 for (int I : llvm::reverse(Topo)) {
1838 int alap = maxASAP;
1839 int zeroLatencyHeight = 0;
1840 SUnit *SU = &SUnits[I];
1841 for (const auto &OE : DDG->getOutEdges(SU)) {
1842 SUnit *Succ = OE.getDst();
1843 if (Succ->isBoundaryNode())
1844 continue;
1845 if (OE.getLatency() == 0)
1846 zeroLatencyHeight =
1847 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
1848 if (OE.ignoreDependence(true))
1849 continue;
1850 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
1851 OE.getDistance() * MII));
1852 }
1853
1854 ScheduleInfo[I].ALAP = alap;
1855 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1856 }
1857
1858 // After computing the node functions, compute the summary for each node set.
1859 for (NodeSet &I : NodeSets)
1860 I.computeNodeSetInfo(this);
1861
1862 LLVM_DEBUG({
1863 for (unsigned i = 0; i < SUnits.size(); i++) {
1864 dbgs() << "\tNode " << i << ":\n";
1865 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1866 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1867 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1868 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1869 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1870 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1871 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1872 }
1873 });
1874}
1875
1876/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1877/// as the predecessors of the elements of NodeOrder that are not also in
1878/// NodeOrder.
1881 const NodeSet *S = nullptr) {
1882 Preds.clear();
1883
1884 for (SUnit *SU : NodeOrder) {
1885 for (const auto &IE : DDG->getInEdges(SU)) {
1886 SUnit *PredSU = IE.getSrc();
1887 if (S && S->count(PredSU) == 0)
1888 continue;
1889 if (IE.ignoreDependence(true))
1890 continue;
1891 if (NodeOrder.count(PredSU) == 0)
1892 Preds.insert(PredSU);
1893 }
1894
1895 // FIXME: The following loop-carried dependencies may also need to be
1896 // considered.
1897 // - Physical register dependencies (true-dependence and WAW).
1898 // - Memory dependencies.
1899 for (const auto &OE : DDG->getOutEdges(SU)) {
1900 SUnit *SuccSU = OE.getDst();
1901 if (!OE.isAntiDep())
1902 continue;
1903 if (S && S->count(SuccSU) == 0)
1904 continue;
1905 if (NodeOrder.count(SuccSU) == 0)
1906 Preds.insert(SuccSU);
1907 }
1908 }
1909 return !Preds.empty();
1910}
1911
1912/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1913/// as the successors of the elements of NodeOrder that are not also in
1914/// NodeOrder.
1917 const NodeSet *S = nullptr) {
1918 Succs.clear();
1919
1920 for (SUnit *SU : NodeOrder) {
1921 for (const auto &OE : DDG->getOutEdges(SU)) {
1922 SUnit *SuccSU = OE.getDst();
1923 if (S && S->count(SuccSU) == 0)
1924 continue;
1925 if (OE.ignoreDependence(false))
1926 continue;
1927 if (NodeOrder.count(SuccSU) == 0)
1928 Succs.insert(SuccSU);
1929 }
1930
1931 // FIXME: The following loop-carried dependencies may also need to be
1932 // considered.
1933 // - Physical register dependnecies (true-dependnece and WAW).
1934 // - Memory dependencies.
1935 for (const auto &IE : DDG->getInEdges(SU)) {
1936 SUnit *PredSU = IE.getSrc();
1937 if (!IE.isAntiDep())
1938 continue;
1939 if (S && S->count(PredSU) == 0)
1940 continue;
1941 if (NodeOrder.count(PredSU) == 0)
1942 Succs.insert(PredSU);
1943 }
1944 }
1945 return !Succs.empty();
1946}
1947
1948/// Return true if there is a path from the specified node to any of the nodes
1949/// in DestNodes. Keep track and return the nodes in any path.
1950static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1951 SetVector<SUnit *> &DestNodes,
1952 SetVector<SUnit *> &Exclude,
1953 SmallPtrSet<SUnit *, 8> &Visited,
1954 SwingSchedulerDDG *DDG) {
1955 if (Cur->isBoundaryNode())
1956 return false;
1957 if (Exclude.contains(Cur))
1958 return false;
1959 if (DestNodes.contains(Cur))
1960 return true;
1961 if (!Visited.insert(Cur).second)
1962 return Path.contains(Cur);
1963 bool FoundPath = false;
1964 for (const auto &OE : DDG->getOutEdges(Cur))
1965 if (!OE.ignoreDependence(false))
1966 FoundPath |=
1967 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
1968 for (const auto &IE : DDG->getInEdges(Cur))
1969 if (IE.isAntiDep() && IE.getDistance() == 0)
1970 FoundPath |=
1971 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
1972 if (FoundPath)
1973 Path.insert(Cur);
1974 return FoundPath;
1975}
1976
1977/// Compute the live-out registers for the instructions in a node-set.
1978/// The live-out registers are those that are defined in the node-set,
1979/// but not used. Except for use operands of Phis.
1981 NodeSet &NS) {
1986 for (SUnit *SU : NS) {
1987 const MachineInstr *MI = SU->getInstr();
1988 if (MI->isPHI())
1989 continue;
1990 for (const MachineOperand &MO : MI->all_uses()) {
1991 Register Reg = MO.getReg();
1992 if (Reg.isVirtual())
1993 Uses.insert(Reg);
1994 else if (MRI.isAllocatable(Reg))
1995 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1996 Uses.insert(Unit);
1997 }
1998 }
1999 for (SUnit *SU : NS)
2000 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2001 if (!MO.isDead()) {
2002 Register Reg = MO.getReg();
2003 if (Reg.isVirtual()) {
2004 if (!Uses.count(Reg))
2005 LiveOutRegs.emplace_back(Reg, LaneBitmask::getNone());
2006 } else if (MRI.isAllocatable(Reg)) {
2007 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2008 if (!Uses.count(Unit))
2009 LiveOutRegs.emplace_back(Unit, LaneBitmask::getNone());
2010 }
2011 }
2012 RPTracker.addLiveRegs(LiveOutRegs);
2013}
2014
2015/// A heuristic to filter nodes in recurrent node-sets if the register
2016/// pressure of a set is too high.
2017void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2018 for (auto &NS : NodeSets) {
2019 // Skip small node-sets since they won't cause register pressure problems.
2020 if (NS.size() <= 2)
2021 continue;
2022 IntervalPressure RecRegPressure;
2023 RegPressureTracker RecRPTracker(RecRegPressure);
2024 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2025 computeLiveOuts(MF, RecRPTracker, NS);
2026 RecRPTracker.closeBottom();
2027
2028 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2029 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2030 return A->NodeNum > B->NodeNum;
2031 });
2032
2033 for (auto &SU : SUnits) {
2034 // Since we're computing the register pressure for a subset of the
2035 // instructions in a block, we need to set the tracker for each
2036 // instruction in the node-set. The tracker is set to the instruction
2037 // just after the one we're interested in.
2039 RecRPTracker.setPos(std::next(CurInstI));
2040
2041 RegPressureDelta RPDelta;
2042 ArrayRef<PressureChange> CriticalPSets;
2043 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2044 CriticalPSets,
2045 RecRegPressure.MaxSetPressure);
2046 if (RPDelta.Excess.isValid()) {
2047 LLVM_DEBUG(
2048 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2050 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2051 NS.setExceedPressure(SU);
2052 break;
2053 }
2054 RecRPTracker.recede();
2055 }
2056 }
2057}
2058
2059/// A heuristic to colocate node sets that have the same set of
2060/// successors.
2061void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2062 unsigned Colocate = 0;
2063 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2064 NodeSet &N1 = NodeSets[i];
2066 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2067 continue;
2068 for (int j = i + 1; j < e; ++j) {
2069 NodeSet &N2 = NodeSets[j];
2070 if (N1.compareRecMII(N2) != 0)
2071 continue;
2073 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2074 continue;
2075 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2076 N1.setColocate(++Colocate);
2077 N2.setColocate(Colocate);
2078 break;
2079 }
2080 }
2081 }
2082}
2083
2084/// Check if the existing node-sets are profitable. If not, then ignore the
2085/// recurrent node-sets, and attempt to schedule all nodes together. This is
2086/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2087/// then it's best to try to schedule all instructions together instead of
2088/// starting with the recurrent node-sets.
2089void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2090 // Look for loops with a large MII.
2091 if (MII < 17)
2092 return;
2093 // Check if the node-set contains only a simple add recurrence.
2094 for (auto &NS : NodeSets) {
2095 if (NS.getRecMII() > 2)
2096 return;
2097 if (NS.getMaxDepth() > MII)
2098 return;
2099 }
2100 NodeSets.clear();
2101 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2102}
2103
2104/// Add the nodes that do not belong to a recurrence set into groups
2105/// based upon connected components.
2106void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2107 SetVector<SUnit *> NodesAdded;
2109 // Add the nodes that are on a path between the previous node sets and
2110 // the current node set.
2111 for (NodeSet &I : NodeSets) {
2113 // Add the nodes from the current node set to the previous node set.
2114 if (succ_L(I, N, DDG.get())) {
2116 for (SUnit *NI : N) {
2117 Visited.clear();
2118 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2119 }
2120 if (!Path.empty())
2121 I.insert(Path.begin(), Path.end());
2122 }
2123 // Add the nodes from the previous node set to the current node set.
2124 N.clear();
2125 if (succ_L(NodesAdded, N, DDG.get())) {
2127 for (SUnit *NI : N) {
2128 Visited.clear();
2129 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2130 }
2131 if (!Path.empty())
2132 I.insert(Path.begin(), Path.end());
2133 }
2134 NodesAdded.insert(I.begin(), I.end());
2135 }
2136
2137 // Create a new node set with the connected nodes of any successor of a node
2138 // in a recurrent set.
2139 NodeSet NewSet;
2141 if (succ_L(NodesAdded, N, DDG.get()))
2142 for (SUnit *I : N)
2143 addConnectedNodes(I, NewSet, NodesAdded);
2144 if (!NewSet.empty())
2145 NodeSets.push_back(NewSet);
2146
2147 // Create a new node set with the connected nodes of any predecessor of a node
2148 // in a recurrent set.
2149 NewSet.clear();
2150 if (pred_L(NodesAdded, N, DDG.get()))
2151 for (SUnit *I : N)
2152 addConnectedNodes(I, NewSet, NodesAdded);
2153 if (!NewSet.empty())
2154 NodeSets.push_back(NewSet);
2155
2156 // Create new nodes sets with the connected nodes any remaining node that
2157 // has no predecessor.
2158 for (SUnit &SU : SUnits) {
2159 if (NodesAdded.count(&SU) == 0) {
2160 NewSet.clear();
2161 addConnectedNodes(&SU, NewSet, NodesAdded);
2162 if (!NewSet.empty())
2163 NodeSets.push_back(NewSet);
2164 }
2165 }
2166}
2167
2168/// Add the node to the set, and add all of its connected nodes to the set.
2169void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2170 SetVector<SUnit *> &NodesAdded) {
2171 NewSet.insert(SU);
2172 NodesAdded.insert(SU);
2173 for (auto &OE : DDG->getOutEdges(SU)) {
2174 SUnit *Successor = OE.getDst();
2175 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2176 NodesAdded.count(Successor) == 0)
2177 addConnectedNodes(Successor, NewSet, NodesAdded);
2178 }
2179 for (auto &IE : DDG->getInEdges(SU)) {
2180 SUnit *Predecessor = IE.getSrc();
2181 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2182 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2183 }
2184}
2185
2186/// Return true if Set1 contains elements in Set2. The elements in common
2187/// are returned in a different container.
2188static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2190 Result.clear();
2191 for (SUnit *SU : Set1) {
2192 if (Set2.count(SU) != 0)
2193 Result.insert(SU);
2194 }
2195 return !Result.empty();
2196}
2197
2198/// Merge the recurrence node sets that have the same initial node.
2199void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2200 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2201 ++I) {
2202 NodeSet &NI = *I;
2203 for (NodeSetType::iterator J = I + 1; J != E;) {
2204 NodeSet &NJ = *J;
2205 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2206 if (NJ.compareRecMII(NI) > 0)
2207 NI.setRecMII(NJ.getRecMII());
2208 for (SUnit *SU : *J)
2209 I->insert(SU);
2210 NodeSets.erase(J);
2211 E = NodeSets.end();
2212 } else {
2213 ++J;
2214 }
2215 }
2216 }
2217}
2218
2219/// Remove nodes that have been scheduled in previous NodeSets.
2220void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2221 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2222 ++I)
2223 for (NodeSetType::iterator J = I + 1; J != E;) {
2224 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2225
2226 if (J->empty()) {
2227 NodeSets.erase(J);
2228 E = NodeSets.end();
2229 } else {
2230 ++J;
2231 }
2232 }
2233}
2234
2235/// Compute an ordered list of the dependence graph nodes, which
2236/// indicates the order that the nodes will be scheduled. This is a
2237/// two-level algorithm. First, a partial order is created, which
2238/// consists of a list of sets ordered from highest to lowest priority.
2239void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2241 NodeOrder.clear();
2242
2243 for (auto &Nodes : NodeSets) {
2244 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2245 OrderKind Order;
2247 if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
2248 R.insert(N.begin(), N.end());
2249 Order = BottomUp;
2250 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2251 } else if (succ_L(NodeOrder, N, DDG.get()) &&
2252 llvm::set_is_subset(N, Nodes)) {
2253 R.insert(N.begin(), N.end());
2254 Order = TopDown;
2255 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2256 } else if (isIntersect(N, Nodes, R)) {
2257 // If some of the successors are in the existing node-set, then use the
2258 // top-down ordering.
2259 Order = TopDown;
2260 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2261 } else if (NodeSets.size() == 1) {
2262 for (const auto &N : Nodes)
2263 if (N->Succs.size() == 0)
2264 R.insert(N);
2265 Order = BottomUp;
2266 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2267 } else {
2268 // Find the node with the highest ASAP.
2269 SUnit *maxASAP = nullptr;
2270 for (SUnit *SU : Nodes) {
2271 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2272 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2273 maxASAP = SU;
2274 }
2275 R.insert(maxASAP);
2276 Order = BottomUp;
2277 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2278 }
2279
2280 while (!R.empty()) {
2281 if (Order == TopDown) {
2282 // Choose the node with the maximum height. If more than one, choose
2283 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2284 // choose the node with the lowest MOV.
2285 while (!R.empty()) {
2286 SUnit *maxHeight = nullptr;
2287 for (SUnit *I : R) {
2288 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2289 maxHeight = I;
2290 else if (getHeight(I) == getHeight(maxHeight) &&
2292 maxHeight = I;
2293 else if (getHeight(I) == getHeight(maxHeight) &&
2295 getZeroLatencyHeight(maxHeight) &&
2296 getMOV(I) < getMOV(maxHeight))
2297 maxHeight = I;
2298 }
2299 NodeOrder.insert(maxHeight);
2300 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2301 R.remove(maxHeight);
2302 for (const auto &OE : DDG->getOutEdges(maxHeight)) {
2303 SUnit *SU = OE.getDst();
2304 if (Nodes.count(SU) == 0)
2305 continue;
2306 if (NodeOrder.contains(SU))
2307 continue;
2308 if (OE.ignoreDependence(false))
2309 continue;
2310 R.insert(SU);
2311 }
2312
2313 // FIXME: The following loop-carried dependencies may also need to be
2314 // considered.
2315 // - Physical register dependnecies (true-dependnece and WAW).
2316 // - Memory dependencies.
2317 for (const auto &IE : DDG->getInEdges(maxHeight)) {
2318 SUnit *SU = IE.getSrc();
2319 if (!IE.isAntiDep())
2320 continue;
2321 if (Nodes.count(SU) == 0)
2322 continue;
2323 if (NodeOrder.contains(SU))
2324 continue;
2325 R.insert(SU);
2326 }
2327 }
2328 Order = BottomUp;
2329 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2331 if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
2332 R.insert(N.begin(), N.end());
2333 } else {
2334 // Choose the node with the maximum depth. If more than one, choose
2335 // the node with the maximum ZeroLatencyDepth. If still more than one,
2336 // choose the node with the lowest MOV.
2337 while (!R.empty()) {
2338 SUnit *maxDepth = nullptr;
2339 for (SUnit *I : R) {
2340 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2341 maxDepth = I;
2342 else if (getDepth(I) == getDepth(maxDepth) &&
2344 maxDepth = I;
2345 else if (getDepth(I) == getDepth(maxDepth) &&
2347 getMOV(I) < getMOV(maxDepth))
2348 maxDepth = I;
2349 }
2350 NodeOrder.insert(maxDepth);
2351 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2352 R.remove(maxDepth);
2353 if (Nodes.isExceedSU(maxDepth)) {
2354 Order = TopDown;
2355 R.clear();
2356 R.insert(Nodes.getNode(0));
2357 break;
2358 }
2359 for (const auto &IE : DDG->getInEdges(maxDepth)) {
2360 SUnit *SU = IE.getSrc();
2361 if (Nodes.count(SU) == 0)
2362 continue;
2363 if (NodeOrder.contains(SU))
2364 continue;
2365 R.insert(SU);
2366 }
2367
2368 // FIXME: The following loop-carried dependencies may also need to be
2369 // considered.
2370 // - Physical register dependnecies (true-dependnece and WAW).
2371 // - Memory dependencies.
2372 for (const auto &OE : DDG->getOutEdges(maxDepth)) {
2373 SUnit *SU = OE.getDst();
2374 if (!OE.isAntiDep())
2375 continue;
2376 if (Nodes.count(SU) == 0)
2377 continue;
2378 if (NodeOrder.contains(SU))
2379 continue;
2380 R.insert(SU);
2381 }
2382 }
2383 Order = TopDown;
2384 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2386 if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
2387 R.insert(N.begin(), N.end());
2388 }
2389 }
2390 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2391 }
2392
2393 LLVM_DEBUG({
2394 dbgs() << "Node order: ";
2395 for (SUnit *I : NodeOrder)
2396 dbgs() << " " << I->NodeNum << " ";
2397 dbgs() << "\n";
2398 });
2399}
2400
2401/// Process the nodes in the computed order and create the pipelined schedule
2402/// of the instructions, if possible. Return true if a schedule is found.
2403bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2404
2405 if (NodeOrder.empty()){
2406 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2407 return false;
2408 }
2409
2410 bool scheduleFound = false;
2411 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2412 if (LimitRegPressure) {
2413 HRPDetector =
2414 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2415 HRPDetector->init(RegClassInfo);
2416 }
2417 // Keep increasing II until a valid schedule is found.
2418 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2419 Schedule.reset();
2420 Schedule.setInitiationInterval(II);
2421 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2422
2423 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2424 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2425 do {
2426 SUnit *SU = *NI;
2427
2428 // Compute the schedule time for the instruction, which is based
2429 // upon the scheduled time for any predecessors/successors.
2430 int EarlyStart = INT_MIN;
2431 int LateStart = INT_MAX;
2432 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2433 LLVM_DEBUG({
2434 dbgs() << "\n";
2435 dbgs() << "Inst (" << SU->NodeNum << ") ";
2436 SU->getInstr()->dump();
2437 dbgs() << "\n";
2438 });
2439 LLVM_DEBUG(
2440 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2441
2442 if (EarlyStart > LateStart)
2443 scheduleFound = false;
2444 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2445 scheduleFound =
2446 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2447 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2448 scheduleFound =
2449 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2450 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2451 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2452 // When scheduling a Phi it is better to start at the late cycle and
2453 // go backwards. The default order may insert the Phi too far away
2454 // from its first dependence.
2455 // Also, do backward search when all scheduled predecessors are
2456 // loop-carried output/order dependencies. Empirically, there are also
2457 // cases where scheduling becomes possible with backward search.
2458 if (SU->getInstr()->isPHI() ||
2459 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2460 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2461 else
2462 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2463 } else {
2464 int FirstCycle = Schedule.getFirstCycle();
2465 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2466 FirstCycle + getASAP(SU) + II - 1, II);
2467 }
2468
2469 // Even if we find a schedule, make sure the schedule doesn't exceed the
2470 // allowable number of stages. We keep trying if this happens.
2471 if (scheduleFound)
2472 if (SwpMaxStages > -1 &&
2473 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2474 scheduleFound = false;
2475
2476 LLVM_DEBUG({
2477 if (!scheduleFound)
2478 dbgs() << "\tCan't schedule\n";
2479 });
2480 } while (++NI != NE && scheduleFound);
2481
2482 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2483 if (scheduleFound)
2484 scheduleFound =
2485 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2486
2487 // If a schedule is found, check if it is a valid schedule too.
2488 if (scheduleFound)
2489 scheduleFound = Schedule.isValidSchedule(this);
2490
2491 // If a schedule was found and the option is enabled, check if the schedule
2492 // might generate additional register spills/fills.
2493 if (scheduleFound && LimitRegPressure)
2494 scheduleFound =
2495 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2496 }
2497
2498 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2499 << " (II=" << Schedule.getInitiationInterval()
2500 << ")\n");
2501
2502 if (scheduleFound) {
2503 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2504 if (!scheduleFound)
2505 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2506 }
2507
2508 if (scheduleFound) {
2509 Schedule.finalizeSchedule(this);
2510 Pass.ORE->emit([&]() {
2512 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2513 << "Schedule found with Initiation Interval: "
2514 << ore::NV("II", Schedule.getInitiationInterval())
2515 << ", MaxStageCount: "
2516 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2517 });
2518 } else
2519 Schedule.reset();
2520
2521 return scheduleFound && Schedule.getMaxStageCount() > 0;
2522}
2523
2524/// Return true if we can compute the amount the instruction changes
2525/// during each iteration. Set Delta to the amount of the change.
2526bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
2528 const MachineOperand *BaseOp;
2529 int64_t Offset;
2530 bool OffsetIsScalable;
2531 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2532 return false;
2533
2534 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2535 if (OffsetIsScalable)
2536 return false;
2537
2538 if (!BaseOp->isReg())
2539 return false;
2540
2541 Register BaseReg = BaseOp->getReg();
2542
2544 // Check if there is a Phi. If so, get the definition in the loop.
2545 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2546 if (BaseDef && BaseDef->isPHI()) {
2547 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2548 BaseDef = MRI.getVRegDef(BaseReg);
2549 }
2550 if (!BaseDef)
2551 return false;
2552
2553 int D = 0;
2554 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2555 return false;
2556
2557 Delta = D;
2558 return true;
2559}
2560
2561/// Check if we can change the instruction to use an offset value from the
2562/// previous iteration. If so, return true and set the base and offset values
2563/// so that we can rewrite the load, if necessary.
2564/// v1 = Phi(v0, v3)
2565/// v2 = load v1, 0
2566/// v3 = post_store v1, 4, x
2567/// This function enables the load to be rewritten as v2 = load v3, 4.
2568bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2569 unsigned &BasePos,
2570 unsigned &OffsetPos,
2571 unsigned &NewBase,
2572 int64_t &Offset) {
2573 // Get the load instruction.
2574 if (TII->isPostIncrement(*MI))
2575 return false;
2576 unsigned BasePosLd, OffsetPosLd;
2577 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2578 return false;
2579 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2580
2581 // Look for the Phi instruction.
2582 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2583 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2584 if (!Phi || !Phi->isPHI())
2585 return false;
2586 // Get the register defined in the loop block.
2587 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2588 if (!PrevReg)
2589 return false;
2590
2591 // Check for the post-increment load/store instruction.
2592 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2593 if (!PrevDef || PrevDef == MI)
2594 return false;
2595
2596 if (!TII->isPostIncrement(*PrevDef))
2597 return false;
2598
2599 unsigned BasePos1 = 0, OffsetPos1 = 0;
2600 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2601 return false;
2602
2603 // Make sure that the instructions do not access the same memory location in
2604 // the next iteration.
2605 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2606 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2608 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2609 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2610 MF.deleteMachineInstr(NewMI);
2611 if (!Disjoint)
2612 return false;
2613
2614 // Set the return value once we determine that we return true.
2615 BasePos = BasePosLd;
2616 OffsetPos = OffsetPosLd;
2617 NewBase = PrevReg;
2618 Offset = StoreOffset;
2619 return true;
2620}
2621
2622/// Apply changes to the instruction if needed. The changes are need
2623/// to improve the scheduling and depend up on the final schedule.
2625 SMSchedule &Schedule) {
2626 SUnit *SU = getSUnit(MI);
2628 InstrChanges.find(SU);
2629 if (It != InstrChanges.end()) {
2630 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2631 unsigned BasePos, OffsetPos;
2632 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2633 return;
2634 Register BaseReg = MI->getOperand(BasePos).getReg();
2635 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2636 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2637 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2638 int BaseStageNum = Schedule.stageScheduled(SU);
2639 int BaseCycleNum = Schedule.cycleScheduled(SU);
2640 if (BaseStageNum < DefStageNum) {
2642 int OffsetDiff = DefStageNum - BaseStageNum;
2643 if (DefCycleNum < BaseCycleNum) {
2644 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2645 if (OffsetDiff > 0)
2646 --OffsetDiff;
2647 }
2648 int64_t NewOffset =
2649 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2650 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2651 SU->setInstr(NewMI);
2652 MISUnitMap[NewMI] = SU;
2653 NewMIs[MI] = NewMI;
2654 }
2655 }
2656}
2657
2658/// Return the instruction in the loop that defines the register.
2659/// If the definition is a Phi, then follow the Phi operand to
2660/// the instruction in the loop.
2661MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2663 MachineInstr *Def = MRI.getVRegDef(Reg);
2664 while (Def->isPHI()) {
2665 if (!Visited.insert(Def).second)
2666 break;
2667 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2668 if (Def->getOperand(i + 1).getMBB() == BB) {
2669 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2670 break;
2671 }
2672 }
2673 return Def;
2674}
2675
2676/// Return true for an order or output dependence that is loop carried
2677/// potentially. A dependence is loop carried if the destination defines a value
2678/// that may be used or defined by the source in a subsequent iteration.
2680 const SwingSchedulerDDGEdge &Edge) const {
2681 if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
2682 Edge.getDst()->isBoundaryNode())
2683 return false;
2684
2686 return true;
2687
2688 if (Edge.isOutputDep())
2689 return true;
2690
2691 MachineInstr *SI = Edge.getSrc()->getInstr();
2692 MachineInstr *DI = Edge.getDst()->getInstr();
2693 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2694
2695 // Assume ordered loads and stores may have a loop carried dependence.
2696 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2697 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2698 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2699 return true;
2700
2701 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2702 return false;
2703
2704 // The conservative assumption is that a dependence between memory operations
2705 // may be loop carried. The following code checks when it can be proved that
2706 // there is no loop carried dependence.
2707 unsigned DeltaS, DeltaD;
2708 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2709 return true;
2710
2711 const MachineOperand *BaseOpS, *BaseOpD;
2712 int64_t OffsetS, OffsetD;
2713 bool OffsetSIsScalable, OffsetDIsScalable;
2715 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2716 TRI) ||
2717 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2718 TRI))
2719 return true;
2720
2721 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2722 "Expected offsets to be byte offsets");
2723
2724 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2725 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2726 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2727 return true;
2728
2729 unsigned InitValS = 0;
2730 unsigned LoopValS = 0;
2731 unsigned InitValD = 0;
2732 unsigned LoopValD = 0;
2733 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2734 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2735 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2736 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2737
2738 if (!InitDefS->isIdenticalTo(*InitDefD))
2739 return true;
2740
2741 // Check that the base register is incremented by a constant value for each
2742 // iteration.
2743 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2744 int D = 0;
2745 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2746 return true;
2747
2748 LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2749 LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2750
2751 // This is the main test, which checks the offset values and the loop
2752 // increment value to determine if the accesses may be loop carried.
2753 if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2754 return true;
2755
2756 if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2757 DeltaD < AccessSizeD.getValue())
2758 return true;
2759
2760 return (OffsetS + (int64_t)AccessSizeS.getValue() <
2761 OffsetD + (int64_t)AccessSizeD.getValue());
2762}
2763
2764void SwingSchedulerDAG::postProcessDAG() {
2765 for (auto &M : Mutations)
2766 M->apply(this);
2767}
2768
2769/// Try to schedule the node at the specified StartCycle and continue
2770/// until the node is schedule or the EndCycle is reached. This function
2771/// returns true if the node is scheduled. This routine may search either
2772/// forward or backward for a place to insert the instruction based upon
2773/// the relative values of StartCycle and EndCycle.
2774bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2775 bool forward = true;
2776 LLVM_DEBUG({
2777 dbgs() << "Trying to insert node between " << StartCycle << " and "
2778 << EndCycle << " II: " << II << "\n";
2779 });
2780 if (StartCycle > EndCycle)
2781 forward = false;
2782
2783 // The terminating condition depends on the direction.
2784 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2785 for (int curCycle = StartCycle; curCycle != termCycle;
2786 forward ? ++curCycle : --curCycle) {
2787
2788 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2789 ProcItinResources.canReserveResources(*SU, curCycle)) {
2790 LLVM_DEBUG({
2791 dbgs() << "\tinsert at cycle " << curCycle << " ";
2792 SU->getInstr()->dump();
2793 });
2794
2795 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2796 ProcItinResources.reserveResources(*SU, curCycle);
2797 ScheduledInstrs[curCycle].push_back(SU);
2798 InstrToCycle.insert(std::make_pair(SU, curCycle));
2799 if (curCycle > LastCycle)
2800 LastCycle = curCycle;
2801 if (curCycle < FirstCycle)
2802 FirstCycle = curCycle;
2803 return true;
2804 }
2805 LLVM_DEBUG({
2806 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2807 SU->getInstr()->dump();
2808 });
2809 }
2810 return false;
2811}
2812
2813// Return the cycle of the earliest scheduled instruction in the chain.
2815 const SwingSchedulerDDG *DDG) {
2818 Worklist.push_back(Dep);
2819 int EarlyCycle = INT_MAX;
2820 while (!Worklist.empty()) {
2821 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
2822 SUnit *PrevSU = Cur.getSrc();
2823 if (Visited.count(PrevSU))
2824 continue;
2825 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2826 if (it == InstrToCycle.end())
2827 continue;
2828 EarlyCycle = std::min(EarlyCycle, it->second);
2829 for (const auto &IE : DDG->getInEdges(PrevSU))
2830 if (IE.isOrderDep() || IE.isOutputDep())
2831 Worklist.push_back(IE);
2832 Visited.insert(PrevSU);
2833 }
2834 return EarlyCycle;
2835}
2836
2837// Return the cycle of the latest scheduled instruction in the chain.
2839 const SwingSchedulerDDG *DDG) {
2842 Worklist.push_back(Dep);
2843 int LateCycle = INT_MIN;
2844 while (!Worklist.empty()) {
2845 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
2846 SUnit *SuccSU = Cur.getDst();
2847 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2848 continue;
2849 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2850 if (it == InstrToCycle.end())
2851 continue;
2852 LateCycle = std::max(LateCycle, it->second);
2853 for (const auto &OE : DDG->getOutEdges(SuccSU))
2854 if (OE.isOrderDep() || OE.isOutputDep())
2855 Worklist.push_back(OE);
2856 Visited.insert(SuccSU);
2857 }
2858 return LateCycle;
2859}
2860
2861/// If an instruction has a use that spans multiple iterations, then
2862/// return true. These instructions are characterized by having a back-ege
2863/// to a Phi, which contains a reference to another Phi.
2865 for (auto &P : SU->Preds)
2866 if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
2867 for (auto &S : P.getSUnit()->Succs)
2868 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2869 return P.getSUnit();
2870 return nullptr;
2871}
2872
2873/// Compute the scheduling start slot for the instruction. The start slot
2874/// depends on any predecessor or successor nodes scheduled already.
2875void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2876 int II, SwingSchedulerDAG *DAG) {
2877 const SwingSchedulerDDG *DDG = DAG->getDDG();
2878
2879 // Iterate over each instruction that has been scheduled already. The start
2880 // slot computation depends on whether the previously scheduled instruction
2881 // is a predecessor or successor of the specified instruction.
2882 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2883 for (SUnit *I : getInstructions(cycle)) {
2884 for (const auto &IE : DDG->getInEdges(SU)) {
2885 if (IE.getSrc() == I) {
2886 // FIXME: Add reverse edge to `DDG` instead of calling
2887 // `isLoopCarriedDep`
2888 if (DAG->isLoopCarriedDep(IE)) {
2889 int End = earliestCycleInChain(IE, DDG) + (II - 1);
2890 *MinLateStart = std::min(*MinLateStart, End);
2891 }
2892 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
2893 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2894 }
2895 }
2896
2897 for (const auto &OE : DDG->getOutEdges(SU)) {
2898 if (OE.getDst() == I) {
2899 // FIXME: Add reverse edge to `DDG` instead of calling
2900 // `isLoopCarriedDep`
2901 if (DAG->isLoopCarriedDep(OE)) {
2902 int Start = latestCycleInChain(OE, DDG) + 1 - II;
2903 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
2904 }
2905 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
2906 *MinLateStart = std::min(*MinLateStart, LateStart);
2907 }
2908 }
2909
2910 SUnit *BE = multipleIterations(I, DAG);
2911 for (const auto &Dep : SU->Preds) {
2912 // For instruction that requires multiple iterations, make sure that
2913 // the dependent instruction is not scheduled past the definition.
2914 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2915 !SU->isPred(I))
2916 *MinLateStart = std::min(*MinLateStart, cycle);
2917 }
2918 }
2919 }
2920}
2921
2922/// Order the instructions within a cycle so that the definitions occur
2923/// before the uses. Returns true if the instruction is added to the start
2924/// of the list, or false if added to the end.
2926 std::deque<SUnit *> &Insts) const {
2927 MachineInstr *MI = SU->getInstr();
2928 bool OrderBeforeUse = false;
2929 bool OrderAfterDef = false;
2930 bool OrderBeforeDef = false;
2931 unsigned MoveDef = 0;
2932 unsigned MoveUse = 0;
2933 int StageInst1 = stageScheduled(SU);
2934 const SwingSchedulerDDG *DDG = SSD->getDDG();
2935
2936 unsigned Pos = 0;
2937 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2938 ++I, ++Pos) {
2939 for (MachineOperand &MO : MI->operands()) {
2940 if (!MO.isReg() || !MO.getReg().isVirtual())
2941 continue;
2942
2943 Register Reg = MO.getReg();
2944 unsigned BasePos, OffsetPos;
2945 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2946 if (MI->getOperand(BasePos).getReg() == Reg)
2947 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2948 Reg = NewReg;
2949 bool Reads, Writes;
2950 std::tie(Reads, Writes) =
2951 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2952 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2953 OrderBeforeUse = true;
2954 if (MoveUse == 0)
2955 MoveUse = Pos;
2956 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2957 // Add the instruction after the scheduled instruction.
2958 OrderAfterDef = true;
2959 MoveDef = Pos;
2960 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2961 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2962 OrderBeforeUse = true;
2963 if (MoveUse == 0)
2964 MoveUse = Pos;
2965 } else {
2966 OrderAfterDef = true;
2967 MoveDef = Pos;
2968 }
2969 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2970 OrderBeforeUse = true;
2971 if (MoveUse == 0)
2972 MoveUse = Pos;
2973 if (MoveUse != 0) {
2974 OrderAfterDef = true;
2975 MoveDef = Pos - 1;
2976 }
2977 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2978 // Add the instruction before the scheduled instruction.
2979 OrderBeforeUse = true;
2980 if (MoveUse == 0)
2981 MoveUse = Pos;
2982 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2983 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2984 if (MoveUse == 0) {
2985 OrderBeforeDef = true;
2986 MoveUse = Pos;
2987 }
2988 }
2989 }
2990 // Check for order dependences between instructions. Make sure the source
2991 // is ordered before the destination.
2992 for (auto &OE : DDG->getOutEdges(SU)) {
2993 if (OE.getDst() != *I)
2994 continue;
2995 if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
2996 OrderBeforeUse = true;
2997 if (Pos < MoveUse)
2998 MoveUse = Pos;
2999 }
3000 // We did not handle HW dependences in previous for loop,
3001 // and we normally set Latency = 0 for Anti/Output deps,
3002 // so may have nodes in same cycle with Anti/Output dependent on HW regs.
3003 else if ((OE.isAntiDep() || OE.isOutputDep()) &&
3004 stageScheduled(*I) == StageInst1) {
3005 OrderBeforeUse = true;
3006 if ((MoveUse == 0) || (Pos < MoveUse))
3007 MoveUse = Pos;
3008 }
3009 }
3010 for (auto &IE : DDG->getInEdges(SU)) {
3011 if (IE.getSrc() != *I)
3012 continue;
3013 if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
3014 stageScheduled(*I) == StageInst1) {
3015 OrderAfterDef = true;
3016 MoveDef = Pos;
3017 }
3018 }
3019 }
3020
3021 // A circular dependence.
3022 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3023 OrderBeforeUse = false;
3024
3025 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3026 // to a loop-carried dependence.
3027 if (OrderBeforeDef)
3028 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3029
3030 // The uncommon case when the instruction order needs to be updated because
3031 // there is both a use and def.
3032 if (OrderBeforeUse && OrderAfterDef) {
3033 SUnit *UseSU = Insts.at(MoveUse);
3034 SUnit *DefSU = Insts.at(MoveDef);
3035 if (MoveUse > MoveDef) {
3036 Insts.erase(Insts.begin() + MoveUse);
3037 Insts.erase(Insts.begin() + MoveDef);
3038 } else {
3039 Insts.erase(Insts.begin() + MoveDef);
3040 Insts.erase(Insts.begin() + MoveUse);
3041 }
3042 orderDependence(SSD, UseSU, Insts);
3043 orderDependence(SSD, SU, Insts);
3044 orderDependence(SSD, DefSU, Insts);
3045 return;
3046 }
3047 // Put the new instruction first if there is a use in the list. Otherwise,
3048 // put it at the end of the list.
3049 if (OrderBeforeUse)
3050 Insts.push_front(SU);
3051 else
3052 Insts.push_back(SU);
3053}
3054
3055/// Return true if the scheduled Phi has a loop carried operand.
3057 MachineInstr &Phi) const {
3058 if (!Phi.isPHI())
3059 return false;
3060 assert(Phi.isPHI() && "Expecting a Phi.");
3061 SUnit *DefSU = SSD->getSUnit(&Phi);
3062 unsigned DefCycle = cycleScheduled(DefSU);
3063 int DefStage = stageScheduled(DefSU);
3064
3065 unsigned InitVal = 0;
3066 unsigned LoopVal = 0;
3067 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3068 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3069 if (!UseSU)
3070 return true;
3071 if (UseSU->getInstr()->isPHI())
3072 return true;
3073 unsigned LoopCycle = cycleScheduled(UseSU);
3074 int LoopStage = stageScheduled(UseSU);
3075 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3076}
3077
3078/// Return true if the instruction is a definition that is loop carried
3079/// and defines the use on the next iteration.
3080/// v1 = phi(v2, v3)
3081/// (Def) v3 = op v1
3082/// (MO) = v1
3083/// If MO appears before Def, then v1 and v3 may get assigned to the same
3084/// register.
3086 MachineInstr *Def,
3087 MachineOperand &MO) const {
3088 if (!MO.isReg())
3089 return false;
3090 if (Def->isPHI())
3091 return false;
3092 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3093 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3094 return false;
3095 if (!isLoopCarried(SSD, *Phi))
3096 return false;
3097 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3098 for (MachineOperand &DMO : Def->all_defs()) {
3099 if (DMO.getReg() == LoopReg)
3100 return true;
3101 }
3102 return false;
3103}
3104
3105/// Return true if all scheduled predecessors are loop-carried output/order
3106/// dependencies.
3108 SUnit *SU, const SwingSchedulerDDG *DDG) const {
3109 for (const auto &IE : DDG->getInEdges(SU))
3110 if (InstrToCycle.count(IE.getSrc()))
3111 return false;
3112 return true;
3113}
3114
3115/// Determine transitive dependences of unpipelineable instructions
3118 SmallSet<SUnit *, 8> DoNotPipeline;
3119 SmallVector<SUnit *, 8> Worklist;
3120
3121 for (auto &SU : SSD->SUnits)
3122 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3123 Worklist.push_back(&SU);
3124
3125 const SwingSchedulerDDG *DDG = SSD->getDDG();
3126 while (!Worklist.empty()) {
3127 auto SU = Worklist.pop_back_val();
3128 if (DoNotPipeline.count(SU))
3129 continue;
3130 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3131 DoNotPipeline.insert(SU);
3132 for (const auto &IE : DDG->getInEdges(SU))
3133 Worklist.push_back(IE.getSrc());
3134
3135 // To preserve previous behavior and prevent regression
3136 // FIXME: Remove if this doesn't have significant impact on
3137 for (const auto &OE : DDG->getOutEdges(SU))
3138 if (OE.getDistance() == 1)
3139 Worklist.push_back(OE.getDst());
3140 }
3141 return DoNotPipeline;
3142}
3143
3144// Determine all instructions upon which any unpipelineable instruction depends
3145// and ensure that they are in stage 0. If unable to do so, return false.
3149
3150 int NewLastCycle = INT_MIN;
3151 for (SUnit &SU : SSD->SUnits) {
3152 if (!SU.isInstr())
3153 continue;
3154 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3155 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3156 continue;
3157 }
3158
3159 // Put the non-pipelined instruction as early as possible in the schedule
3160 int NewCycle = getFirstCycle();
3161 for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
3162 if (IE.getDistance() == 0)
3163 NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);
3164
3165 // To preserve previous behavior and prevent regression
3166 // FIXME: Remove if this doesn't have significant impact on performance
3167 for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
3168 if (OE.getDistance() == 1)
3169 NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);
3170
3171 int OldCycle = InstrToCycle[&SU];
3172 if (OldCycle != NewCycle) {
3173 InstrToCycle[&SU] = NewCycle;
3174 auto &OldS = getInstructions(OldCycle);
3175 llvm::erase(OldS, &SU);
3176 getInstructions(NewCycle).emplace_back(&SU);
3177 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3178 << ") is not pipelined; moving from cycle " << OldCycle
3179 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3180 }
3181 NewLastCycle = std::max(NewLastCycle, NewCycle);
3182 }
3183 LastCycle = NewLastCycle;
3184 return true;
3185}
3186
3187// Check if the generated schedule is valid. This function checks if
3188// an instruction that uses a physical register is scheduled in a
3189// different stage than the definition. The pipeliner does not handle
3190// physical register values that may cross a basic block boundary.
3191// Furthermore, if a physical def/use pair is assigned to the same
3192// cycle, orderDependence does not guarantee def/use ordering, so that
3193// case should be considered invalid. (The test checks for both
3194// earlier and same-cycle use to be more robust.)
3196 for (SUnit &SU : SSD->SUnits) {
3197 if (!SU.hasPhysRegDefs)
3198 continue;
3199 int StageDef = stageScheduled(&SU);
3200 int CycleDef = InstrToCycle[&SU];
3201 assert(StageDef != -1 && "Instruction should have been scheduled.");
3202 for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
3203 SUnit *Dst = OE.getDst();
3204 if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
3205 if (OE.getReg().isPhysical()) {
3206 if (stageScheduled(Dst) != StageDef)
3207 return false;
3208 if (InstrToCycle[Dst] <= CycleDef)
3209 return false;
3210 }
3211 }
3212 }
3213 return true;
3214}
3215
3216/// A property of the node order in swing-modulo-scheduling is
3217/// that for nodes outside circuits the following holds:
3218/// none of them is scheduled after both a successor and a
3219/// predecessor.
3220/// The method below checks whether the property is met.
3221/// If not, debug information is printed and statistics information updated.
3222/// Note that we do not use an assert statement.
3223/// The reason is that although an invalid node order may prevent
3224/// the pipeliner from finding a pipelined schedule for arbitrary II,
3225/// it does not lead to the generation of incorrect code.
3226void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3227
3228 // a sorted vector that maps each SUnit to its index in the NodeOrder
3229 typedef std::pair<SUnit *, unsigned> UnitIndex;
3230 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3231
3232 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3233 Indices.push_back(std::make_pair(NodeOrder[i], i));
3234
3235 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3236 return std::get<0>(i1) < std::get<0>(i2);
3237 };
3238
3239 // sort, so that we can perform a binary search
3240 llvm::sort(Indices, CompareKey);
3241
3242 bool Valid = true;
3243 (void)Valid;
3244 // for each SUnit in the NodeOrder, check whether
3245 // it appears after both a successor and a predecessor
3246 // of the SUnit. If this is the case, and the SUnit
3247 // is not part of circuit, then the NodeOrder is not
3248 // valid.
3249 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3250 SUnit *SU = NodeOrder[i];
3251 unsigned Index = i;
3252
3253 bool PredBefore = false;
3254 bool SuccBefore = false;
3255
3256 SUnit *Succ;
3257 SUnit *Pred;
3258 (void)Succ;
3259 (void)Pred;
3260
3261 for (const auto &IE : DDG->getInEdges(SU)) {
3262 SUnit *PredSU = IE.getSrc();
3263 unsigned PredIndex = std::get<1>(
3264 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3265 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3266 PredBefore = true;
3267 Pred = PredSU;
3268 break;
3269 }
3270 }
3271
3272 for (const auto &OE : DDG->getOutEdges(SU)) {
3273 SUnit *SuccSU = OE.getDst();
3274 // Do not process a boundary node, it was not included in NodeOrder,
3275 // hence not in Indices either, call to std::lower_bound() below will
3276 // return Indices.end().
3277 if (SuccSU->isBoundaryNode())
3278 continue;
3279 unsigned SuccIndex = std::get<1>(
3280 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3281 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3282 SuccBefore = true;
3283 Succ = SuccSU;
3284 break;
3285 }
3286 }
3287
3288 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3289 // instructions in circuits are allowed to be scheduled
3290 // after both a successor and predecessor.
3291 bool InCircuit = llvm::any_of(
3292 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3293 if (InCircuit)
3294 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3295 else {
3296 Valid = false;
3297 NumNodeOrderIssues++;
3298 LLVM_DEBUG(dbgs() << "Predecessor ");
3299 }
3300 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3301 << " are scheduled before node " << SU->NodeNum
3302 << "\n");
3303 }
3304 }
3305
3306 LLVM_DEBUG({
3307 if (!Valid)
3308 dbgs() << "Invalid node order found!\n";
3309 });
3310}
3311
3312/// Attempt to fix the degenerate cases when the instruction serialization
3313/// causes the register lifetimes to overlap. For example,
3314/// p' = store_pi(p, b)
3315/// = load p, offset
3316/// In this case p and p' overlap, which means that two registers are needed.
3317/// Instead, this function changes the load to use p' and updates the offset.
3318void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3319 unsigned OverlapReg = 0;
3320 unsigned NewBaseReg = 0;
3321 for (SUnit *SU : Instrs) {
3322 MachineInstr *MI = SU->getInstr();
3323 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3324 const MachineOperand &MO = MI->getOperand(i);
3325 // Look for an instruction that uses p. The instruction occurs in the
3326 // same cycle but occurs later in the serialized order.
3327 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3328 // Check that the instruction appears in the InstrChanges structure,
3329 // which contains instructions that can have the offset updated.
3331 InstrChanges.find(SU);
3332 if (It != InstrChanges.end()) {
3333 unsigned BasePos, OffsetPos;
3334 // Update the base register and adjust the offset.
3335 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3337 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3338 int64_t NewOffset =
3339 MI->getOperand(OffsetPos).getImm() - It->second.second;
3340 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3341 SU->setInstr(NewMI);
3342 MISUnitMap[NewMI] = SU;
3343 NewMIs[MI] = NewMI;
3344 }
3345 }
3346 OverlapReg = 0;
3347 NewBaseReg = 0;
3348 break;
3349 }
3350 // Look for an instruction of the form p' = op(p), which uses and defines
3351 // two virtual registers that get allocated to the same physical register.
3352 unsigned TiedUseIdx = 0;
3353 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3354 // OverlapReg is p in the example above.
3355 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3356 // NewBaseReg is p' in the example above.
3357 NewBaseReg = MI->getOperand(i).getReg();
3358 break;
3359 }
3360 }
3361 }
3362}
3363
3364std::deque<SUnit *>
3366 const std::deque<SUnit *> &Instrs) const {
3367 std::deque<SUnit *> NewOrderPhi;
3368 for (SUnit *SU : Instrs) {
3369 if (SU->getInstr()->isPHI())
3370 NewOrderPhi.push_back(SU);
3371 }
3372 std::deque<SUnit *> NewOrderI;
3373 for (SUnit *SU : Instrs) {
3374 if (!SU->getInstr()->isPHI())
3375 orderDependence(SSD, SU, NewOrderI);
3376 }
3377 llvm::append_range(NewOrderPhi, NewOrderI);
3378 return NewOrderPhi;
3379}
3380
3381/// After the schedule has been formed, call this function to combine
3382/// the instructions from the different stages/cycles. That is, this
3383/// function creates a schedule that represents a single iteration.
3385 // Move all instructions to the first stage from later stages.
3386 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3387 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3388 ++stage) {
3389 std::deque<SUnit *> &cycleInstrs =
3390 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3391 for (SUnit *SU : llvm::reverse(cycleInstrs))
3392 ScheduledInstrs[cycle].push_front(SU);
3393 }
3394 }
3395
3396 // Erase all the elements in the later stages. Only one iteration should
3397 // remain in the scheduled list, and it contains all the instructions.
3398 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3399 ScheduledInstrs.erase(cycle);
3400
3401 // Change the registers in instruction as specified in the InstrChanges
3402 // map. We need to use the new registers to create the correct order.
3403 for (const SUnit &SU : SSD->SUnits)
3404 SSD->applyInstrChange(SU.getInstr(), *this);
3405
3406 // Reorder the instructions in each cycle to fix and improve the
3407 // generated code.
3408 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3409 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3410 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3411 SSD->fixupRegisterOverlaps(cycleInstrs);
3412 }
3413
3414 LLVM_DEBUG(dump(););
3415}
3416
3418 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3419 << " depth " << MaxDepth << " col " << Colocate << "\n";
3420 for (const auto &I : Nodes)
3421 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3422 os << "\n";
3423}
3424
3425#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3426/// Print the schedule information to the given output.
3428 // Iterate over each cycle.
3429 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3430 // Iterate over each instruction in the cycle.
3431 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3432 for (SUnit *CI : cycleInstrs->second) {
3433 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3434 os << "(" << CI->NodeNum << ") ";
3435 CI->getInstr()->print(os);
3436 os << "\n";
3437 }
3438 }
3439}
3440
3441/// Utility function used for debugging to print the schedule.
3444
3445void ResourceManager::dumpMRT() const {
3446 LLVM_DEBUG({
3447 if (UseDFA)
3448 return;
3449 std::stringstream SS;
3450 SS << "MRT:\n";
3451 SS << std::setw(4) << "Slot";
3452 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3453 SS << std::setw(3) << I;
3454 SS << std::setw(7) << "#Mops"
3455 << "\n";
3456 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3457 SS << std::setw(4) << Slot;
3458 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3459 SS << std::setw(3) << MRT[Slot][I];
3460 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3461 }
3462 dbgs() << SS.str();
3463 });
3464}
3465#endif
3466
3468 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3469 unsigned ProcResourceID = 0;
3470
3471 // We currently limit the resource kinds to 64 and below so that we can use
3472 // uint64_t for Masks
3473 assert(SM.getNumProcResourceKinds() < 64 &&
3474 "Too many kinds of resources, unsupported");
3475 // Create a unique bitmask for every processor resource unit.
3476 // Skip resource at index 0, since it always references 'InvalidUnit'.
3477 Masks.resize(SM.getNumProcResourceKinds());
3478 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3480 if (Desc.SubUnitsIdxBegin)
3481 continue;
3482 Masks[I] = 1ULL << ProcResourceID;
3483 ProcResourceID++;
3484 }
3485 // Create a unique bitmask for every processor resource group.
3486 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3488 if (!Desc.SubUnitsIdxBegin)
3489 continue;
3490 Masks[I] = 1ULL << ProcResourceID;
3491 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3492 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3493 ProcResourceID++;
3494 }
3495 LLVM_DEBUG({
3496 if (SwpShowResMask) {
3497 dbgs() << "ProcResourceDesc:\n";
3498 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3499 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3500 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3501 ProcResource->Name, I, Masks[I],
3502 ProcResource->NumUnits);
3503 }
3504 dbgs() << " -----------------\n";
3505 }
3506 });
3507}
3508
3510 LLVM_DEBUG({
3511 if (SwpDebugResource)
3512 dbgs() << "canReserveResources:\n";
3513 });
3514 if (UseDFA)
3515 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3516 ->canReserveResources(&SU.getInstr()->getDesc());
3517
3518 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3519 if (!SCDesc->isValid()) {
3520 LLVM_DEBUG({
3521 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3522 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3523 });
3524 return true;
3525 }
3526
3527 reserveResources(SCDesc, Cycle);
3528 bool Result = !isOverbooked();
3529 unreserveResources(SCDesc, Cycle);
3530
3531 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3532 return Result;
3533}
3534
3535void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3536 LLVM_DEBUG({
3537 if (SwpDebugResource)
3538 dbgs() << "reserveResources:\n";
3539 });
3540 if (UseDFA)
3541 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3542 ->reserveResources(&SU.getInstr()->getDesc());
3543
3544 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3545 if (!SCDesc->isValid()) {
3546 LLVM_DEBUG({
3547 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3548 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3549 });
3550 return;
3551 }
3552
3553 reserveResources(SCDesc, Cycle);
3554
3555 LLVM_DEBUG({
3556 if (SwpDebugResource) {
3557 dumpMRT();
3558 dbgs() << "reserveResources: done!\n\n";
3559 }
3560 });
3561}
3562
3563void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3564 int Cycle) {
3565 assert(!UseDFA);
3566 for (const MCWriteProcResEntry &PRE : make_range(
3567 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3568 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3569 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3570
3571 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3572 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3573}
3574
3575void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3576 int Cycle) {
3577 assert(!UseDFA);
3578 for (const MCWriteProcResEntry &PRE : make_range(
3579 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3580 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3581 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3582
3583 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3584 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3585}
3586
3587bool ResourceManager::isOverbooked() const {
3588 assert(!UseDFA);
3589 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3590 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3592 if (MRT[Slot][I] > Desc->NumUnits)
3593 return true;
3594 }
3595 if (NumScheduledMops[Slot] > IssueWidth)
3596 return true;
3597 }
3598 return false;
3599}
3600
3601int ResourceManager::calculateResMIIDFA() const {
3602 assert(UseDFA);
3603
3604 // Sort the instructions by the number of available choices for scheduling,
3605 // least to most. Use the number of critical resources as the tie breaker.
3606 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3607 for (SUnit &SU : DAG->SUnits)
3608 FUS.calcCriticalResources(*SU.getInstr());
3610 FuncUnitOrder(FUS);
3611
3612 for (SUnit &SU : DAG->SUnits)
3613 FuncUnitOrder.push(SU.getInstr());
3614
3616 Resources.push_back(
3617 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3618
3619 while (!FuncUnitOrder.empty()) {
3620 MachineInstr *MI = FuncUnitOrder.top();
3621 FuncUnitOrder.pop();
3622 if (TII->isZeroCost(MI->getOpcode()))
3623 continue;
3624
3625 // Attempt to reserve the instruction in an existing DFA. At least one
3626 // DFA is needed for each cycle.
3627 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3628 unsigned ReservedCycles = 0;
3629 auto *RI = Resources.begin();
3630 auto *RE = Resources.end();
3631 LLVM_DEBUG({
3632 dbgs() << "Trying to reserve resource for " << NumCycles
3633 << " cycles for \n";
3634 MI->dump();
3635 });
3636 for (unsigned C = 0; C < NumCycles; ++C)
3637 while (RI != RE) {
3638 if ((*RI)->canReserveResources(*MI)) {
3639 (*RI)->reserveResources(*MI);
3640 ++ReservedCycles;
3641 break;
3642 }
3643 RI++;
3644 }
3645 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3646 << ", NumCycles:" << NumCycles << "\n");
3647 // Add new DFAs, if needed, to reserve resources.
3648 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3650 << "NewResource created to reserve resources"
3651 << "\n");
3652 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3653 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3654 NewResource->reserveResources(*MI);
3655 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3656 }
3657 }
3658
3659 int Resmii = Resources.size();
3660 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3661 return Resmii;
3662}
3663
3665 if (UseDFA)
3666 return calculateResMIIDFA();
3667
3668 // Count each resource consumption and divide it by the number of units.
3669 // ResMII is the max value among them.
3670
3671 int NumMops = 0;
3673 for (SUnit &SU : DAG->SUnits) {
3674 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3675 continue;
3676
3677 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3678 if (!SCDesc->isValid())
3679 continue;
3680
3681 LLVM_DEBUG({
3682 if (SwpDebugResource) {
3683 DAG->dumpNode(SU);
3684 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3685 << " WriteProcRes: ";
3686 }
3687 });
3688 NumMops += SCDesc->NumMicroOps;
3689 for (const MCWriteProcResEntry &PRE :
3690 make_range(STI->getWriteProcResBegin(SCDesc),
3691 STI->getWriteProcResEnd(SCDesc))) {
3692 LLVM_DEBUG({
3693 if (SwpDebugResource) {
3694 const MCProcResourceDesc *Desc =
3695 SM.getProcResource(PRE.ProcResourceIdx);
3696 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3697 }
3698 });
3699 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3700 }
3701 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3702 }
3703
3704 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3705 LLVM_DEBUG({
3706 if (SwpDebugResource)
3707 dbgs() << "#Mops: " << NumMops << ", "
3708 << "IssueWidth: " << IssueWidth << ", "
3709 << "Cycles: " << Result << "\n";
3710 });
3711
3712 LLVM_DEBUG({
3713 if (SwpDebugResource) {
3714 std::stringstream SS;
3715 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3716 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3717 << "\n";
3718 dbgs() << SS.str();
3719 }
3720 });
3721 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3723 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3724 LLVM_DEBUG({
3725 if (SwpDebugResource) {
3726 std::stringstream SS;
3727 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3728 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3729 << std::setw(10) << Cycles << "\n";
3730 dbgs() << SS.str();
3731 }
3732 });
3733 if (Cycles > Result)
3734 Result = Cycles;
3735 }
3736 return Result;
3737}
3738
3740 InitiationInterval = II;
3741 DFAResources.clear();
3742 DFAResources.resize(II);
3743 for (auto &I : DFAResources)
3744 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3745 MRT.clear();
3747 NumScheduledMops.clear();
3748 NumScheduledMops.resize(II);
3749}
3750
3751bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
3752 if (Pred.isArtificial() || Dst->isBoundaryNode())
3753 return true;
3754 // Currently, dependence that is an anti-dependences but not a loop-carried is
3755 // also ignored. This behavior is preserved to prevent regression.
3756 // FIXME: Remove if this doesn't have significant impact on performance
3757 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
3758}
3759
3760SwingSchedulerDDG::SwingSchedulerDDGEdges &
3761SwingSchedulerDDG::getEdges(const SUnit *SU) {
3762 if (SU == EntrySU)
3763 return EntrySUEdges;
3764 if (SU == ExitSU)
3765 return ExitSUEdges;
3766 return EdgesVec[SU->NodeNum];
3767}
3768
3769const SwingSchedulerDDG::SwingSchedulerDDGEdges &
3770SwingSchedulerDDG::getEdges(const SUnit *SU) const {
3771 if (SU == EntrySU)
3772 return EntrySUEdges;
3773 if (SU == ExitSU)
3774 return ExitSUEdges;
3775 return EdgesVec[SU->NodeNum];
3776}
3777
3778void SwingSchedulerDDG::addEdge(const SUnit *SU,
3779 const SwingSchedulerDDGEdge &Edge) {
3780 auto &Edges = getEdges(SU);
3781 if (Edge.getSrc() == SU)
3782 Edges.Succs.push_back(Edge);
3783 else
3784 Edges.Preds.push_back(Edge);
3785}
3786
3787void SwingSchedulerDDG::initEdges(SUnit *SU) {
3788 for (const auto &PI : SU->Preds) {
3789 SwingSchedulerDDGEdge Edge(SU, PI, false);
3790 addEdge(SU, Edge);
3791 }
3792
3793 for (const auto &SI : SU->Succs) {
3794 SwingSchedulerDDGEdge Edge(SU, SI, true);
3795 addEdge(SU, Edge);
3796 }
3797}
3798
3799SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
3800 SUnit *ExitSU)
3801 : EntrySU(EntrySU), ExitSU(ExitSU) {
3802 EdgesVec.resize(SUnits.size());
3803
3804 initEdges(EntrySU);
3805 initEdges(ExitSU);
3806 for (auto &SU : SUnits)
3807 initEdges(&SU);
3808}
3809
3812 return getEdges(SU).Preds;
3813}
3814
3817 return getEdges(SU).Succs;
3818}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1073
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1434
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1432
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1440
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:895
A single uniqued string.
Definition: Metadata.h:724
StringRef getString() const
Definition: Metadata.cpp:616
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:71
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:577
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:349
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:574
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:808
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:938
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:587
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:764
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
void dump() const
Definition: Pass.cpp:136
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< VRegMaskOrUnit > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
SUnit EntrySU
Special node for the region entry.
Definition: ScheduleDAG.h:580
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:581
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Represents a dependence between two instruction.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool isArtificial() const
Returns true if the edge represents an artificial dependence.
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
bool isOrderDep() const
Returns true if the edge represents a dependence that is not data, anti or output dependence.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
bool isOutputDep() const
Returns true if the edge represents output dependence.
Represents dependencies between instructions.
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU)
const EdgesType & getInEdges(const SUnit *SU) const
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
unsigned getPosition() const
Definition: CommandLine.h:306
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:48
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn off window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:34
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:121
bool isValid() const
Definition: MCSchedule.h:139
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:256
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:363
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:352
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:337
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:356
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:66
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
const TargetPassConfig * PassConfig
const MachineDominatorTree * MDT
RegisterClassInfo * RegClassInfo
const MachineLoopInfo * MLI
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.