LLVM 20.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
73#include "llvm/Config/llvm-config.h"
74#include "llvm/IR/Attributes.h"
75#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
79#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
84#include <algorithm>
85#include <cassert>
86#include <climits>
87#include <cstdint>
88#include <deque>
89#include <functional>
90#include <iomanip>
91#include <iterator>
92#include <map>
93#include <memory>
94#include <sstream>
95#include <tuple>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
101#define DEBUG_TYPE "pipeliner"
102
103STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
104STATISTIC(NumPipelined, "Number of loops software pipelined");
105STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
106STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
107STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
108STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
109STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
110STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
111STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
112STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
113STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197namespace llvm {
198
199// A command line option to enable the CopyToPhi DAG mutation.
201 cl::init(true),
202 cl::desc("Enable CopyToPhi DAG Mutation"));
203
204/// A command line argument to force pipeliner to use specified issue
205/// width.
207 "pipeliner-force-issue-width",
208 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
209 cl::init(-1));
210
211/// A command line argument to set the window scheduling option.
214 cl::desc("Set how to use window scheduling algorithm."),
216 "Turn off window algorithm."),
218 "Use window algorithm after SMS algorithm fails."),
220 "Use window algorithm instead of SMS algorithm.")));
221
222} // end namespace llvm
223
224unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
225char MachinePipeliner::ID = 0;
226#ifndef NDEBUG
228#endif
230
232 "Modulo Software Pipelining", false, false)
238 "Modulo Software Pipelining", false, false)
239
240/// The "main" function for implementing Swing Modulo Scheduling.
241bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
242 if (skipFunction(mf.getFunction()))
243 return false;
244
245 if (!EnableSWP)
246 return false;
247
248 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
249 !EnableSWPOptSize.getPosition())
250 return false;
251
252 if (!mf.getSubtarget().enableMachinePipeliner())
253 return false;
254
255 // Cannot pipeline loops without instruction itineraries if we are using
256 // DFA for the pipeliner.
257 if (mf.getSubtarget().useDFAforSMS() &&
258 (!mf.getSubtarget().getInstrItineraryData() ||
259 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
260 return false;
261
262 MF = &mf;
263 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
264 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
265 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
266 TII = MF->getSubtarget().getInstrInfo();
267 RegClassInfo.runOnMachineFunction(*MF);
268
269 for (const auto &L : *MLI)
270 scheduleLoop(*L);
271
272 return false;
273}
274
275/// Attempt to perform the SMS algorithm on the specified loop. This function is
276/// the main entry point for the algorithm. The function identifies candidate
277/// loops, calculates the minimum initiation interval, and attempts to schedule
278/// the loop.
279bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
280 bool Changed = false;
281 for (const auto &InnerLoop : L)
282 Changed |= scheduleLoop(*InnerLoop);
283
284#ifndef NDEBUG
285 // Stop trying after reaching the limit (if any).
286 int Limit = SwpLoopLimit;
287 if (Limit >= 0) {
288 if (NumTries >= SwpLoopLimit)
289 return Changed;
290 NumTries++;
291 }
292#endif
293
294 setPragmaPipelineOptions(L);
295 if (!canPipelineLoop(L)) {
296 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
297 ORE->emit([&]() {
298 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
299 L.getStartLoc(), L.getHeader())
300 << "Failed to pipeline loop";
301 });
302
303 LI.LoopPipelinerInfo.reset();
304 return Changed;
305 }
306
307 ++NumTrytoPipeline;
308 if (useSwingModuloScheduler())
309 Changed = swingModuloScheduler(L);
310
311 if (useWindowScheduler(Changed))
312 Changed = runWindowScheduler(L);
313
314 LI.LoopPipelinerInfo.reset();
315 return Changed;
316}
317
318void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
319 // Reset the pragma for the next loop in iteration.
320 disabledByPragma = false;
321 II_setByPragma = 0;
322
323 MachineBasicBlock *LBLK = L.getTopBlock();
324
325 if (LBLK == nullptr)
326 return;
327
328 const BasicBlock *BBLK = LBLK->getBasicBlock();
329 if (BBLK == nullptr)
330 return;
331
332 const Instruction *TI = BBLK->getTerminator();
333 if (TI == nullptr)
334 return;
335
336 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
337 if (LoopID == nullptr)
338 return;
339
340 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
341 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
342
343 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
344 MDNode *MD = dyn_cast<MDNode>(MDO);
345
346 if (MD == nullptr)
347 continue;
348
349 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
350
351 if (S == nullptr)
352 continue;
353
354 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
355 assert(MD->getNumOperands() == 2 &&
356 "Pipeline initiation interval hint metadata should have two operands.");
358 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
359 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
360 } else if (S->getString() == "llvm.loop.pipeline.disable") {
361 disabledByPragma = true;
362 }
363 }
364}
365
366/// Return true if the loop can be software pipelined. The algorithm is
367/// restricted to loops with a single basic block. Make sure that the
368/// branch in the loop can be analyzed.
369bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
370 if (L.getNumBlocks() != 1) {
371 ORE->emit([&]() {
372 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
373 L.getStartLoc(), L.getHeader())
374 << "Not a single basic block: "
375 << ore::NV("NumBlocks", L.getNumBlocks());
376 });
377 return false;
378 }
379
380 if (disabledByPragma) {
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "Disabled by Pragma.";
385 });
386 return false;
387 }
388
389 // Check if the branch can't be understood because we can't do pipelining
390 // if that's the case.
391 LI.TBB = nullptr;
392 LI.FBB = nullptr;
393 LI.BrCond.clear();
394 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
395 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
396 NumFailBranch++;
397 ORE->emit([&]() {
398 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
399 L.getStartLoc(), L.getHeader())
400 << "The branch can't be understood";
401 });
402 return false;
403 }
404
405 LI.LoopInductionVar = nullptr;
406 LI.LoopCompare = nullptr;
408 if (!LI.LoopPipelinerInfo) {
409 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
410 NumFailLoop++;
411 ORE->emit([&]() {
412 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
413 L.getStartLoc(), L.getHeader())
414 << "The loop structure is not supported";
415 });
416 return false;
417 }
418
419 if (!L.getLoopPreheader()) {
420 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
421 NumFailPreheader++;
422 ORE->emit([&]() {
423 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
424 L.getStartLoc(), L.getHeader())
425 << "No loop preheader found";
426 });
427 return false;
428 }
429
430 // Remove any subregisters from inputs to phi nodes.
431 preprocessPhiNodes(*L.getHeader());
432 return true;
433}
434
435void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
437 SlotIndexes &Slots =
438 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
439
440 for (MachineInstr &PI : B.phis()) {
441 MachineOperand &DefOp = PI.getOperand(0);
442 assert(DefOp.getSubReg() == 0);
443 auto *RC = MRI.getRegClass(DefOp.getReg());
444
445 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
446 MachineOperand &RegOp = PI.getOperand(i);
447 if (RegOp.getSubReg() == 0)
448 continue;
449
450 // If the operand uses a subregister, replace it with a new register
451 // without subregisters, and generate a copy to the new register.
452 Register NewReg = MRI.createVirtualRegister(RC);
453 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
455 const DebugLoc &DL = PredB.findDebugLoc(At);
456 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
457 .addReg(RegOp.getReg(), getRegState(RegOp),
458 RegOp.getSubReg());
459 Slots.insertMachineInstrInMaps(*Copy);
460 RegOp.setReg(NewReg);
461 RegOp.setSubReg(0);
462 }
463 }
464}
465
466/// The SMS algorithm consists of the following main steps:
467/// 1. Computation and analysis of the dependence graph.
468/// 2. Ordering of the nodes (instructions).
469/// 3. Attempt to Schedule the loop.
470bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
471 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
472
474 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
476
477 MachineBasicBlock *MBB = L.getHeader();
478 // The kernel should not include any terminator instructions. These
479 // will be added back later.
480 SMS.startBlock(MBB);
481
482 // Compute the number of 'real' instructions in the basic block by
483 // ignoring terminators.
484 unsigned size = MBB->size();
486 E = MBB->instr_end();
487 I != E; ++I, --size)
488 ;
489
490 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
491 SMS.schedule();
492 SMS.exitRegion();
493
494 SMS.finishBlock();
495 return SMS.hasNewSchedule();
496}
497
507}
508
509bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
510 MachineSchedContext Context;
511 Context.MF = MF;
512 Context.MLI = MLI;
513 Context.MDT = MDT;
514 Context.PassConfig = &getAnalysis<TargetPassConfig>();
515 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
516 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
518 WindowScheduler WS(&Context, L);
519 return WS.run();
520}
521
522bool MachinePipeliner::useSwingModuloScheduler() {
523 // SwingModuloScheduler does not work when WindowScheduler is forced.
525}
526
527bool MachinePipeliner::useWindowScheduler(bool Changed) {
528 // WindowScheduler does not work for following cases:
529 // 1. when it is off.
530 // 2. when SwingModuloScheduler is successfully scheduled.
531 // 3. when pragma II is enabled.
532 if (II_setByPragma) {
533 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
534 "llvm.loop.pipeline.initiationinterval is set.\n");
535 return false;
536 }
537
540}
541
542void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
543 if (SwpForceII > 0)
544 MII = SwpForceII;
545 else if (II_setByPragma > 0)
546 MII = II_setByPragma;
547 else
548 MII = std::max(ResMII, RecMII);
549}
550
551void SwingSchedulerDAG::setMAX_II() {
552 if (SwpForceII > 0)
553 MAX_II = SwpForceII;
554 else if (II_setByPragma > 0)
555 MAX_II = II_setByPragma;
556 else
557 MAX_II = MII + SwpIISearchRange;
558}
559
560/// We override the schedule function in ScheduleDAGInstrs to implement the
561/// scheduling part of the Swing Modulo Scheduling algorithm.
563 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
564 buildSchedGraph(AA);
565 addLoopCarriedDependences(AA);
566 updatePhiDependences();
568 changeDependences();
569 postProcessDAG();
570 DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
571 LLVM_DEBUG(dump());
572
573 NodeSetType NodeSets;
574 findCircuits(NodeSets);
575 NodeSetType Circuits = NodeSets;
576
577 // Calculate the MII.
578 unsigned ResMII = calculateResMII();
579 unsigned RecMII = calculateRecMII(NodeSets);
580
581 fuseRecs(NodeSets);
582
583 // This flag is used for testing and can cause correctness problems.
584 if (SwpIgnoreRecMII)
585 RecMII = 0;
586
587 setMII(ResMII, RecMII);
588 setMAX_II();
589
590 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
591 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
592
593 // Can't schedule a loop without a valid MII.
594 if (MII == 0) {
595 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
596 NumFailZeroMII++;
597 Pass.ORE->emit([&]() {
599 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
600 << "Invalid Minimal Initiation Interval: 0";
601 });
602 return;
603 }
604
605 // Don't pipeline large loops.
606 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
607 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
608 << ", we don't pipeline large loops\n");
609 NumFailLargeMaxMII++;
610 Pass.ORE->emit([&]() {
612 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
613 << "Minimal Initiation Interval too large: "
614 << ore::NV("MII", (int)MII) << " > "
615 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
616 << "Refer to -pipeliner-max-mii.";
617 });
618 return;
619 }
620
621 computeNodeFunctions(NodeSets);
622
623 registerPressureFilter(NodeSets);
624
625 colocateNodeSets(NodeSets);
626
627 checkNodeSets(NodeSets);
628
629 LLVM_DEBUG({
630 for (auto &I : NodeSets) {
631 dbgs() << " Rec NodeSet ";
632 I.dump();
633 }
634 });
635
636 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
637
638 groupRemainingNodes(NodeSets);
639
640 removeDuplicateNodes(NodeSets);
641
642 LLVM_DEBUG({
643 for (auto &I : NodeSets) {
644 dbgs() << " NodeSet ";
645 I.dump();
646 }
647 });
648
649 computeNodeOrder(NodeSets);
650
651 // check for node order issues
652 checkValidNodeOrder(Circuits);
653
654 SMSchedule Schedule(Pass.MF, this);
655 Scheduled = schedulePipeline(Schedule);
656
657 if (!Scheduled){
658 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
659 NumFailNoSchedule++;
660 Pass.ORE->emit([&]() {
662 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
663 << "Unable to find schedule";
664 });
665 return;
666 }
667
668 unsigned numStages = Schedule.getMaxStageCount();
669 // No need to generate pipeline if there are no overlapped iterations.
670 if (numStages == 0) {
671 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
672 NumFailZeroStage++;
673 Pass.ORE->emit([&]() {
675 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
676 << "No need to pipeline - no overlapped iterations in schedule.";
677 });
678 return;
679 }
680 // Check that the maximum stage count is less than user-defined limit.
681 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
682 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
683 << " : too many stages, abort\n");
684 NumFailLargeMaxStage++;
685 Pass.ORE->emit([&]() {
687 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
688 << "Too many stages in schedule: "
689 << ore::NV("numStages", (int)numStages) << " > "
690 << ore::NV("SwpMaxStages", SwpMaxStages)
691 << ". Refer to -pipeliner-max-stages.";
692 });
693 return;
694 }
695
696 Pass.ORE->emit([&]() {
698 Loop.getHeader())
699 << "Pipelined succesfully!";
700 });
701
702 // Generate the schedule as a ModuloSchedule.
703 DenseMap<MachineInstr *, int> Cycles, Stages;
704 std::vector<MachineInstr *> OrderedInsts;
705 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
706 ++Cycle) {
707 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
708 OrderedInsts.push_back(SU->getInstr());
709 Cycles[SU->getInstr()] = Cycle;
710 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
711 }
712 }
714 for (auto &KV : NewMIs) {
715 Cycles[KV.first] = Cycles[KV.second];
716 Stages[KV.first] = Stages[KV.second];
717 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
718 }
719
720 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
721 std::move(Stages));
723 assert(NewInstrChanges.empty() &&
724 "Cannot serialize a schedule with InstrChanges!");
726 MSTI.annotate();
727 return;
728 }
729 // The experimental code generator can't work if there are InstChanges.
730 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
731 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
732 MSE.expand();
733 } else if (MVECodeGen && NewInstrChanges.empty() &&
734 LoopPipelinerInfo->isMVEExpanderSupported() &&
736 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
737 MSE.expand();
738 } else {
739 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
740 MSE.expand();
741 MSE.cleanup();
742 }
743 ++NumPipelined;
744}
745
746/// Clean up after the software pipeliner runs.
748 for (auto &KV : NewMIs)
749 MF.deleteMachineInstr(KV.second);
750 NewMIs.clear();
751
752 // Call the superclass.
754}
755
756/// Return the register values for the operands of a Phi instruction.
757/// This function assume the instruction is a Phi.
759 unsigned &InitVal, unsigned &LoopVal) {
760 assert(Phi.isPHI() && "Expecting a Phi.");
761
762 InitVal = 0;
763 LoopVal = 0;
764 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
765 if (Phi.getOperand(i + 1).getMBB() != Loop)
766 InitVal = Phi.getOperand(i).getReg();
767 else
768 LoopVal = Phi.getOperand(i).getReg();
769
770 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
771}
772
773/// Return the Phi register value that comes the loop block.
774static unsigned getLoopPhiReg(const MachineInstr &Phi,
775 const MachineBasicBlock *LoopBB) {
776 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
777 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
778 return Phi.getOperand(i).getReg();
779 return 0;
780}
781
782/// Return true if SUb can be reached from SUa following the chain edges.
783static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
786 Worklist.push_back(SUa);
787 while (!Worklist.empty()) {
788 const SUnit *SU = Worklist.pop_back_val();
789 for (const auto &SI : SU->Succs) {
790 SUnit *SuccSU = SI.getSUnit();
791 if (SI.getKind() == SDep::Order) {
792 if (Visited.count(SuccSU))
793 continue;
794 if (SuccSU == SUb)
795 return true;
796 Worklist.push_back(SuccSU);
797 Visited.insert(SuccSU);
798 }
799 }
800 }
801 return false;
802}
803
804/// Return true if the instruction causes a chain between memory
805/// references before and after it.
807 return MI.isCall() || MI.mayRaiseFPException() ||
808 MI.hasUnmodeledSideEffects() ||
809 (MI.hasOrderedMemoryRef() &&
810 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
811}
812
813/// Return the underlying objects for the memory references of an instruction.
814/// This function calls the code in ValueTracking, but first checks that the
815/// instruction has a memory operand.
818 if (!MI->hasOneMemOperand())
819 return;
820 MachineMemOperand *MM = *MI->memoperands_begin();
821 if (!MM->getValue())
822 return;
823 getUnderlyingObjects(MM->getValue(), Objs);
824 for (const Value *V : Objs) {
825 if (!isIdentifiedObject(V)) {
826 Objs.clear();
827 return;
828 }
829 }
830}
831
832/// Add a chain edge between a load and store if the store can be an
833/// alias of the load on a subsequent iteration, i.e., a loop carried
834/// dependence. This code is very similar to the code in ScheduleDAGInstrs
835/// but that code doesn't create loop carried dependences.
836void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
840 for (auto &SU : SUnits) {
841 MachineInstr &MI = *SU.getInstr();
843 PendingLoads.clear();
844 else if (MI.mayLoad()) {
847 if (Objs.empty())
849 for (const auto *V : Objs) {
850 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
851 SUs.push_back(&SU);
852 }
853 } else if (MI.mayStore()) {
856 if (Objs.empty())
858 for (const auto *V : Objs) {
860 PendingLoads.find(V);
861 if (I == PendingLoads.end())
862 continue;
863 for (auto *Load : I->second) {
864 if (isSuccOrder(Load, &SU))
865 continue;
866 MachineInstr &LdMI = *Load->getInstr();
867 // First, perform the cheaper check that compares the base register.
868 // If they are the same and the load offset is less than the store
869 // offset, then mark the dependence as loop carried potentially.
870 const MachineOperand *BaseOp1, *BaseOp2;
871 int64_t Offset1, Offset2;
872 bool Offset1IsScalable, Offset2IsScalable;
873 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
874 Offset1IsScalable, TRI) &&
875 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
876 Offset2IsScalable, TRI)) {
877 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
878 Offset1IsScalable == Offset2IsScalable &&
879 (int)Offset1 < (int)Offset2) {
881 "What happened to the chain edge?");
882 SDep Dep(Load, SDep::Barrier);
883 Dep.setLatency(1);
884 SU.addPred(Dep);
885 continue;
886 }
887 }
888 // Second, the more expensive check that uses alias analysis on the
889 // base registers. If they alias, and the load offset is less than
890 // the store offset, the mark the dependence as loop carried.
891 if (!AA) {
892 SDep Dep(Load, SDep::Barrier);
893 Dep.setLatency(1);
894 SU.addPred(Dep);
895 continue;
896 }
897 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
898 MachineMemOperand *MMO2 = *MI.memoperands_begin();
899 if (!MMO1->getValue() || !MMO2->getValue()) {
900 SDep Dep(Load, SDep::Barrier);
901 Dep.setLatency(1);
902 SU.addPred(Dep);
903 continue;
904 }
905 if (MMO1->getValue() == MMO2->getValue() &&
906 MMO1->getOffset() <= MMO2->getOffset()) {
907 SDep Dep(Load, SDep::Barrier);
908 Dep.setLatency(1);
909 SU.addPred(Dep);
910 continue;
911 }
912 if (!AA->isNoAlias(
915 MMO2->getAAInfo()))) {
916 SDep Dep(Load, SDep::Barrier);
917 Dep.setLatency(1);
918 SU.addPred(Dep);
919 }
920 }
921 }
922 }
923 }
924}
925
926/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
927/// processes dependences for PHIs. This function adds true dependences
928/// from a PHI to a use, and a loop carried dependence from the use to the
929/// PHI. The loop carried dependence is represented as an anti dependence
930/// edge. This function also removes chain dependences between unrelated
931/// PHIs.
932void SwingSchedulerDAG::updatePhiDependences() {
933 SmallVector<SDep, 4> RemoveDeps;
935
936 // Iterate over each DAG node.
937 for (SUnit &I : SUnits) {
938 RemoveDeps.clear();
939 // Set to true if the instruction has an operand defined by a Phi.
940 unsigned HasPhiUse = 0;
941 unsigned HasPhiDef = 0;
942 MachineInstr *MI = I.getInstr();
943 // Iterate over each operand, and we process the definitions.
944 for (const MachineOperand &MO : MI->operands()) {
945 if (!MO.isReg())
946 continue;
947 Register Reg = MO.getReg();
948 if (MO.isDef()) {
949 // If the register is used by a Phi, then create an anti dependence.
951 UI = MRI.use_instr_begin(Reg),
952 UE = MRI.use_instr_end();
953 UI != UE; ++UI) {
954 MachineInstr *UseMI = &*UI;
955 SUnit *SU = getSUnit(UseMI);
956 if (SU != nullptr && UseMI->isPHI()) {
957 if (!MI->isPHI()) {
958 SDep Dep(SU, SDep::Anti, Reg);
959 Dep.setLatency(1);
960 I.addPred(Dep);
961 } else {
962 HasPhiDef = Reg;
963 // Add a chain edge to a dependent Phi that isn't an existing
964 // predecessor.
965 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
966 I.addPred(SDep(SU, SDep::Barrier));
967 }
968 }
969 }
970 } else if (MO.isUse()) {
971 // If the register is defined by a Phi, then create a true dependence.
973 if (DefMI == nullptr)
974 continue;
975 SUnit *SU = getSUnit(DefMI);
976 if (SU != nullptr && DefMI->isPHI()) {
977 if (!MI->isPHI()) {
978 SDep Dep(SU, SDep::Data, Reg);
979 Dep.setLatency(0);
980 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
981 &SchedModel);
982 I.addPred(Dep);
983 } else {
984 HasPhiUse = Reg;
985 // Add a chain edge to a dependent Phi that isn't an existing
986 // predecessor.
987 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
988 I.addPred(SDep(SU, SDep::Barrier));
989 }
990 }
991 }
992 }
993 // Remove order dependences from an unrelated Phi.
994 if (!SwpPruneDeps)
995 continue;
996 for (auto &PI : I.Preds) {
997 MachineInstr *PMI = PI.getSUnit()->getInstr();
998 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
999 if (I.getInstr()->isPHI()) {
1000 if (PMI->getOperand(0).getReg() == HasPhiUse)
1001 continue;
1002 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1003 continue;
1004 }
1005 RemoveDeps.push_back(PI);
1006 }
1007 }
1008 for (const SDep &D : RemoveDeps)
1009 I.removePred(D);
1010 }
1011}
1012
1013/// Iterate over each DAG node and see if we can change any dependences
1014/// in order to reduce the recurrence MII.
1015void SwingSchedulerDAG::changeDependences() {
1016 // See if an instruction can use a value from the previous iteration.
1017 // If so, we update the base and offset of the instruction and change
1018 // the dependences.
1019 for (SUnit &I : SUnits) {
1020 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1021 int64_t NewOffset = 0;
1022 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1023 NewOffset))
1024 continue;
1025
1026 // Get the MI and SUnit for the instruction that defines the original base.
1027 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1029 if (!DefMI)
1030 continue;
1031 SUnit *DefSU = getSUnit(DefMI);
1032 if (!DefSU)
1033 continue;
1034 // Get the MI and SUnit for the instruction that defins the new base.
1035 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1036 if (!LastMI)
1037 continue;
1038 SUnit *LastSU = getSUnit(LastMI);
1039 if (!LastSU)
1040 continue;
1041
1042 if (Topo.IsReachable(&I, LastSU))
1043 continue;
1044
1045 // Remove the dependence. The value now depends on a prior iteration.
1047 for (const SDep &P : I.Preds)
1048 if (P.getSUnit() == DefSU)
1049 Deps.push_back(P);
1050 for (const SDep &D : Deps) {
1051 Topo.RemovePred(&I, D.getSUnit());
1052 I.removePred(D);
1053 }
1054 // Remove the chain dependence between the instructions.
1055 Deps.clear();
1056 for (auto &P : LastSU->Preds)
1057 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1058 Deps.push_back(P);
1059 for (const SDep &D : Deps) {
1060 Topo.RemovePred(LastSU, D.getSUnit());
1061 LastSU->removePred(D);
1062 }
1063
1064 // Add a dependence between the new instruction and the instruction
1065 // that defines the new base.
1066 SDep Dep(&I, SDep::Anti, NewBase);
1067 Topo.AddPred(LastSU, &I);
1068 LastSU->addPred(Dep);
1069
1070 // Remember the base and offset information so that we can update the
1071 // instruction during code generation.
1072 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1073 }
1074}
1075
1076/// Create an instruction stream that represents a single iteration and stage of
1077/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1078/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1079/// function is an approximation of SMSchedule::finalizeSchedule with all
1080/// non-const operations removed.
1082 SMSchedule &Schedule,
1083 std::vector<MachineInstr *> &OrderedInsts,
1086
1087 // Move all instructions to the first stage from the later stages.
1088 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1089 ++Cycle) {
1090 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1091 Stage <= LastStage; ++Stage) {
1092 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1093 Cycle + Stage * Schedule.getInitiationInterval()))) {
1094 Instrs[Cycle].push_front(SU);
1095 }
1096 }
1097 }
1098
1099 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1100 ++Cycle) {
1101 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1102 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1103 for (SUnit *SU : CycleInstrs) {
1104 MachineInstr *MI = SU->getInstr();
1105 OrderedInsts.push_back(MI);
1106 Stages[MI] = Schedule.stageScheduled(SU);
1107 }
1108 }
1109}
1110
1111namespace {
1112
1113// FuncUnitSorter - Comparison operator used to sort instructions by
1114// the number of functional unit choices.
1115struct FuncUnitSorter {
1116 const InstrItineraryData *InstrItins;
1117 const MCSubtargetInfo *STI;
1119
1120 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1121 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1122
1123 // Compute the number of functional unit alternatives needed
1124 // at each stage, and take the minimum value. We prioritize the
1125 // instructions by the least number of choices first.
1126 unsigned minFuncUnits(const MachineInstr *Inst,
1127 InstrStage::FuncUnits &F) const {
1128 unsigned SchedClass = Inst->getDesc().getSchedClass();
1129 unsigned min = UINT_MAX;
1130 if (InstrItins && !InstrItins->isEmpty()) {
1131 for (const InstrStage &IS :
1132 make_range(InstrItins->beginStage(SchedClass),
1133 InstrItins->endStage(SchedClass))) {
1134 InstrStage::FuncUnits funcUnits = IS.getUnits();
1135 unsigned numAlternatives = llvm::popcount(funcUnits);
1136 if (numAlternatives < min) {
1137 min = numAlternatives;
1138 F = funcUnits;
1139 }
1140 }
1141 return min;
1142 }
1143 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1144 const MCSchedClassDesc *SCDesc =
1145 STI->getSchedModel().getSchedClassDesc(SchedClass);
1146 if (!SCDesc->isValid())
1147 // No valid Schedule Class Desc for schedClass, should be
1148 // Pseudo/PostRAPseudo
1149 return min;
1150
1151 for (const MCWriteProcResEntry &PRE :
1152 make_range(STI->getWriteProcResBegin(SCDesc),
1153 STI->getWriteProcResEnd(SCDesc))) {
1154 if (!PRE.ReleaseAtCycle)
1155 continue;
1156 const MCProcResourceDesc *ProcResource =
1157 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1158 unsigned NumUnits = ProcResource->NumUnits;
1159 if (NumUnits < min) {
1160 min = NumUnits;
1161 F = PRE.ProcResourceIdx;
1162 }
1163 }
1164 return min;
1165 }
1166 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1167 }
1168
1169 // Compute the critical resources needed by the instruction. This
1170 // function records the functional units needed by instructions that
1171 // must use only one functional unit. We use this as a tie breaker
1172 // for computing the resource MII. The instrutions that require
1173 // the same, highly used, functional unit have high priority.
1174 void calcCriticalResources(MachineInstr &MI) {
1175 unsigned SchedClass = MI.getDesc().getSchedClass();
1176 if (InstrItins && !InstrItins->isEmpty()) {
1177 for (const InstrStage &IS :
1178 make_range(InstrItins->beginStage(SchedClass),
1179 InstrItins->endStage(SchedClass))) {
1180 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1181 if (llvm::popcount(FuncUnits) == 1)
1182 Resources[FuncUnits]++;
1183 }
1184 return;
1185 }
1186 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1187 const MCSchedClassDesc *SCDesc =
1188 STI->getSchedModel().getSchedClassDesc(SchedClass);
1189 if (!SCDesc->isValid())
1190 // No valid Schedule Class Desc for schedClass, should be
1191 // Pseudo/PostRAPseudo
1192 return;
1193
1194 for (const MCWriteProcResEntry &PRE :
1195 make_range(STI->getWriteProcResBegin(SCDesc),
1196 STI->getWriteProcResEnd(SCDesc))) {
1197 if (!PRE.ReleaseAtCycle)
1198 continue;
1199 Resources[PRE.ProcResourceIdx]++;
1200 }
1201 return;
1202 }
1203 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1204 }
1205
1206 /// Return true if IS1 has less priority than IS2.
1207 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1208 InstrStage::FuncUnits F1 = 0, F2 = 0;
1209 unsigned MFUs1 = minFuncUnits(IS1, F1);
1210 unsigned MFUs2 = minFuncUnits(IS2, F2);
1211 if (MFUs1 == MFUs2)
1212 return Resources.lookup(F1) < Resources.lookup(F2);
1213 return MFUs1 > MFUs2;
1214 }
1215};
1216
1217/// Calculate the maximum register pressure of the scheduled instructions stream
1218class HighRegisterPressureDetector {
1219 MachineBasicBlock *OrigMBB;
1220 const MachineRegisterInfo &MRI;
1221 const TargetRegisterInfo *TRI;
1222
1223 const unsigned PSetNum;
1224
1225 // Indexed by PSet ID
1226 // InitSetPressure takes into account the register pressure of live-in
1227 // registers. It's not depend on how the loop is scheduled, so it's enough to
1228 // calculate them once at the beginning.
1229 std::vector<unsigned> InitSetPressure;
1230
1231 // Indexed by PSet ID
1232 // Upper limit for each register pressure set
1233 std::vector<unsigned> PressureSetLimit;
1234
1236
1238
1239public:
1240 using OrderedInstsTy = std::vector<MachineInstr *>;
1241 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1242
1243private:
1244 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1245 if (Pressures.size() == 0) {
1246 dbgs() << "[]";
1247 } else {
1248 char Prefix = '[';
1249 for (unsigned P : Pressures) {
1250 dbgs() << Prefix << P;
1251 Prefix = ' ';
1252 }
1253 dbgs() << ']';
1254 }
1255 }
1256
1257 void dumpPSet(Register Reg) const {
1258 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1259 for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1260 ++PSetIter) {
1261 dbgs() << *PSetIter << ' ';
1262 }
1263 dbgs() << '\n';
1264 }
1265
1266 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1267 Register Reg) const {
1268 auto PSetIter = MRI.getPressureSets(Reg);
1269 unsigned Weight = PSetIter.getWeight();
1270 for (; PSetIter.isValid(); ++PSetIter)
1271 Pressure[*PSetIter] += Weight;
1272 }
1273
1274 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1275 Register Reg) const {
1276 auto PSetIter = MRI.getPressureSets(Reg);
1277 unsigned Weight = PSetIter.getWeight();
1278 for (; PSetIter.isValid(); ++PSetIter) {
1279 auto &P = Pressure[*PSetIter];
1280 assert(P >= Weight &&
1281 "register pressure must be greater than or equal weight");
1282 P -= Weight;
1283 }
1284 }
1285
1286 // Return true if Reg is reserved one, for example, stack pointer
1287 bool isReservedRegister(Register Reg) const {
1288 return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
1289 }
1290
1291 bool isDefinedInThisLoop(Register Reg) const {
1292 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1293 }
1294
1295 // Search for live-in variables. They are factored into the register pressure
1296 // from the begining. Live-in variables used by every iteration should be
1297 // considered as alive throughout the loop. For example, the variable `c` in
1298 // following code. \code
1299 // int c = ...;
1300 // for (int i = 0; i < n; i++)
1301 // a[i] += b[i] + c;
1302 // \endcode
1303 void computeLiveIn() {
1305 for (auto &MI : *OrigMBB) {
1306 if (MI.isDebugInstr())
1307 continue;
1308 for (auto &Use : ROMap[&MI].Uses) {
1309 auto Reg = Use.RegUnit;
1310 // Ignore the variable that appears only on one side of phi instruction
1311 // because it's used only at the first iteration.
1312 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1313 continue;
1314 if (isReservedRegister(Reg))
1315 continue;
1316 if (isDefinedInThisLoop(Reg))
1317 continue;
1318 Used.insert(Reg);
1319 }
1320 }
1321
1322 for (auto LiveIn : Used)
1323 increaseRegisterPressure(InitSetPressure, LiveIn);
1324 }
1325
1326 // Calculate the upper limit of each pressure set
1327 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1328 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1329 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1330 }
1331
1332 // There are two patterns of last-use.
1333 // - by an instruction of the current iteration
1334 // - by a phi instruction of the next iteration (loop carried value)
1335 //
1336 // Furthermore, following two groups of instructions are executed
1337 // simultaneously
1338 // - next iteration's phi instructions in i-th stage
1339 // - current iteration's instructions in i+1-th stage
1340 //
1341 // This function calculates the last-use of each register while taking into
1342 // account the above two patterns.
1343 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1344 Instr2StageTy &Stages) const {
1345 // We treat virtual registers that are defined and used in this loop.
1346 // Following virtual register will be ignored
1347 // - live-in one
1348 // - defined but not used in the loop (potentially live-out)
1349 DenseSet<Register> TargetRegs;
1350 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1351 if (isDefinedInThisLoop(Reg))
1352 TargetRegs.insert(Reg);
1353 };
1354 for (MachineInstr *MI : OrderedInsts) {
1355 if (MI->isPHI()) {
1356 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1357 UpdateTargetRegs(Reg);
1358 } else {
1359 for (auto &Use : ROMap.find(MI)->getSecond().Uses)
1360 UpdateTargetRegs(Use.RegUnit);
1361 }
1362 }
1363
1364 const auto InstrScore = [&Stages](MachineInstr *MI) {
1365 return Stages[MI] + MI->isPHI();
1366 };
1367
1369 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1370 for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
1371 auto Reg = Use.RegUnit;
1372 if (!TargetRegs.contains(Reg))
1373 continue;
1374 auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
1375 if (!Inserted) {
1376 MachineInstr *Orig = Ite->second;
1377 MachineInstr *New = MI;
1378 if (InstrScore(Orig) < InstrScore(New))
1379 Ite->second = New;
1380 }
1381 }
1382 }
1383
1384 Instr2LastUsesTy LastUses;
1385 for (auto &Entry : LastUseMI)
1386 LastUses[Entry.second].insert(Entry.first);
1387 return LastUses;
1388 }
1389
1390 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1391 // iterations and check the register pressure at the point where all stages
1392 // overlapping.
1393 //
1394 // An example of unrolled loop where #Stage is 4..
1395 // Iter i+0 i+1 i+2 i+3
1396 // ------------------------
1397 // Stage 0
1398 // Stage 1 0
1399 // Stage 2 1 0
1400 // Stage 3 2 1 0 <- All stages overlap
1401 //
1402 std::vector<unsigned>
1403 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1404 Instr2StageTy &Stages,
1405 const unsigned StageCount) const {
1406 using RegSetTy = SmallDenseSet<Register, 16>;
1407
1408 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1409 // manage the liveness of the registers independently by iterations.
1410 SmallVector<RegSetTy> LiveRegSets(StageCount);
1411
1412 auto CurSetPressure = InitSetPressure;
1413 auto MaxSetPressure = InitSetPressure;
1414 auto LastUses = computeLastUses(OrderedInsts, Stages);
1415
1416 LLVM_DEBUG({
1417 dbgs() << "Ordered instructions:\n";
1418 for (MachineInstr *MI : OrderedInsts) {
1419 dbgs() << "Stage " << Stages[MI] << ": ";
1420 MI->dump();
1421 }
1422 });
1423
1424 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1425 Register Reg) {
1426 if (!Reg.isValid() || isReservedRegister(Reg))
1427 return;
1428
1429 bool Inserted = RegSet.insert(Reg).second;
1430 if (!Inserted)
1431 return;
1432
1433 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1434 increaseRegisterPressure(CurSetPressure, Reg);
1435 LLVM_DEBUG(dumpPSet(Reg));
1436 };
1437
1438 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1439 Register Reg) {
1440 if (!Reg.isValid() || isReservedRegister(Reg))
1441 return;
1442
1443 // live-in register
1444 if (!RegSet.contains(Reg))
1445 return;
1446
1447 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1448 RegSet.erase(Reg);
1449 decreaseRegisterPressure(CurSetPressure, Reg);
1450 LLVM_DEBUG(dumpPSet(Reg));
1451 };
1452
1453 for (unsigned I = 0; I < StageCount; I++) {
1454 for (MachineInstr *MI : OrderedInsts) {
1455 const auto Stage = Stages[MI];
1456 if (I < Stage)
1457 continue;
1458
1459 const unsigned Iter = I - Stage;
1460
1461 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1462 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1463
1464 for (auto LastUse : LastUses[MI]) {
1465 if (MI->isPHI()) {
1466 if (Iter != 0)
1467 EraseReg(LiveRegSets[Iter - 1], LastUse);
1468 } else {
1469 EraseReg(LiveRegSets[Iter], LastUse);
1470 }
1471 }
1472
1473 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1474 MaxSetPressure[PSet] =
1475 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1476
1477 LLVM_DEBUG({
1478 dbgs() << "CurSetPressure=";
1479 dumpRegisterPressures(CurSetPressure);
1480 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1481 MI->dump();
1482 });
1483 }
1484 }
1485
1486 return MaxSetPressure;
1487 }
1488
1489public:
1490 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1491 const MachineFunction &MF)
1492 : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1493 TRI(MF.getSubtarget().getRegisterInfo()),
1494 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1495 PressureSetLimit(PSetNum, 0) {}
1496
1497 // Used to calculate register pressure, which is independent of loop
1498 // scheduling.
1499 void init(const RegisterClassInfo &RCI) {
1500 for (MachineInstr &MI : *OrigMBB) {
1501 if (MI.isDebugInstr())
1502 continue;
1503 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1504 }
1505
1506 computeLiveIn();
1507 computePressureSetLimit(RCI);
1508 }
1509
1510 // Calculate the maximum register pressures of the loop and check if they
1511 // exceed the limit
1512 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1513 const unsigned MaxStage) const {
1515 "the percentage of the margin must be between 0 to 100");
1516
1517 OrderedInstsTy OrderedInsts;
1518 Instr2StageTy Stages;
1519 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1520 const auto MaxSetPressure =
1521 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1522
1523 LLVM_DEBUG({
1524 dbgs() << "Dump MaxSetPressure:\n";
1525 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1526 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1527 }
1528 dbgs() << '\n';
1529 });
1530
1531 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1532 unsigned Limit = PressureSetLimit[PSet];
1533 unsigned Margin = Limit * RegPressureMargin / 100;
1534 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1535 << " Margin=" << Margin << "\n");
1536 if (Limit < MaxSetPressure[PSet] + Margin) {
1537 LLVM_DEBUG(
1538 dbgs()
1539 << "Rejected the schedule because of too high register pressure\n");
1540 return true;
1541 }
1542 }
1543 return false;
1544 }
1545};
1546
1547} // end anonymous namespace
1548
1549/// Calculate the resource constrained minimum initiation interval for the
1550/// specified loop. We use the DFA to model the resources needed for
1551/// each instruction, and we ignore dependences. A different DFA is created
1552/// for each cycle that is required. When adding a new instruction, we attempt
1553/// to add it to each existing DFA, until a legal space is found. If the
1554/// instruction cannot be reserved in an existing DFA, we create a new one.
1555unsigned SwingSchedulerDAG::calculateResMII() {
1556 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1558 return RM.calculateResMII();
1559}
1560
1561/// Calculate the recurrence-constrainted minimum initiation interval.
1562/// Iterate over each circuit. Compute the delay(c) and distance(c)
1563/// for each circuit. The II needs to satisfy the inequality
1564/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1565/// II that satisfies the inequality, and the RecMII is the maximum
1566/// of those values.
1567unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1568 unsigned RecMII = 0;
1569
1570 for (NodeSet &Nodes : NodeSets) {
1571 if (Nodes.empty())
1572 continue;
1573
1574 unsigned Delay = Nodes.getLatency();
1575 unsigned Distance = 1;
1576
1577 // ii = ceil(delay / distance)
1578 unsigned CurMII = (Delay + Distance - 1) / Distance;
1579 Nodes.setRecMII(CurMII);
1580 if (CurMII > RecMII)
1581 RecMII = CurMII;
1582 }
1583
1584 return RecMII;
1585}
1586
1587/// Create the adjacency structure of the nodes in the graph.
1588void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1589 SwingSchedulerDAG *DAG) {
1590 BitVector Added(SUnits.size());
1591 DenseMap<int, int> OutputDeps;
1592 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1593 Added.reset();
1594 // Add any successor to the adjacency matrix and exclude duplicates.
1595 for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
1596 // Only create a back-edge on the first and last nodes of a dependence
1597 // chain. This records any chains and adds them later.
1598 if (OE.isOutputDep()) {
1599 int N = OE.getDst()->NodeNum;
1600 int BackEdge = i;
1601 auto Dep = OutputDeps.find(BackEdge);
1602 if (Dep != OutputDeps.end()) {
1603 BackEdge = Dep->second;
1604 OutputDeps.erase(Dep);
1605 }
1606 OutputDeps[N] = BackEdge;
1607 }
1608 // Do not process a boundary node, an artificial node.
1609 if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
1610 continue;
1611
1612 // This code is retained o preserve previous behavior and prevent
1613 // regression. This condition means that anti-dependnecies within an
1614 // iteration are ignored when searching circuits. Therefore it's natural
1615 // to consider this dependence as well.
1616 // FIXME: Remove this code if it doesn't have significant impact on
1617 // performance.
1618 if (OE.isAntiDep())
1619 continue;
1620
1621 int N = OE.getDst()->NodeNum;
1622 if (!Added.test(N)) {
1623 AdjK[i].push_back(N);
1624 Added.set(N);
1625 }
1626 }
1627 // A chain edge between a store and a load is treated as a back-edge in the
1628 // adjacency matrix.
1629 for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
1630 SUnit *Src = IE.getSrc();
1631 SUnit *Dst = IE.getDst();
1632 if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
1633 continue;
1634 if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1635 int N = Src->NodeNum;
1636 if (!Added.test(N)) {
1637 AdjK[i].push_back(N);
1638 Added.set(N);
1639 }
1640 }
1641 }
1642 }
1643 // Add back-edges in the adjacency matrix for the output dependences.
1644 for (auto &OD : OutputDeps)
1645 if (!Added.test(OD.second)) {
1646 AdjK[OD.first].push_back(OD.second);
1647 Added.set(OD.second);
1648 }
1649}
1650
1651/// Identify an elementary circuit in the dependence graph starting at the
1652/// specified node.
1653bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1654 const SwingSchedulerDAG *DAG,
1655 bool HasBackedge) {
1656 SUnit *SV = &SUnits[V];
1657 bool F = false;
1658 Stack.insert(SV);
1659 Blocked.set(V);
1660
1661 for (auto W : AdjK[V]) {
1662 if (NumPaths > MaxPaths)
1663 break;
1664 if (W < S)
1665 continue;
1666 if (W == S) {
1667 if (!HasBackedge)
1668 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
1669 F = true;
1670 ++NumPaths;
1671 break;
1672 }
1673 if (!Blocked.test(W)) {
1674 if (circuit(W, S, NodeSets, DAG,
1675 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1676 F = true;
1677 }
1678 }
1679
1680 if (F)
1681 unblock(V);
1682 else {
1683 for (auto W : AdjK[V]) {
1684 if (W < S)
1685 continue;
1686 B[W].insert(SV);
1687 }
1688 }
1689 Stack.pop_back();
1690 return F;
1691}
1692
1693/// Unblock a node in the circuit finding algorithm.
1694void SwingSchedulerDAG::Circuits::unblock(int U) {
1695 Blocked.reset(U);
1697 while (!BU.empty()) {
1699 assert(SI != BU.end() && "Invalid B set.");
1700 SUnit *W = *SI;
1701 BU.erase(W);
1702 if (Blocked.test(W->NodeNum))
1703 unblock(W->NodeNum);
1704 }
1705}
1706
1707/// Identify all the elementary circuits in the dependence graph using
1708/// Johnson's circuit algorithm.
1709void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1710 Circuits Cir(SUnits, Topo);
1711 // Create the adjacency structure.
1712 Cir.createAdjacencyStructure(this);
1713 for (int I = 0, E = SUnits.size(); I != E; ++I) {
1714 Cir.reset();
1715 Cir.circuit(I, I, NodeSets, this);
1716 }
1717}
1718
1719// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1720// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1721// additional copies that are needed across iterations. An artificial dependence
1722// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1723
1724// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1725// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1726// PHI-------True-Dep------> USEOfPhi
1727
1728// The mutation creates
1729// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1730
1731// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1732// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1733// late to avoid additional copies across iterations. The possible scheduling
1734// order would be
1735// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1736
1737void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1738 for (SUnit &SU : DAG->SUnits) {
1739 // Find the COPY/REG_SEQUENCE instruction.
1740 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1741 continue;
1742
1743 // Record the loop carried PHIs.
1745 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1747
1748 for (auto &Dep : SU.Preds) {
1749 SUnit *TmpSU = Dep.getSUnit();
1750 MachineInstr *TmpMI = TmpSU->getInstr();
1751 SDep::Kind DepKind = Dep.getKind();
1752 // Save the loop carried PHI.
1753 if (DepKind == SDep::Anti && TmpMI->isPHI())
1754 PHISUs.push_back(TmpSU);
1755 // Save the source of COPY/REG_SEQUENCE.
1756 // If the source has no pre-decessors, we will end up creating cycles.
1757 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1758 SrcSUs.push_back(TmpSU);
1759 }
1760
1761 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1762 continue;
1763
1764 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1765 // SUnit to the container.
1767 // Do not use iterator based loop here as we are updating the container.
1768 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1769 for (auto &Dep : PHISUs[Index]->Succs) {
1770 if (Dep.getKind() != SDep::Data)
1771 continue;
1772
1773 SUnit *TmpSU = Dep.getSUnit();
1774 MachineInstr *TmpMI = TmpSU->getInstr();
1775 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1776 PHISUs.push_back(TmpSU);
1777 continue;
1778 }
1779 UseSUs.push_back(TmpSU);
1780 }
1781 }
1782
1783 if (UseSUs.size() == 0)
1784 continue;
1785
1786 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1787 // Add the artificial dependencies if it does not form a cycle.
1788 for (auto *I : UseSUs) {
1789 for (auto *Src : SrcSUs) {
1790 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1791 Src->addPred(SDep(I, SDep::Artificial));
1792 SDAG->Topo.AddPred(Src, I);
1793 }
1794 }
1795 }
1796 }
1797}
1798
1799/// Compute several functions need to order the nodes for scheduling.
1800/// ASAP - Earliest time to schedule a node.
1801/// ALAP - Latest time to schedule a node.
1802/// MOV - Mobility function, difference between ALAP and ASAP.
1803/// D - Depth of each node.
1804/// H - Height of each node.
1805void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1806 ScheduleInfo.resize(SUnits.size());
1807
1808 LLVM_DEBUG({
1809 for (int I : Topo) {
1810 const SUnit &SU = SUnits[I];
1811 dumpNode(SU);
1812 }
1813 });
1814
1815 int maxASAP = 0;
1816 // Compute ASAP and ZeroLatencyDepth.
1817 for (int I : Topo) {
1818 int asap = 0;
1819 int zeroLatencyDepth = 0;
1820 SUnit *SU = &SUnits[I];
1821 for (const auto &IE : DDG->getInEdges(SU)) {
1822 SUnit *Pred = IE.getSrc();
1823 if (IE.getLatency() == 0)
1824 zeroLatencyDepth =
1825 std::max(zeroLatencyDepth, getZeroLatencyDepth(Pred) + 1);
1826 if (IE.ignoreDependence(true))
1827 continue;
1828 asap = std::max(asap, (int)(getASAP(Pred) + IE.getLatency() -
1829 IE.getDistance() * MII));
1830 }
1831 maxASAP = std::max(maxASAP, asap);
1832 ScheduleInfo[I].ASAP = asap;
1833 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1834 }
1835
1836 // Compute ALAP, ZeroLatencyHeight, and MOV.
1837 for (int I : llvm::reverse(Topo)) {
1838 int alap = maxASAP;
1839 int zeroLatencyHeight = 0;
1840 SUnit *SU = &SUnits[I];
1841 for (const auto &OE : DDG->getOutEdges(SU)) {
1842 SUnit *Succ = OE.getDst();
1843 if (Succ->isBoundaryNode())
1844 continue;
1845 if (OE.getLatency() == 0)
1846 zeroLatencyHeight =
1847 std::max(zeroLatencyHeight, getZeroLatencyHeight(Succ) + 1);
1848 if (OE.ignoreDependence(true))
1849 continue;
1850 alap = std::min(alap, (int)(getALAP(Succ) - OE.getLatency() +
1851 OE.getDistance() * MII));
1852 }
1853
1854 ScheduleInfo[I].ALAP = alap;
1855 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1856 }
1857
1858 // After computing the node functions, compute the summary for each node set.
1859 for (NodeSet &I : NodeSets)
1860 I.computeNodeSetInfo(this);
1861
1862 LLVM_DEBUG({
1863 for (unsigned i = 0; i < SUnits.size(); i++) {
1864 dbgs() << "\tNode " << i << ":\n";
1865 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1866 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1867 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1868 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1869 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1870 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1871 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1872 }
1873 });
1874}
1875
1876/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1877/// as the predecessors of the elements of NodeOrder that are not also in
1878/// NodeOrder.
1881 const NodeSet *S = nullptr) {
1882 Preds.clear();
1883
1884 for (SUnit *SU : NodeOrder) {
1885 for (const auto &IE : DDG->getInEdges(SU)) {
1886 SUnit *PredSU = IE.getSrc();
1887 if (S && S->count(PredSU) == 0)
1888 continue;
1889 if (IE.ignoreDependence(true))
1890 continue;
1891 if (NodeOrder.count(PredSU) == 0)
1892 Preds.insert(PredSU);
1893 }
1894
1895 // FIXME: The following loop-carried dependencies may also need to be
1896 // considered.
1897 // - Physical register dependencies (true-dependence and WAW).
1898 // - Memory dependencies.
1899 for (const auto &OE : DDG->getOutEdges(SU)) {
1900 SUnit *SuccSU = OE.getDst();
1901 if (!OE.isAntiDep())
1902 continue;
1903 if (S && S->count(SuccSU) == 0)
1904 continue;
1905 if (NodeOrder.count(SuccSU) == 0)
1906 Preds.insert(SuccSU);
1907 }
1908 }
1909 return !Preds.empty();
1910}
1911
1912/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1913/// as the successors of the elements of NodeOrder that are not also in
1914/// NodeOrder.
1917 const NodeSet *S = nullptr) {
1918 Succs.clear();
1919
1920 for (SUnit *SU : NodeOrder) {
1921 for (const auto &OE : DDG->getOutEdges(SU)) {
1922 SUnit *SuccSU = OE.getDst();
1923 if (S && S->count(SuccSU) == 0)
1924 continue;
1925 if (OE.ignoreDependence(false))
1926 continue;
1927 if (NodeOrder.count(SuccSU) == 0)
1928 Succs.insert(SuccSU);
1929 }
1930
1931 // FIXME: The following loop-carried dependencies may also need to be
1932 // considered.
1933 // - Physical register dependnecies (true-dependnece and WAW).
1934 // - Memory dependencies.
1935 for (const auto &IE : DDG->getInEdges(SU)) {
1936 SUnit *PredSU = IE.getSrc();
1937 if (!IE.isAntiDep())
1938 continue;
1939 if (S && S->count(PredSU) == 0)
1940 continue;
1941 if (NodeOrder.count(PredSU) == 0)
1942 Succs.insert(PredSU);
1943 }
1944 }
1945 return !Succs.empty();
1946}
1947
1948/// Return true if there is a path from the specified node to any of the nodes
1949/// in DestNodes. Keep track and return the nodes in any path.
1950static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1951 SetVector<SUnit *> &DestNodes,
1952 SetVector<SUnit *> &Exclude,
1953 SmallPtrSet<SUnit *, 8> &Visited,
1954 SwingSchedulerDDG *DDG) {
1955 if (Cur->isBoundaryNode())
1956 return false;
1957 if (Exclude.contains(Cur))
1958 return false;
1959 if (DestNodes.contains(Cur))
1960 return true;
1961 if (!Visited.insert(Cur).second)
1962 return Path.contains(Cur);
1963 bool FoundPath = false;
1964 for (const auto &OE : DDG->getOutEdges(Cur))
1965 if (!OE.ignoreDependence(false))
1966 FoundPath |=
1967 computePath(OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
1968 for (const auto &IE : DDG->getInEdges(Cur))
1969 if (IE.isAntiDep() && IE.getDistance() == 0)
1970 FoundPath |=
1971 computePath(IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
1972 if (FoundPath)
1973 Path.insert(Cur);
1974 return FoundPath;
1975}
1976
1977/// Compute the live-out registers for the instructions in a node-set.
1978/// The live-out registers are those that are defined in the node-set,
1979/// but not used. Except for use operands of Phis.
1981 NodeSet &NS) {
1986 for (SUnit *SU : NS) {
1987 const MachineInstr *MI = SU->getInstr();
1988 if (MI->isPHI())
1989 continue;
1990 for (const MachineOperand &MO : MI->all_uses()) {
1991 Register Reg = MO.getReg();
1992 if (Reg.isVirtual())
1993 Uses.insert(Reg);
1994 else if (MRI.isAllocatable(Reg))
1995 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1996 Uses.insert(Unit);
1997 }
1998 }
1999 for (SUnit *SU : NS)
2000 for (const MachineOperand &MO : SU->getInstr()->all_defs())
2001 if (!MO.isDead()) {
2002 Register Reg = MO.getReg();
2003 if (Reg.isVirtual()) {
2004 if (!Uses.count(Reg))
2005 LiveOutRegs.push_back(RegisterMaskPair(Reg,
2007 } else if (MRI.isAllocatable(Reg)) {
2008 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2009 if (!Uses.count(Unit))
2010 LiveOutRegs.push_back(
2012 }
2013 }
2014 RPTracker.addLiveRegs(LiveOutRegs);
2015}
2016
2017/// A heuristic to filter nodes in recurrent node-sets if the register
2018/// pressure of a set is too high.
2019void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2020 for (auto &NS : NodeSets) {
2021 // Skip small node-sets since they won't cause register pressure problems.
2022 if (NS.size() <= 2)
2023 continue;
2024 IntervalPressure RecRegPressure;
2025 RegPressureTracker RecRPTracker(RecRegPressure);
2026 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2027 computeLiveOuts(MF, RecRPTracker, NS);
2028 RecRPTracker.closeBottom();
2029
2030 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2031 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2032 return A->NodeNum > B->NodeNum;
2033 });
2034
2035 for (auto &SU : SUnits) {
2036 // Since we're computing the register pressure for a subset of the
2037 // instructions in a block, we need to set the tracker for each
2038 // instruction in the node-set. The tracker is set to the instruction
2039 // just after the one we're interested in.
2041 RecRPTracker.setPos(std::next(CurInstI));
2042
2043 RegPressureDelta RPDelta;
2044 ArrayRef<PressureChange> CriticalPSets;
2045 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2046 CriticalPSets,
2047 RecRegPressure.MaxSetPressure);
2048 if (RPDelta.Excess.isValid()) {
2049 LLVM_DEBUG(
2050 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2052 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2053 NS.setExceedPressure(SU);
2054 break;
2055 }
2056 RecRPTracker.recede();
2057 }
2058 }
2059}
2060
2061/// A heuristic to colocate node sets that have the same set of
2062/// successors.
2063void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2064 unsigned Colocate = 0;
2065 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2066 NodeSet &N1 = NodeSets[i];
2068 if (N1.empty() || !succ_L(N1, S1, DDG.get()))
2069 continue;
2070 for (int j = i + 1; j < e; ++j) {
2071 NodeSet &N2 = NodeSets[j];
2072 if (N1.compareRecMII(N2) != 0)
2073 continue;
2075 if (N2.empty() || !succ_L(N2, S2, DDG.get()))
2076 continue;
2077 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2078 N1.setColocate(++Colocate);
2079 N2.setColocate(Colocate);
2080 break;
2081 }
2082 }
2083 }
2084}
2085
2086/// Check if the existing node-sets are profitable. If not, then ignore the
2087/// recurrent node-sets, and attempt to schedule all nodes together. This is
2088/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2089/// then it's best to try to schedule all instructions together instead of
2090/// starting with the recurrent node-sets.
2091void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2092 // Look for loops with a large MII.
2093 if (MII < 17)
2094 return;
2095 // Check if the node-set contains only a simple add recurrence.
2096 for (auto &NS : NodeSets) {
2097 if (NS.getRecMII() > 2)
2098 return;
2099 if (NS.getMaxDepth() > MII)
2100 return;
2101 }
2102 NodeSets.clear();
2103 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2104}
2105
2106/// Add the nodes that do not belong to a recurrence set into groups
2107/// based upon connected components.
2108void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2109 SetVector<SUnit *> NodesAdded;
2111 // Add the nodes that are on a path between the previous node sets and
2112 // the current node set.
2113 for (NodeSet &I : NodeSets) {
2115 // Add the nodes from the current node set to the previous node set.
2116 if (succ_L(I, N, DDG.get())) {
2118 for (SUnit *NI : N) {
2119 Visited.clear();
2120 computePath(NI, Path, NodesAdded, I, Visited, DDG.get());
2121 }
2122 if (!Path.empty())
2123 I.insert(Path.begin(), Path.end());
2124 }
2125 // Add the nodes from the previous node set to the current node set.
2126 N.clear();
2127 if (succ_L(NodesAdded, N, DDG.get())) {
2129 for (SUnit *NI : N) {
2130 Visited.clear();
2131 computePath(NI, Path, I, NodesAdded, Visited, DDG.get());
2132 }
2133 if (!Path.empty())
2134 I.insert(Path.begin(), Path.end());
2135 }
2136 NodesAdded.insert(I.begin(), I.end());
2137 }
2138
2139 // Create a new node set with the connected nodes of any successor of a node
2140 // in a recurrent set.
2141 NodeSet NewSet;
2143 if (succ_L(NodesAdded, N, DDG.get()))
2144 for (SUnit *I : N)
2145 addConnectedNodes(I, NewSet, NodesAdded);
2146 if (!NewSet.empty())
2147 NodeSets.push_back(NewSet);
2148
2149 // Create a new node set with the connected nodes of any predecessor of a node
2150 // in a recurrent set.
2151 NewSet.clear();
2152 if (pred_L(NodesAdded, N, DDG.get()))
2153 for (SUnit *I : N)
2154 addConnectedNodes(I, NewSet, NodesAdded);
2155 if (!NewSet.empty())
2156 NodeSets.push_back(NewSet);
2157
2158 // Create new nodes sets with the connected nodes any remaining node that
2159 // has no predecessor.
2160 for (SUnit &SU : SUnits) {
2161 if (NodesAdded.count(&SU) == 0) {
2162 NewSet.clear();
2163 addConnectedNodes(&SU, NewSet, NodesAdded);
2164 if (!NewSet.empty())
2165 NodeSets.push_back(NewSet);
2166 }
2167 }
2168}
2169
2170/// Add the node to the set, and add all of its connected nodes to the set.
2171void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2172 SetVector<SUnit *> &NodesAdded) {
2173 NewSet.insert(SU);
2174 NodesAdded.insert(SU);
2175 for (auto &OE : DDG->getOutEdges(SU)) {
2176 SUnit *Successor = OE.getDst();
2177 if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2178 NodesAdded.count(Successor) == 0)
2179 addConnectedNodes(Successor, NewSet, NodesAdded);
2180 }
2181 for (auto &IE : DDG->getInEdges(SU)) {
2182 SUnit *Predecessor = IE.getSrc();
2183 if (!IE.isArtificial() && NodesAdded.count(Predecessor) == 0)
2184 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2185 }
2186}
2187
2188/// Return true if Set1 contains elements in Set2. The elements in common
2189/// are returned in a different container.
2190static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2192 Result.clear();
2193 for (SUnit *SU : Set1) {
2194 if (Set2.count(SU) != 0)
2195 Result.insert(SU);
2196 }
2197 return !Result.empty();
2198}
2199
2200/// Merge the recurrence node sets that have the same initial node.
2201void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2202 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2203 ++I) {
2204 NodeSet &NI = *I;
2205 for (NodeSetType::iterator J = I + 1; J != E;) {
2206 NodeSet &NJ = *J;
2207 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2208 if (NJ.compareRecMII(NI) > 0)
2209 NI.setRecMII(NJ.getRecMII());
2210 for (SUnit *SU : *J)
2211 I->insert(SU);
2212 NodeSets.erase(J);
2213 E = NodeSets.end();
2214 } else {
2215 ++J;
2216 }
2217 }
2218 }
2219}
2220
2221/// Remove nodes that have been scheduled in previous NodeSets.
2222void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2223 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2224 ++I)
2225 for (NodeSetType::iterator J = I + 1; J != E;) {
2226 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2227
2228 if (J->empty()) {
2229 NodeSets.erase(J);
2230 E = NodeSets.end();
2231 } else {
2232 ++J;
2233 }
2234 }
2235}
2236
2237/// Compute an ordered list of the dependence graph nodes, which
2238/// indicates the order that the nodes will be scheduled. This is a
2239/// two-level algorithm. First, a partial order is created, which
2240/// consists of a list of sets ordered from highest to lowest priority.
2241void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2243 NodeOrder.clear();
2244
2245 for (auto &Nodes : NodeSets) {
2246 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2247 OrderKind Order;
2249 if (pred_L(NodeOrder, N, DDG.get()) && llvm::set_is_subset(N, Nodes)) {
2250 R.insert(N.begin(), N.end());
2251 Order = BottomUp;
2252 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2253 } else if (succ_L(NodeOrder, N, DDG.get()) &&
2254 llvm::set_is_subset(N, Nodes)) {
2255 R.insert(N.begin(), N.end());
2256 Order = TopDown;
2257 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2258 } else if (isIntersect(N, Nodes, R)) {
2259 // If some of the successors are in the existing node-set, then use the
2260 // top-down ordering.
2261 Order = TopDown;
2262 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2263 } else if (NodeSets.size() == 1) {
2264 for (const auto &N : Nodes)
2265 if (N->Succs.size() == 0)
2266 R.insert(N);
2267 Order = BottomUp;
2268 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2269 } else {
2270 // Find the node with the highest ASAP.
2271 SUnit *maxASAP = nullptr;
2272 for (SUnit *SU : Nodes) {
2273 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2274 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2275 maxASAP = SU;
2276 }
2277 R.insert(maxASAP);
2278 Order = BottomUp;
2279 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2280 }
2281
2282 while (!R.empty()) {
2283 if (Order == TopDown) {
2284 // Choose the node with the maximum height. If more than one, choose
2285 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2286 // choose the node with the lowest MOV.
2287 while (!R.empty()) {
2288 SUnit *maxHeight = nullptr;
2289 for (SUnit *I : R) {
2290 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2291 maxHeight = I;
2292 else if (getHeight(I) == getHeight(maxHeight) &&
2294 maxHeight = I;
2295 else if (getHeight(I) == getHeight(maxHeight) &&
2297 getZeroLatencyHeight(maxHeight) &&
2298 getMOV(I) < getMOV(maxHeight))
2299 maxHeight = I;
2300 }
2301 NodeOrder.insert(maxHeight);
2302 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2303 R.remove(maxHeight);
2304 for (const auto &OE : DDG->getOutEdges(maxHeight)) {
2305 SUnit *SU = OE.getDst();
2306 if (Nodes.count(SU) == 0)
2307 continue;
2308 if (NodeOrder.contains(SU))
2309 continue;
2310 if (OE.ignoreDependence(false))
2311 continue;
2312 R.insert(SU);
2313 }
2314
2315 // FIXME: The following loop-carried dependencies may also need to be
2316 // considered.
2317 // - Physical register dependnecies (true-dependnece and WAW).
2318 // - Memory dependencies.
2319 for (const auto &IE : DDG->getInEdges(maxHeight)) {
2320 SUnit *SU = IE.getSrc();
2321 if (!IE.isAntiDep())
2322 continue;
2323 if (Nodes.count(SU) == 0)
2324 continue;
2325 if (NodeOrder.contains(SU))
2326 continue;
2327 R.insert(SU);
2328 }
2329 }
2330 Order = BottomUp;
2331 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2333 if (pred_L(NodeOrder, N, DDG.get(), &Nodes))
2334 R.insert(N.begin(), N.end());
2335 } else {
2336 // Choose the node with the maximum depth. If more than one, choose
2337 // the node with the maximum ZeroLatencyDepth. If still more than one,
2338 // choose the node with the lowest MOV.
2339 while (!R.empty()) {
2340 SUnit *maxDepth = nullptr;
2341 for (SUnit *I : R) {
2342 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2343 maxDepth = I;
2344 else if (getDepth(I) == getDepth(maxDepth) &&
2346 maxDepth = I;
2347 else if (getDepth(I) == getDepth(maxDepth) &&
2349 getMOV(I) < getMOV(maxDepth))
2350 maxDepth = I;
2351 }
2352 NodeOrder.insert(maxDepth);
2353 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2354 R.remove(maxDepth);
2355 if (Nodes.isExceedSU(maxDepth)) {
2356 Order = TopDown;
2357 R.clear();
2358 R.insert(Nodes.getNode(0));
2359 break;
2360 }
2361 for (const auto &IE : DDG->getInEdges(maxDepth)) {
2362 SUnit *SU = IE.getSrc();
2363 if (Nodes.count(SU) == 0)
2364 continue;
2365 if (NodeOrder.contains(SU))
2366 continue;
2367 R.insert(SU);
2368 }
2369
2370 // FIXME: The following loop-carried dependencies may also need to be
2371 // considered.
2372 // - Physical register dependnecies (true-dependnece and WAW).
2373 // - Memory dependencies.
2374 for (const auto &OE : DDG->getOutEdges(maxDepth)) {
2375 SUnit *SU = OE.getDst();
2376 if (!OE.isAntiDep())
2377 continue;
2378 if (Nodes.count(SU) == 0)
2379 continue;
2380 if (NodeOrder.contains(SU))
2381 continue;
2382 R.insert(SU);
2383 }
2384 }
2385 Order = TopDown;
2386 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2388 if (succ_L(NodeOrder, N, DDG.get(), &Nodes))
2389 R.insert(N.begin(), N.end());
2390 }
2391 }
2392 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2393 }
2394
2395 LLVM_DEBUG({
2396 dbgs() << "Node order: ";
2397 for (SUnit *I : NodeOrder)
2398 dbgs() << " " << I->NodeNum << " ";
2399 dbgs() << "\n";
2400 });
2401}
2402
2403/// Process the nodes in the computed order and create the pipelined schedule
2404/// of the instructions, if possible. Return true if a schedule is found.
2405bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2406
2407 if (NodeOrder.empty()){
2408 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2409 return false;
2410 }
2411
2412 bool scheduleFound = false;
2413 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2414 if (LimitRegPressure) {
2415 HRPDetector =
2416 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2417 HRPDetector->init(RegClassInfo);
2418 }
2419 // Keep increasing II until a valid schedule is found.
2420 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2421 Schedule.reset();
2422 Schedule.setInitiationInterval(II);
2423 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2424
2425 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2426 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2427 do {
2428 SUnit *SU = *NI;
2429
2430 // Compute the schedule time for the instruction, which is based
2431 // upon the scheduled time for any predecessors/successors.
2432 int EarlyStart = INT_MIN;
2433 int LateStart = INT_MAX;
2434 Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
2435 LLVM_DEBUG({
2436 dbgs() << "\n";
2437 dbgs() << "Inst (" << SU->NodeNum << ") ";
2438 SU->getInstr()->dump();
2439 dbgs() << "\n";
2440 });
2441 LLVM_DEBUG(
2442 dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2443
2444 if (EarlyStart > LateStart)
2445 scheduleFound = false;
2446 else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2447 scheduleFound =
2448 Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
2449 else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2450 scheduleFound =
2451 Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
2452 else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2453 LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
2454 // When scheduling a Phi it is better to start at the late cycle and
2455 // go backwards. The default order may insert the Phi too far away
2456 // from its first dependence.
2457 // Also, do backward search when all scheduled predecessors are
2458 // loop-carried output/order dependencies. Empirically, there are also
2459 // cases where scheduling becomes possible with backward search.
2460 if (SU->getInstr()->isPHI() ||
2461 Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this->getDDG()))
2462 scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
2463 else
2464 scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
2465 } else {
2466 int FirstCycle = Schedule.getFirstCycle();
2467 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2468 FirstCycle + getASAP(SU) + II - 1, II);
2469 }
2470
2471 // Even if we find a schedule, make sure the schedule doesn't exceed the
2472 // allowable number of stages. We keep trying if this happens.
2473 if (scheduleFound)
2474 if (SwpMaxStages > -1 &&
2475 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2476 scheduleFound = false;
2477
2478 LLVM_DEBUG({
2479 if (!scheduleFound)
2480 dbgs() << "\tCan't schedule\n";
2481 });
2482 } while (++NI != NE && scheduleFound);
2483
2484 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2485 if (scheduleFound)
2486 scheduleFound =
2487 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2488
2489 // If a schedule is found, check if it is a valid schedule too.
2490 if (scheduleFound)
2491 scheduleFound = Schedule.isValidSchedule(this);
2492
2493 // If a schedule was found and the option is enabled, check if the schedule
2494 // might generate additional register spills/fills.
2495 if (scheduleFound && LimitRegPressure)
2496 scheduleFound =
2497 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2498 }
2499
2500 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2501 << " (II=" << Schedule.getInitiationInterval()
2502 << ")\n");
2503
2504 if (scheduleFound) {
2505 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2506 if (!scheduleFound)
2507 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2508 }
2509
2510 if (scheduleFound) {
2511 Schedule.finalizeSchedule(this);
2512 Pass.ORE->emit([&]() {
2514 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2515 << "Schedule found with Initiation Interval: "
2516 << ore::NV("II", Schedule.getInitiationInterval())
2517 << ", MaxStageCount: "
2518 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2519 });
2520 } else
2521 Schedule.reset();
2522
2523 return scheduleFound && Schedule.getMaxStageCount() > 0;
2524}
2525
2526/// Return true if we can compute the amount the instruction changes
2527/// during each iteration. Set Delta to the amount of the change.
2528bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
2530 const MachineOperand *BaseOp;
2531 int64_t Offset;
2532 bool OffsetIsScalable;
2533 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2534 return false;
2535
2536 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2537 if (OffsetIsScalable)
2538 return false;
2539
2540 if (!BaseOp->isReg())
2541 return false;
2542
2543 Register BaseReg = BaseOp->getReg();
2544
2546 // Check if there is a Phi. If so, get the definition in the loop.
2547 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2548 if (BaseDef && BaseDef->isPHI()) {
2549 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2550 BaseDef = MRI.getVRegDef(BaseReg);
2551 }
2552 if (!BaseDef)
2553 return false;
2554
2555 int D = 0;
2556 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2557 return false;
2558
2559 Delta = D;
2560 return true;
2561}
2562
2563/// Check if we can change the instruction to use an offset value from the
2564/// previous iteration. If so, return true and set the base and offset values
2565/// so that we can rewrite the load, if necessary.
2566/// v1 = Phi(v0, v3)
2567/// v2 = load v1, 0
2568/// v3 = post_store v1, 4, x
2569/// This function enables the load to be rewritten as v2 = load v3, 4.
2570bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2571 unsigned &BasePos,
2572 unsigned &OffsetPos,
2573 unsigned &NewBase,
2574 int64_t &Offset) {
2575 // Get the load instruction.
2576 if (TII->isPostIncrement(*MI))
2577 return false;
2578 unsigned BasePosLd, OffsetPosLd;
2579 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2580 return false;
2581 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2582
2583 // Look for the Phi instruction.
2584 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2585 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2586 if (!Phi || !Phi->isPHI())
2587 return false;
2588 // Get the register defined in the loop block.
2589 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2590 if (!PrevReg)
2591 return false;
2592
2593 // Check for the post-increment load/store instruction.
2594 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2595 if (!PrevDef || PrevDef == MI)
2596 return false;
2597
2598 if (!TII->isPostIncrement(*PrevDef))
2599 return false;
2600
2601 unsigned BasePos1 = 0, OffsetPos1 = 0;
2602 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2603 return false;
2604
2605 // Make sure that the instructions do not access the same memory location in
2606 // the next iteration.
2607 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2608 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2610 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2611 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2612 MF.deleteMachineInstr(NewMI);
2613 if (!Disjoint)
2614 return false;
2615
2616 // Set the return value once we determine that we return true.
2617 BasePos = BasePosLd;
2618 OffsetPos = OffsetPosLd;
2619 NewBase = PrevReg;
2620 Offset = StoreOffset;
2621 return true;
2622}
2623
2624/// Apply changes to the instruction if needed. The changes are need
2625/// to improve the scheduling and depend up on the final schedule.
2627 SMSchedule &Schedule) {
2628 SUnit *SU = getSUnit(MI);
2630 InstrChanges.find(SU);
2631 if (It != InstrChanges.end()) {
2632 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2633 unsigned BasePos, OffsetPos;
2634 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2635 return;
2636 Register BaseReg = MI->getOperand(BasePos).getReg();
2637 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2638 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2639 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2640 int BaseStageNum = Schedule.stageScheduled(SU);
2641 int BaseCycleNum = Schedule.cycleScheduled(SU);
2642 if (BaseStageNum < DefStageNum) {
2644 int OffsetDiff = DefStageNum - BaseStageNum;
2645 if (DefCycleNum < BaseCycleNum) {
2646 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2647 if (OffsetDiff > 0)
2648 --OffsetDiff;
2649 }
2650 int64_t NewOffset =
2651 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2652 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2653 SU->setInstr(NewMI);
2654 MISUnitMap[NewMI] = SU;
2655 NewMIs[MI] = NewMI;
2656 }
2657 }
2658}
2659
2660/// Return the instruction in the loop that defines the register.
2661/// If the definition is a Phi, then follow the Phi operand to
2662/// the instruction in the loop.
2663MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2665 MachineInstr *Def = MRI.getVRegDef(Reg);
2666 while (Def->isPHI()) {
2667 if (!Visited.insert(Def).second)
2668 break;
2669 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2670 if (Def->getOperand(i + 1).getMBB() == BB) {
2671 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2672 break;
2673 }
2674 }
2675 return Def;
2676}
2677
2678/// Return true for an order or output dependence that is loop carried
2679/// potentially. A dependence is loop carried if the destination defines a value
2680/// that may be used or defined by the source in a subsequent iteration.
2682 const SwingSchedulerDDGEdge &Edge) const {
2683 if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
2684 Edge.getDst()->isBoundaryNode())
2685 return false;
2686
2688 return true;
2689
2690 if (Edge.isOutputDep())
2691 return true;
2692
2693 MachineInstr *SI = Edge.getSrc()->getInstr();
2694 MachineInstr *DI = Edge.getDst()->getInstr();
2695 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2696
2697 // Assume ordered loads and stores may have a loop carried dependence.
2698 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2699 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2700 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2701 return true;
2702
2703 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2704 return false;
2705
2706 // The conservative assumption is that a dependence between memory operations
2707 // may be loop carried. The following code checks when it can be proved that
2708 // there is no loop carried dependence.
2709 unsigned DeltaS, DeltaD;
2710 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2711 return true;
2712
2713 const MachineOperand *BaseOpS, *BaseOpD;
2714 int64_t OffsetS, OffsetD;
2715 bool OffsetSIsScalable, OffsetDIsScalable;
2717 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2718 TRI) ||
2719 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2720 TRI))
2721 return true;
2722
2723 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2724 "Expected offsets to be byte offsets");
2725
2726 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2727 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2728 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2729 return true;
2730
2731 unsigned InitValS = 0;
2732 unsigned LoopValS = 0;
2733 unsigned InitValD = 0;
2734 unsigned LoopValD = 0;
2735 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2736 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2737 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2738 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2739
2740 if (!InitDefS->isIdenticalTo(*InitDefD))
2741 return true;
2742
2743 // Check that the base register is incremented by a constant value for each
2744 // iteration.
2745 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2746 int D = 0;
2747 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2748 return true;
2749
2750 LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2751 LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2752
2753 // This is the main test, which checks the offset values and the loop
2754 // increment value to determine if the accesses may be loop carried.
2755 if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2756 return true;
2757
2758 if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2759 DeltaD < AccessSizeD.getValue())
2760 return true;
2761
2762 return (OffsetS + (int64_t)AccessSizeS.getValue() <
2763 OffsetD + (int64_t)AccessSizeD.getValue());
2764}
2765
2766void SwingSchedulerDAG::postProcessDAG() {
2767 for (auto &M : Mutations)
2768 M->apply(this);
2769}
2770
2771/// Try to schedule the node at the specified StartCycle and continue
2772/// until the node is schedule or the EndCycle is reached. This function
2773/// returns true if the node is scheduled. This routine may search either
2774/// forward or backward for a place to insert the instruction based upon
2775/// the relative values of StartCycle and EndCycle.
2776bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2777 bool forward = true;
2778 LLVM_DEBUG({
2779 dbgs() << "Trying to insert node between " << StartCycle << " and "
2780 << EndCycle << " II: " << II << "\n";
2781 });
2782 if (StartCycle > EndCycle)
2783 forward = false;
2784
2785 // The terminating condition depends on the direction.
2786 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2787 for (int curCycle = StartCycle; curCycle != termCycle;
2788 forward ? ++curCycle : --curCycle) {
2789
2790 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2791 ProcItinResources.canReserveResources(*SU, curCycle)) {
2792 LLVM_DEBUG({
2793 dbgs() << "\tinsert at cycle " << curCycle << " ";
2794 SU->getInstr()->dump();
2795 });
2796
2797 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2798 ProcItinResources.reserveResources(*SU, curCycle);
2799 ScheduledInstrs[curCycle].push_back(SU);
2800 InstrToCycle.insert(std::make_pair(SU, curCycle));
2801 if (curCycle > LastCycle)
2802 LastCycle = curCycle;
2803 if (curCycle < FirstCycle)
2804 FirstCycle = curCycle;
2805 return true;
2806 }
2807 LLVM_DEBUG({
2808 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2809 SU->getInstr()->dump();
2810 });
2811 }
2812 return false;
2813}
2814
2815// Return the cycle of the earliest scheduled instruction in the chain.
2817 const SwingSchedulerDDG *DDG) {
2820 Worklist.push_back(Dep);
2821 int EarlyCycle = INT_MAX;
2822 while (!Worklist.empty()) {
2823 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
2824 SUnit *PrevSU = Cur.getSrc();
2825 if (Visited.count(PrevSU))
2826 continue;
2827 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2828 if (it == InstrToCycle.end())
2829 continue;
2830 EarlyCycle = std::min(EarlyCycle, it->second);
2831 for (const auto &IE : DDG->getInEdges(PrevSU))
2832 if (IE.isOrderDep() || IE.isOutputDep())
2833 Worklist.push_back(IE);
2834 Visited.insert(PrevSU);
2835 }
2836 return EarlyCycle;
2837}
2838
2839// Return the cycle of the latest scheduled instruction in the chain.
2841 const SwingSchedulerDDG *DDG) {
2844 Worklist.push_back(Dep);
2845 int LateCycle = INT_MIN;
2846 while (!Worklist.empty()) {
2847 const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
2848 SUnit *SuccSU = Cur.getDst();
2849 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2850 continue;
2851 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2852 if (it == InstrToCycle.end())
2853 continue;
2854 LateCycle = std::max(LateCycle, it->second);
2855 for (const auto &OE : DDG->getOutEdges(SuccSU))
2856 if (OE.isOrderDep() || OE.isOutputDep())
2857 Worklist.push_back(OE);
2858 Visited.insert(SuccSU);
2859 }
2860 return LateCycle;
2861}
2862
2863/// If an instruction has a use that spans multiple iterations, then
2864/// return true. These instructions are characterized by having a back-ege
2865/// to a Phi, which contains a reference to another Phi.
2867 for (auto &P : SU->Preds)
2868 if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
2869 for (auto &S : P.getSUnit()->Succs)
2870 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2871 return P.getSUnit();
2872 return nullptr;
2873}
2874
2875/// Compute the scheduling start slot for the instruction. The start slot
2876/// depends on any predecessor or successor nodes scheduled already.
2877void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2878 int II, SwingSchedulerDAG *DAG) {
2879 const SwingSchedulerDDG *DDG = DAG->getDDG();
2880
2881 // Iterate over each instruction that has been scheduled already. The start
2882 // slot computation depends on whether the previously scheduled instruction
2883 // is a predecessor or successor of the specified instruction.
2884 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2885 for (SUnit *I : getInstructions(cycle)) {
2886 for (const auto &IE : DDG->getInEdges(SU)) {
2887 if (IE.getSrc() == I) {
2888 // FIXME: Add reverse edge to `DDG` instead of calling
2889 // `isLoopCarriedDep`
2890 if (DAG->isLoopCarriedDep(IE)) {
2891 int End = earliestCycleInChain(IE, DDG) + (II - 1);
2892 *MinLateStart = std::min(*MinLateStart, End);
2893 }
2894 int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
2895 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2896 }
2897 }
2898
2899 for (const auto &OE : DDG->getOutEdges(SU)) {
2900 if (OE.getDst() == I) {
2901 // FIXME: Add reverse edge to `DDG` instead of calling
2902 // `isLoopCarriedDep`
2903 if (DAG->isLoopCarriedDep(OE)) {
2904 int Start = latestCycleInChain(OE, DDG) + 1 - II;
2905 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
2906 }
2907 int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
2908 *MinLateStart = std::min(*MinLateStart, LateStart);
2909 }
2910 }
2911
2912 SUnit *BE = multipleIterations(I, DAG);
2913 for (const auto &Dep : SU->Preds) {
2914 // For instruction that requires multiple iterations, make sure that
2915 // the dependent instruction is not scheduled past the definition.
2916 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2917 !SU->isPred(I))
2918 *MinLateStart = std::min(*MinLateStart, cycle);
2919 }
2920 }
2921 }
2922}
2923
2924/// Order the instructions within a cycle so that the definitions occur
2925/// before the uses. Returns true if the instruction is added to the start
2926/// of the list, or false if added to the end.
2928 std::deque<SUnit *> &Insts) const {
2929 MachineInstr *MI = SU->getInstr();
2930 bool OrderBeforeUse = false;
2931 bool OrderAfterDef = false;
2932 bool OrderBeforeDef = false;
2933 unsigned MoveDef = 0;
2934 unsigned MoveUse = 0;
2935 int StageInst1 = stageScheduled(SU);
2936 const SwingSchedulerDDG *DDG = SSD->getDDG();
2937
2938 unsigned Pos = 0;
2939 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2940 ++I, ++Pos) {
2941 for (MachineOperand &MO : MI->operands()) {
2942 if (!MO.isReg() || !MO.getReg().isVirtual())
2943 continue;
2944
2945 Register Reg = MO.getReg();
2946 unsigned BasePos, OffsetPos;
2947 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2948 if (MI->getOperand(BasePos).getReg() == Reg)
2949 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2950 Reg = NewReg;
2951 bool Reads, Writes;
2952 std::tie(Reads, Writes) =
2953 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2954 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2955 OrderBeforeUse = true;
2956 if (MoveUse == 0)
2957 MoveUse = Pos;
2958 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2959 // Add the instruction after the scheduled instruction.
2960 OrderAfterDef = true;
2961 MoveDef = Pos;
2962 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2963 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2964 OrderBeforeUse = true;
2965 if (MoveUse == 0)
2966 MoveUse = Pos;
2967 } else {
2968 OrderAfterDef = true;
2969 MoveDef = Pos;
2970 }
2971 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2972 OrderBeforeUse = true;
2973 if (MoveUse == 0)
2974 MoveUse = Pos;
2975 if (MoveUse != 0) {
2976 OrderAfterDef = true;
2977 MoveDef = Pos - 1;
2978 }
2979 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2980 // Add the instruction before the scheduled instruction.
2981 OrderBeforeUse = true;
2982 if (MoveUse == 0)
2983 MoveUse = Pos;
2984 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2985 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2986 if (MoveUse == 0) {
2987 OrderBeforeDef = true;
2988 MoveUse = Pos;
2989 }
2990 }
2991 }
2992 // Check for order dependences between instructions. Make sure the source
2993 // is ordered before the destination.
2994 for (auto &OE : DDG->getOutEdges(SU)) {
2995 if (OE.getDst() != *I)
2996 continue;
2997 if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
2998 OrderBeforeUse = true;
2999 if (Pos < MoveUse)
3000 MoveUse = Pos;
3001 }
3002 // We did not handle HW dependences in previous for loop,
3003 // and we normally set Latency = 0 for Anti/Output deps,
3004 // so may have nodes in same cycle with Anti/Output dependent on HW regs.
3005 else if ((OE.isAntiDep() || OE.isOutputDep()) &&
3006 stageScheduled(*I) == StageInst1) {
3007 OrderBeforeUse = true;
3008 if ((MoveUse == 0) || (Pos < MoveUse))
3009 MoveUse = Pos;
3010 }
3011 }
3012 for (auto &IE : DDG->getInEdges(SU)) {
3013 if (IE.getSrc() != *I)
3014 continue;
3015 if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
3016 stageScheduled(*I) == StageInst1) {
3017 OrderAfterDef = true;
3018 MoveDef = Pos;
3019 }
3020 }
3021 }
3022
3023 // A circular dependence.
3024 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3025 OrderBeforeUse = false;
3026
3027 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3028 // to a loop-carried dependence.
3029 if (OrderBeforeDef)
3030 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3031
3032 // The uncommon case when the instruction order needs to be updated because
3033 // there is both a use and def.
3034 if (OrderBeforeUse && OrderAfterDef) {
3035 SUnit *UseSU = Insts.at(MoveUse);
3036 SUnit *DefSU = Insts.at(MoveDef);
3037 if (MoveUse > MoveDef) {
3038 Insts.erase(Insts.begin() + MoveUse);
3039 Insts.erase(Insts.begin() + MoveDef);
3040 } else {
3041 Insts.erase(Insts.begin() + MoveDef);
3042 Insts.erase(Insts.begin() + MoveUse);
3043 }
3044 orderDependence(SSD, UseSU, Insts);
3045 orderDependence(SSD, SU, Insts);
3046 orderDependence(SSD, DefSU, Insts);
3047 return;
3048 }
3049 // Put the new instruction first if there is a use in the list. Otherwise,
3050 // put it at the end of the list.
3051 if (OrderBeforeUse)
3052 Insts.push_front(SU);
3053 else
3054 Insts.push_back(SU);
3055}
3056
3057/// Return true if the scheduled Phi has a loop carried operand.
3059 MachineInstr &Phi) const {
3060 if (!Phi.isPHI())
3061 return false;
3062 assert(Phi.isPHI() && "Expecting a Phi.");
3063 SUnit *DefSU = SSD->getSUnit(&Phi);
3064 unsigned DefCycle = cycleScheduled(DefSU);
3065 int DefStage = stageScheduled(DefSU);
3066
3067 unsigned InitVal = 0;
3068 unsigned LoopVal = 0;
3069 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3070 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3071 if (!UseSU)
3072 return true;
3073 if (UseSU->getInstr()->isPHI())
3074 return true;
3075 unsigned LoopCycle = cycleScheduled(UseSU);
3076 int LoopStage = stageScheduled(UseSU);
3077 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3078}
3079
3080/// Return true if the instruction is a definition that is loop carried
3081/// and defines the use on the next iteration.
3082/// v1 = phi(v2, v3)
3083/// (Def) v3 = op v1
3084/// (MO) = v1
3085/// If MO appears before Def, then v1 and v3 may get assigned to the same
3086/// register.
3088 MachineInstr *Def,
3089 MachineOperand &MO) const {
3090 if (!MO.isReg())
3091 return false;
3092 if (Def->isPHI())
3093 return false;
3094 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3095 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3096 return false;
3097 if (!isLoopCarried(SSD, *Phi))
3098 return false;
3099 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3100 for (MachineOperand &DMO : Def->all_defs()) {
3101 if (DMO.getReg() == LoopReg)
3102 return true;
3103 }
3104 return false;
3105}
3106
3107/// Return true if all scheduled predecessors are loop-carried output/order
3108/// dependencies.
3110 SUnit *SU, const SwingSchedulerDDG *DDG) const {
3111 for (const auto &IE : DDG->getInEdges(SU))
3112 if (InstrToCycle.count(IE.getSrc()))
3113 return false;
3114 return true;
3115}
3116
3117/// Determine transitive dependences of unpipelineable instructions
3120 SmallSet<SUnit *, 8> DoNotPipeline;
3121 SmallVector<SUnit *, 8> Worklist;
3122
3123 for (auto &SU : SSD->SUnits)
3124 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3125 Worklist.push_back(&SU);
3126
3127 const SwingSchedulerDDG *DDG = SSD->getDDG();
3128 while (!Worklist.empty()) {
3129 auto SU = Worklist.pop_back_val();
3130 if (DoNotPipeline.count(SU))
3131 continue;
3132 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3133 DoNotPipeline.insert(SU);
3134 for (const auto &IE : DDG->getInEdges(SU))
3135 Worklist.push_back(IE.getSrc());
3136
3137 // To preserve previous behavior and prevent regression
3138 // FIXME: Remove if this doesn't have significant impact on
3139 for (const auto &OE : DDG->getOutEdges(SU))
3140 if (OE.getDistance() == 1)
3141 Worklist.push_back(OE.getDst());
3142 }
3143 return DoNotPipeline;
3144}
3145
3146// Determine all instructions upon which any unpipelineable instruction depends
3147// and ensure that they are in stage 0. If unable to do so, return false.
3151
3152 int NewLastCycle = INT_MIN;
3153 for (SUnit &SU : SSD->SUnits) {
3154 if (!SU.isInstr())
3155 continue;
3156 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3157 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3158 continue;
3159 }
3160
3161 // Put the non-pipelined instruction as early as possible in the schedule
3162 int NewCycle = getFirstCycle();
3163 for (const auto &IE : SSD->getDDG()->getInEdges(&SU))
3164 if (IE.getDistance() == 0)
3165 NewCycle = std::max(InstrToCycle[IE.getSrc()], NewCycle);
3166
3167 // To preserve previous behavior and prevent regression
3168 // FIXME: Remove if this doesn't have significant impact on performance
3169 for (auto &OE : SSD->getDDG()->getOutEdges(&SU))
3170 if (OE.getDistance() == 1)
3171 NewCycle = std::max(InstrToCycle[OE.getDst()], NewCycle);
3172
3173 int OldCycle = InstrToCycle[&SU];
3174 if (OldCycle != NewCycle) {
3175 InstrToCycle[&SU] = NewCycle;
3176 auto &OldS = getInstructions(OldCycle);
3177 llvm::erase(OldS, &SU);
3178 getInstructions(NewCycle).emplace_back(&SU);
3179 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3180 << ") is not pipelined; moving from cycle " << OldCycle
3181 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3182 }
3183 NewLastCycle = std::max(NewLastCycle, NewCycle);
3184 }
3185 LastCycle = NewLastCycle;
3186 return true;
3187}
3188
3189// Check if the generated schedule is valid. This function checks if
3190// an instruction that uses a physical register is scheduled in a
3191// different stage than the definition. The pipeliner does not handle
3192// physical register values that may cross a basic block boundary.
3193// Furthermore, if a physical def/use pair is assigned to the same
3194// cycle, orderDependence does not guarantee def/use ordering, so that
3195// case should be considered invalid. (The test checks for both
3196// earlier and same-cycle use to be more robust.)
3198 for (SUnit &SU : SSD->SUnits) {
3199 if (!SU.hasPhysRegDefs)
3200 continue;
3201 int StageDef = stageScheduled(&SU);
3202 int CycleDef = InstrToCycle[&SU];
3203 assert(StageDef != -1 && "Instruction should have been scheduled.");
3204 for (auto &OE : SSD->getDDG()->getOutEdges(&SU)) {
3205 SUnit *Dst = OE.getDst();
3206 if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
3207 if (Register::isPhysicalRegister(OE.getReg())) {
3208 if (stageScheduled(Dst) != StageDef)
3209 return false;
3210 if (InstrToCycle[Dst] <= CycleDef)
3211 return false;
3212 }
3213 }
3214 }
3215 return true;
3216}
3217
3218/// A property of the node order in swing-modulo-scheduling is
3219/// that for nodes outside circuits the following holds:
3220/// none of them is scheduled after both a successor and a
3221/// predecessor.
3222/// The method below checks whether the property is met.
3223/// If not, debug information is printed and statistics information updated.
3224/// Note that we do not use an assert statement.
3225/// The reason is that although an invalid node order may prevent
3226/// the pipeliner from finding a pipelined schedule for arbitrary II,
3227/// it does not lead to the generation of incorrect code.
3228void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3229
3230 // a sorted vector that maps each SUnit to its index in the NodeOrder
3231 typedef std::pair<SUnit *, unsigned> UnitIndex;
3232 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3233
3234 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3235 Indices.push_back(std::make_pair(NodeOrder[i], i));
3236
3237 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3238 return std::get<0>(i1) < std::get<0>(i2);
3239 };
3240
3241 // sort, so that we can perform a binary search
3242 llvm::sort(Indices, CompareKey);
3243
3244 bool Valid = true;
3245 (void)Valid;
3246 // for each SUnit in the NodeOrder, check whether
3247 // it appears after both a successor and a predecessor
3248 // of the SUnit. If this is the case, and the SUnit
3249 // is not part of circuit, then the NodeOrder is not
3250 // valid.
3251 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3252 SUnit *SU = NodeOrder[i];
3253 unsigned Index = i;
3254
3255 bool PredBefore = false;
3256 bool SuccBefore = false;
3257
3258 SUnit *Succ;
3259 SUnit *Pred;
3260 (void)Succ;
3261 (void)Pred;
3262
3263 for (const auto &IE : DDG->getInEdges(SU)) {
3264 SUnit *PredSU = IE.getSrc();
3265 unsigned PredIndex = std::get<1>(
3266 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3267 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3268 PredBefore = true;
3269 Pred = PredSU;
3270 break;
3271 }
3272 }
3273
3274 for (const auto &OE : DDG->getOutEdges(SU)) {
3275 SUnit *SuccSU = OE.getDst();
3276 // Do not process a boundary node, it was not included in NodeOrder,
3277 // hence not in Indices either, call to std::lower_bound() below will
3278 // return Indices.end().
3279 if (SuccSU->isBoundaryNode())
3280 continue;
3281 unsigned SuccIndex = std::get<1>(
3282 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3283 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3284 SuccBefore = true;
3285 Succ = SuccSU;
3286 break;
3287 }
3288 }
3289
3290 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3291 // instructions in circuits are allowed to be scheduled
3292 // after both a successor and predecessor.
3293 bool InCircuit = llvm::any_of(
3294 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3295 if (InCircuit)
3296 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3297 else {
3298 Valid = false;
3299 NumNodeOrderIssues++;
3300 LLVM_DEBUG(dbgs() << "Predecessor ");
3301 }
3302 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3303 << " are scheduled before node " << SU->NodeNum
3304 << "\n");
3305 }
3306 }
3307
3308 LLVM_DEBUG({
3309 if (!Valid)
3310 dbgs() << "Invalid node order found!\n";
3311 });
3312}
3313
3314/// Attempt to fix the degenerate cases when the instruction serialization
3315/// causes the register lifetimes to overlap. For example,
3316/// p' = store_pi(p, b)
3317/// = load p, offset
3318/// In this case p and p' overlap, which means that two registers are needed.
3319/// Instead, this function changes the load to use p' and updates the offset.
3320void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3321 unsigned OverlapReg = 0;
3322 unsigned NewBaseReg = 0;
3323 for (SUnit *SU : Instrs) {
3324 MachineInstr *MI = SU->getInstr();
3325 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3326 const MachineOperand &MO = MI->getOperand(i);
3327 // Look for an instruction that uses p. The instruction occurs in the
3328 // same cycle but occurs later in the serialized order.
3329 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3330 // Check that the instruction appears in the InstrChanges structure,
3331 // which contains instructions that can have the offset updated.
3333 InstrChanges.find(SU);
3334 if (It != InstrChanges.end()) {
3335 unsigned BasePos, OffsetPos;
3336 // Update the base register and adjust the offset.
3337 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3339 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3340 int64_t NewOffset =
3341 MI->getOperand(OffsetPos).getImm() - It->second.second;
3342 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3343 SU->setInstr(NewMI);
3344 MISUnitMap[NewMI] = SU;
3345 NewMIs[MI] = NewMI;
3346 }
3347 }
3348 OverlapReg = 0;
3349 NewBaseReg = 0;
3350 break;
3351 }
3352 // Look for an instruction of the form p' = op(p), which uses and defines
3353 // two virtual registers that get allocated to the same physical register.
3354 unsigned TiedUseIdx = 0;
3355 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3356 // OverlapReg is p in the example above.
3357 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3358 // NewBaseReg is p' in the example above.
3359 NewBaseReg = MI->getOperand(i).getReg();
3360 break;
3361 }
3362 }
3363 }
3364}
3365
3366std::deque<SUnit *>
3368 const std::deque<SUnit *> &Instrs) const {
3369 std::deque<SUnit *> NewOrderPhi;
3370 for (SUnit *SU : Instrs) {
3371 if (SU->getInstr()->isPHI())
3372 NewOrderPhi.push_back(SU);
3373 }
3374 std::deque<SUnit *> NewOrderI;
3375 for (SUnit *SU : Instrs) {
3376 if (!SU->getInstr()->isPHI())
3377 orderDependence(SSD, SU, NewOrderI);
3378 }
3379 llvm::append_range(NewOrderPhi, NewOrderI);
3380 return NewOrderPhi;
3381}
3382
3383/// After the schedule has been formed, call this function to combine
3384/// the instructions from the different stages/cycles. That is, this
3385/// function creates a schedule that represents a single iteration.
3387 // Move all instructions to the first stage from later stages.
3388 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3389 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3390 ++stage) {
3391 std::deque<SUnit *> &cycleInstrs =
3392 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3393 for (SUnit *SU : llvm::reverse(cycleInstrs))
3394 ScheduledInstrs[cycle].push_front(SU);
3395 }
3396 }
3397
3398 // Erase all the elements in the later stages. Only one iteration should
3399 // remain in the scheduled list, and it contains all the instructions.
3400 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3401 ScheduledInstrs.erase(cycle);
3402
3403 // Change the registers in instruction as specified in the InstrChanges
3404 // map. We need to use the new registers to create the correct order.
3405 for (const SUnit &SU : SSD->SUnits)
3406 SSD->applyInstrChange(SU.getInstr(), *this);
3407
3408 // Reorder the instructions in each cycle to fix and improve the
3409 // generated code.
3410 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3411 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3412 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3413 SSD->fixupRegisterOverlaps(cycleInstrs);
3414 }
3415
3416 LLVM_DEBUG(dump(););
3417}
3418
3420 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3421 << " depth " << MaxDepth << " col " << Colocate << "\n";
3422 for (const auto &I : Nodes)
3423 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3424 os << "\n";
3425}
3426
3427#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3428/// Print the schedule information to the given output.
3430 // Iterate over each cycle.
3431 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3432 // Iterate over each instruction in the cycle.
3433 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3434 for (SUnit *CI : cycleInstrs->second) {
3435 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3436 os << "(" << CI->NodeNum << ") ";
3437 CI->getInstr()->print(os);
3438 os << "\n";
3439 }
3440 }
3441}
3442
3443/// Utility function used for debugging to print the schedule.
3446
3447void ResourceManager::dumpMRT() const {
3448 LLVM_DEBUG({
3449 if (UseDFA)
3450 return;
3451 std::stringstream SS;
3452 SS << "MRT:\n";
3453 SS << std::setw(4) << "Slot";
3454 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3455 SS << std::setw(3) << I;
3456 SS << std::setw(7) << "#Mops"
3457 << "\n";
3458 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3459 SS << std::setw(4) << Slot;
3460 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3461 SS << std::setw(3) << MRT[Slot][I];
3462 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3463 }
3464 dbgs() << SS.str();
3465 });
3466}
3467#endif
3468
3470 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3471 unsigned ProcResourceID = 0;
3472
3473 // We currently limit the resource kinds to 64 and below so that we can use
3474 // uint64_t for Masks
3475 assert(SM.getNumProcResourceKinds() < 64 &&
3476 "Too many kinds of resources, unsupported");
3477 // Create a unique bitmask for every processor resource unit.
3478 // Skip resource at index 0, since it always references 'InvalidUnit'.
3479 Masks.resize(SM.getNumProcResourceKinds());
3480 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3482 if (Desc.SubUnitsIdxBegin)
3483 continue;
3484 Masks[I] = 1ULL << ProcResourceID;
3485 ProcResourceID++;
3486 }
3487 // Create a unique bitmask for every processor resource group.
3488 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3490 if (!Desc.SubUnitsIdxBegin)
3491 continue;
3492 Masks[I] = 1ULL << ProcResourceID;
3493 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3494 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3495 ProcResourceID++;
3496 }
3497 LLVM_DEBUG({
3498 if (SwpShowResMask) {
3499 dbgs() << "ProcResourceDesc:\n";
3500 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3501 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3502 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3503 ProcResource->Name, I, Masks[I],
3504 ProcResource->NumUnits);
3505 }
3506 dbgs() << " -----------------\n";
3507 }
3508 });
3509}
3510
3512 LLVM_DEBUG({
3513 if (SwpDebugResource)
3514 dbgs() << "canReserveResources:\n";
3515 });
3516 if (UseDFA)
3517 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3518 ->canReserveResources(&SU.getInstr()->getDesc());
3519
3520 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3521 if (!SCDesc->isValid()) {
3522 LLVM_DEBUG({
3523 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3524 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3525 });
3526 return true;
3527 }
3528
3529 reserveResources(SCDesc, Cycle);
3530 bool Result = !isOverbooked();
3531 unreserveResources(SCDesc, Cycle);
3532
3533 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3534 return Result;
3535}
3536
3537void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3538 LLVM_DEBUG({
3539 if (SwpDebugResource)
3540 dbgs() << "reserveResources:\n";
3541 });
3542 if (UseDFA)
3543 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3544 ->reserveResources(&SU.getInstr()->getDesc());
3545
3546 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3547 if (!SCDesc->isValid()) {
3548 LLVM_DEBUG({
3549 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3550 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3551 });
3552 return;
3553 }
3554
3555 reserveResources(SCDesc, Cycle);
3556
3557 LLVM_DEBUG({
3558 if (SwpDebugResource) {
3559 dumpMRT();
3560 dbgs() << "reserveResources: done!\n\n";
3561 }
3562 });
3563}
3564
3565void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3566 int Cycle) {
3567 assert(!UseDFA);
3568 for (const MCWriteProcResEntry &PRE : make_range(
3569 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3570 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3571 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3572
3573 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3574 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3575}
3576
3577void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3578 int Cycle) {
3579 assert(!UseDFA);
3580 for (const MCWriteProcResEntry &PRE : make_range(
3581 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3582 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3583 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3584
3585 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3586 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3587}
3588
3589bool ResourceManager::isOverbooked() const {
3590 assert(!UseDFA);
3591 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3592 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3594 if (MRT[Slot][I] > Desc->NumUnits)
3595 return true;
3596 }
3597 if (NumScheduledMops[Slot] > IssueWidth)
3598 return true;
3599 }
3600 return false;
3601}
3602
3603int ResourceManager::calculateResMIIDFA() const {
3604 assert(UseDFA);
3605
3606 // Sort the instructions by the number of available choices for scheduling,
3607 // least to most. Use the number of critical resources as the tie breaker.
3608 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3609 for (SUnit &SU : DAG->SUnits)
3610 FUS.calcCriticalResources(*SU.getInstr());
3612 FuncUnitOrder(FUS);
3613
3614 for (SUnit &SU : DAG->SUnits)
3615 FuncUnitOrder.push(SU.getInstr());
3616
3618 Resources.push_back(
3619 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3620
3621 while (!FuncUnitOrder.empty()) {
3622 MachineInstr *MI = FuncUnitOrder.top();
3623 FuncUnitOrder.pop();
3624 if (TII->isZeroCost(MI->getOpcode()))
3625 continue;
3626
3627 // Attempt to reserve the instruction in an existing DFA. At least one
3628 // DFA is needed for each cycle.
3629 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3630 unsigned ReservedCycles = 0;
3631 auto *RI = Resources.begin();
3632 auto *RE = Resources.end();
3633 LLVM_DEBUG({
3634 dbgs() << "Trying to reserve resource for " << NumCycles
3635 << " cycles for \n";
3636 MI->dump();
3637 });
3638 for (unsigned C = 0; C < NumCycles; ++C)
3639 while (RI != RE) {
3640 if ((*RI)->canReserveResources(*MI)) {
3641 (*RI)->reserveResources(*MI);
3642 ++ReservedCycles;
3643 break;
3644 }
3645 RI++;
3646 }
3647 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3648 << ", NumCycles:" << NumCycles << "\n");
3649 // Add new DFAs, if needed, to reserve resources.
3650 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3652 << "NewResource created to reserve resources"
3653 << "\n");
3654 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3655 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3656 NewResource->reserveResources(*MI);
3657 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3658 }
3659 }
3660
3661 int Resmii = Resources.size();
3662 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3663 return Resmii;
3664}
3665
3667 if (UseDFA)
3668 return calculateResMIIDFA();
3669
3670 // Count each resource consumption and divide it by the number of units.
3671 // ResMII is the max value among them.
3672
3673 int NumMops = 0;
3675 for (SUnit &SU : DAG->SUnits) {
3676 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3677 continue;
3678
3679 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3680 if (!SCDesc->isValid())
3681 continue;
3682
3683 LLVM_DEBUG({
3684 if (SwpDebugResource) {
3685 DAG->dumpNode(SU);
3686 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3687 << " WriteProcRes: ";
3688 }
3689 });
3690 NumMops += SCDesc->NumMicroOps;
3691 for (const MCWriteProcResEntry &PRE :
3692 make_range(STI->getWriteProcResBegin(SCDesc),
3693 STI->getWriteProcResEnd(SCDesc))) {
3694 LLVM_DEBUG({
3695 if (SwpDebugResource) {
3696 const MCProcResourceDesc *Desc =
3697 SM.getProcResource(PRE.ProcResourceIdx);
3698 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3699 }
3700 });
3701 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3702 }
3703 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3704 }
3705
3706 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3707 LLVM_DEBUG({
3708 if (SwpDebugResource)
3709 dbgs() << "#Mops: " << NumMops << ", "
3710 << "IssueWidth: " << IssueWidth << ", "
3711 << "Cycles: " << Result << "\n";
3712 });
3713
3714 LLVM_DEBUG({
3715 if (SwpDebugResource) {
3716 std::stringstream SS;
3717 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3718 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3719 << "\n";
3720 dbgs() << SS.str();
3721 }
3722 });
3723 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3725 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3726 LLVM_DEBUG({
3727 if (SwpDebugResource) {
3728 std::stringstream SS;
3729 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3730 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3731 << std::setw(10) << Cycles << "\n";
3732 dbgs() << SS.str();
3733 }
3734 });
3735 if (Cycles > Result)
3736 Result = Cycles;
3737 }
3738 return Result;
3739}
3740
3742 InitiationInterval = II;
3743 DFAResources.clear();
3744 DFAResources.resize(II);
3745 for (auto &I : DFAResources)
3746 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3747 MRT.clear();
3749 NumScheduledMops.clear();
3750 NumScheduledMops.resize(II);
3751}
3752
3753bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
3754 if (Pred.isArtificial() || Dst->isBoundaryNode())
3755 return true;
3756 // Currently, dependence that is an anti-dependences but not a loop-carried is
3757 // also ignored. This behavior is preserved to prevent regression.
3758 // FIXME: Remove if this doesn't have significant impact on performance
3759 return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti || Distance != 0);
3760}
3761
3762SwingSchedulerDDG::SwingSchedulerDDGEdges &
3763SwingSchedulerDDG::getEdges(const SUnit *SU) {
3764 if (SU == EntrySU)
3765 return EntrySUEdges;
3766 if (SU == ExitSU)
3767 return ExitSUEdges;
3768 return EdgesVec[SU->NodeNum];
3769}
3770
3771const SwingSchedulerDDG::SwingSchedulerDDGEdges &
3772SwingSchedulerDDG::getEdges(const SUnit *SU) const {
3773 if (SU == EntrySU)
3774 return EntrySUEdges;
3775 if (SU == ExitSU)
3776 return ExitSUEdges;
3777 return EdgesVec[SU->NodeNum];
3778}
3779
3780void SwingSchedulerDDG::addEdge(const SUnit *SU,
3781 const SwingSchedulerDDGEdge &Edge) {
3782 auto &Edges = getEdges(SU);
3783 if (Edge.getSrc() == SU)
3784 Edges.Succs.push_back(Edge);
3785 else
3786 Edges.Preds.push_back(Edge);
3787}
3788
3789void SwingSchedulerDDG::initEdges(SUnit *SU) {
3790 for (const auto &PI : SU->Preds) {
3791 SwingSchedulerDDGEdge Edge(SU, PI, false);
3792 addEdge(SU, Edge);
3793 }
3794
3795 for (const auto &SI : SU->Succs) {
3796 SwingSchedulerDDGEdge Edge(SU, SI, true);
3797 addEdge(SU, Edge);
3798 }
3799}
3800
3801SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
3802 SUnit *ExitSU)
3803 : EntrySU(EntrySU), ExitSU(ExitSU) {
3804 EdgesVec.resize(SUnits.size());
3805
3806 initEdges(EntrySU);
3807 initEdges(ExitSU);
3808 for (auto &SU : SUnits)
3809 initEdges(&SU);
3810}
3811
3814 return getEdges(SU).Preds;
3815}
3816
3819 return getEdges(SU).Succs;
3820}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited, SwingSchedulerDDG *DDG)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, SwingSchedulerDDG *DDG, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:891
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:616
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:572
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:806
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:936
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:762
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
void dump() const
Definition: Pass.cpp:136
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
int earliestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the earliest scheduled instruction in the dependence chain.
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, const SwingSchedulerDDG *DDG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int latestCycleInChain(const SwingSchedulerDDGEdge &Dep, const SwingSchedulerDDG *DDG)
Return the cycle of the latest scheduled instruction in the dependence chain.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
SUnit EntrySU
Special node for the region entry.
Definition: ScheduleDAG.h:580
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
SUnit ExitSU
Special node for the region exit.
Definition: ScheduleDAG.h:581
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
const SwingSchedulerDDG * getDDG() const
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const
Return true for an order or output dependence that is loop carried potentially.
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Represents a dependence between two instruction.
SUnit * getDst() const
Returns the SUnit to which the edge points (destination node).
bool isArtificial() const
Returns true if the edge represents an artificial dependence.
bool ignoreDependence(bool IgnoreAnti) const
Returns true for DDG nodes that we ignore when computing the cost functions.
bool isOrderDep() const
Returns true if the edge represents a dependence that is not data, anti or output dependence.
SUnit * getSrc() const
Returns the SUnit from which the edge comes (source node).
bool isOutputDep() const
Returns true if the edge represents output dependence.
Represents dependencies between instructions.
SwingSchedulerDDG(std::vector< SUnit > &SUnits, SUnit *EntrySU, SUnit *ExitSU)
const EdgesType & getInEdges(const SUnit *SU) const
const EdgesType & getOutEdges(const SUnit *SU) const
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
@ WS_Force
Use window algorithm after SMS algorithm fails.
@ WS_On
Turn off window algorithm.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:34
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:121
bool isValid() const
Definition: MCSchedule.h:139
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:256
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:363
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:352
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:337
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:356
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:66
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
MachineSchedContext provides enough context from the MachineScheduler pass for the target to insta