LLVM 20.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
73#include "llvm/Config/llvm-config.h"
74#include "llvm/IR/Attributes.h"
75#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
79#include "llvm/Pass.h"
82#include "llvm/Support/Debug.h"
84#include <algorithm>
85#include <cassert>
86#include <climits>
87#include <cstdint>
88#include <deque>
89#include <functional>
90#include <iomanip>
91#include <iterator>
92#include <map>
93#include <memory>
94#include <sstream>
95#include <tuple>
96#include <utility>
97#include <vector>
98
99using namespace llvm;
100
101#define DEBUG_TYPE "pipeliner"
102
103STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
104STATISTIC(NumPipelined, "Number of loops software pipelined");
105STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
106STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
107STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
108STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
109STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
110STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
111STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
112STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
113STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
114
115/// A command line option to turn software pipelining on or off.
116static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
117 cl::desc("Enable Software Pipelining"));
118
119/// A command line option to enable SWP at -Os.
120static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121 cl::desc("Enable SWP at Os."), cl::Hidden,
122 cl::init(false));
123
124/// A command line argument to limit minimum initial interval for pipelining.
125static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126 cl::desc("Size limit for the MII."),
127 cl::Hidden, cl::init(27));
128
129/// A command line argument to force pipeliner to use specified initial
130/// interval.
131static cl::opt<int> SwpForceII("pipeliner-force-ii",
132 cl::desc("Force pipeliner to use specified II."),
133 cl::Hidden, cl::init(-1));
134
135/// A command line argument to limit the number of stages in the pipeline.
136static cl::opt<int>
137 SwpMaxStages("pipeliner-max-stages",
138 cl::desc("Maximum stages allowed in the generated scheduled."),
139 cl::Hidden, cl::init(3));
140
141/// A command line option to disable the pruning of chain dependences due to
142/// an unrelated Phi.
143static cl::opt<bool>
144 SwpPruneDeps("pipeliner-prune-deps",
145 cl::desc("Prune dependences between unrelated Phi nodes."),
146 cl::Hidden, cl::init(true));
147
148/// A command line option to disable the pruning of loop carried order
149/// dependences.
150static cl::opt<bool>
151 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152 cl::desc("Prune loop carried order dependences."),
153 cl::Hidden, cl::init(true));
154
155#ifndef NDEBUG
156static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
157#endif
158
159static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
161 cl::desc("Ignore RecMII"));
162
163static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164 cl::init(false));
165static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166 cl::init(false));
167
169 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
170 cl::desc("Instead of emitting the pipelined code, annotate instructions "
171 "with the generated schedule for feeding into the "
172 "-modulo-schedule-test pass"));
173
175 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
176 cl::desc(
177 "Use the experimental peeling code generator for software pipelining"));
178
179static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180 cl::desc("Range to search for II"),
181 cl::Hidden, cl::init(10));
182
183static cl::opt<bool>
184 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
185 cl::desc("Limit register pressure of scheduled loop"));
186
187static cl::opt<int>
188 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189 cl::init(5),
190 cl::desc("Margin representing the unused percentage of "
191 "the register pressure limit"));
192
193static cl::opt<bool>
194 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
195 cl::desc("Use the MVE code generator for software pipelining"));
196
197namespace llvm {
198
199// A command line option to enable the CopyToPhi DAG mutation.
201 cl::init(true),
202 cl::desc("Enable CopyToPhi DAG Mutation"));
203
204/// A command line argument to force pipeliner to use specified issue
205/// width.
207 "pipeliner-force-issue-width",
208 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
209 cl::init(-1));
210
211/// A command line argument to set the window scheduling option.
214 cl::desc("Set how to use window scheduling algorithm."),
216 "Turn off window algorithm."),
218 "Use window algorithm after SMS algorithm fails."),
220 "Use window algorithm instead of SMS algorithm.")));
221
222} // end namespace llvm
223
224unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
225char MachinePipeliner::ID = 0;
226#ifndef NDEBUG
228#endif
230
232 "Modulo Software Pipelining", false, false)
238 "Modulo Software Pipelining", false, false)
239
240/// The "main" function for implementing Swing Modulo Scheduling.
241bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
242 if (skipFunction(mf.getFunction()))
243 return false;
244
245 if (!EnableSWP)
246 return false;
247
248 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
249 !EnableSWPOptSize.getPosition())
250 return false;
251
252 if (!mf.getSubtarget().enableMachinePipeliner())
253 return false;
254
255 // Cannot pipeline loops without instruction itineraries if we are using
256 // DFA for the pipeliner.
257 if (mf.getSubtarget().useDFAforSMS() &&
258 (!mf.getSubtarget().getInstrItineraryData() ||
259 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
260 return false;
261
262 MF = &mf;
263 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
264 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
265 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
266 TII = MF->getSubtarget().getInstrInfo();
267 RegClassInfo.runOnMachineFunction(*MF);
268
269 for (const auto &L : *MLI)
270 scheduleLoop(*L);
271
272 return false;
273}
274
275/// Attempt to perform the SMS algorithm on the specified loop. This function is
276/// the main entry point for the algorithm. The function identifies candidate
277/// loops, calculates the minimum initiation interval, and attempts to schedule
278/// the loop.
279bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
280 bool Changed = false;
281 for (const auto &InnerLoop : L)
282 Changed |= scheduleLoop(*InnerLoop);
283
284#ifndef NDEBUG
285 // Stop trying after reaching the limit (if any).
286 int Limit = SwpLoopLimit;
287 if (Limit >= 0) {
288 if (NumTries >= SwpLoopLimit)
289 return Changed;
290 NumTries++;
291 }
292#endif
293
294 setPragmaPipelineOptions(L);
295 if (!canPipelineLoop(L)) {
296 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
297 ORE->emit([&]() {
298 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
299 L.getStartLoc(), L.getHeader())
300 << "Failed to pipeline loop";
301 });
302
303 LI.LoopPipelinerInfo.reset();
304 return Changed;
305 }
306
307 ++NumTrytoPipeline;
308 if (useSwingModuloScheduler())
309 Changed = swingModuloScheduler(L);
310
311 if (useWindowScheduler(Changed))
312 Changed = runWindowScheduler(L);
313
314 LI.LoopPipelinerInfo.reset();
315 return Changed;
316}
317
318void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
319 // Reset the pragma for the next loop in iteration.
320 disabledByPragma = false;
321 II_setByPragma = 0;
322
323 MachineBasicBlock *LBLK = L.getTopBlock();
324
325 if (LBLK == nullptr)
326 return;
327
328 const BasicBlock *BBLK = LBLK->getBasicBlock();
329 if (BBLK == nullptr)
330 return;
331
332 const Instruction *TI = BBLK->getTerminator();
333 if (TI == nullptr)
334 return;
335
336 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
337 if (LoopID == nullptr)
338 return;
339
340 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
341 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
342
343 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
344 MDNode *MD = dyn_cast<MDNode>(MDO);
345
346 if (MD == nullptr)
347 continue;
348
349 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
350
351 if (S == nullptr)
352 continue;
353
354 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
355 assert(MD->getNumOperands() == 2 &&
356 "Pipeline initiation interval hint metadata should have two operands.");
358 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
359 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
360 } else if (S->getString() == "llvm.loop.pipeline.disable") {
361 disabledByPragma = true;
362 }
363 }
364}
365
366/// Return true if the loop can be software pipelined. The algorithm is
367/// restricted to loops with a single basic block. Make sure that the
368/// branch in the loop can be analyzed.
369bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
370 if (L.getNumBlocks() != 1) {
371 ORE->emit([&]() {
372 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
373 L.getStartLoc(), L.getHeader())
374 << "Not a single basic block: "
375 << ore::NV("NumBlocks", L.getNumBlocks());
376 });
377 return false;
378 }
379
380 if (disabledByPragma) {
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "Disabled by Pragma.";
385 });
386 return false;
387 }
388
389 // Check if the branch can't be understood because we can't do pipelining
390 // if that's the case.
391 LI.TBB = nullptr;
392 LI.FBB = nullptr;
393 LI.BrCond.clear();
394 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
395 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
396 NumFailBranch++;
397 ORE->emit([&]() {
398 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
399 L.getStartLoc(), L.getHeader())
400 << "The branch can't be understood";
401 });
402 return false;
403 }
404
405 LI.LoopInductionVar = nullptr;
406 LI.LoopCompare = nullptr;
408 if (!LI.LoopPipelinerInfo) {
409 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
410 NumFailLoop++;
411 ORE->emit([&]() {
412 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
413 L.getStartLoc(), L.getHeader())
414 << "The loop structure is not supported";
415 });
416 return false;
417 }
418
419 if (!L.getLoopPreheader()) {
420 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
421 NumFailPreheader++;
422 ORE->emit([&]() {
423 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
424 L.getStartLoc(), L.getHeader())
425 << "No loop preheader found";
426 });
427 return false;
428 }
429
430 // Remove any subregisters from inputs to phi nodes.
431 preprocessPhiNodes(*L.getHeader());
432 return true;
433}
434
435void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
437 SlotIndexes &Slots =
438 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
439
440 for (MachineInstr &PI : B.phis()) {
441 MachineOperand &DefOp = PI.getOperand(0);
442 assert(DefOp.getSubReg() == 0);
443 auto *RC = MRI.getRegClass(DefOp.getReg());
444
445 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
446 MachineOperand &RegOp = PI.getOperand(i);
447 if (RegOp.getSubReg() == 0)
448 continue;
449
450 // If the operand uses a subregister, replace it with a new register
451 // without subregisters, and generate a copy to the new register.
452 Register NewReg = MRI.createVirtualRegister(RC);
453 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
455 const DebugLoc &DL = PredB.findDebugLoc(At);
456 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
457 .addReg(RegOp.getReg(), getRegState(RegOp),
458 RegOp.getSubReg());
459 Slots.insertMachineInstrInMaps(*Copy);
460 RegOp.setReg(NewReg);
461 RegOp.setSubReg(0);
462 }
463 }
464}
465
466/// The SMS algorithm consists of the following main steps:
467/// 1. Computation and analysis of the dependence graph.
468/// 2. Ordering of the nodes (instructions).
469/// 3. Attempt to Schedule the loop.
470bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
471 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
472
474 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
476
477 MachineBasicBlock *MBB = L.getHeader();
478 // The kernel should not include any terminator instructions. These
479 // will be added back later.
480 SMS.startBlock(MBB);
481
482 // Compute the number of 'real' instructions in the basic block by
483 // ignoring terminators.
484 unsigned size = MBB->size();
486 E = MBB->instr_end();
487 I != E; ++I, --size)
488 ;
489
490 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
491 SMS.schedule();
492 SMS.exitRegion();
493
494 SMS.finishBlock();
495 return SMS.hasNewSchedule();
496}
497
507}
508
509bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
510 MachineSchedContext Context;
511 Context.MF = MF;
512 Context.MLI = MLI;
513 Context.MDT = MDT;
514 Context.PassConfig = &getAnalysis<TargetPassConfig>();
515 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
516 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
518 WindowScheduler WS(&Context, L);
519 return WS.run();
520}
521
522bool MachinePipeliner::useSwingModuloScheduler() {
523 // SwingModuloScheduler does not work when WindowScheduler is forced.
525}
526
527bool MachinePipeliner::useWindowScheduler(bool Changed) {
528 // WindowScheduler does not work for following cases:
529 // 1. when it is off.
530 // 2. when SwingModuloScheduler is successfully scheduled.
531 // 3. when pragma II is enabled.
532 if (II_setByPragma) {
533 LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
534 "llvm.loop.pipeline.initiationinterval is set.\n");
535 return false;
536 }
537
540}
541
542void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
543 if (SwpForceII > 0)
544 MII = SwpForceII;
545 else if (II_setByPragma > 0)
546 MII = II_setByPragma;
547 else
548 MII = std::max(ResMII, RecMII);
549}
550
551void SwingSchedulerDAG::setMAX_II() {
552 if (SwpForceII > 0)
553 MAX_II = SwpForceII;
554 else if (II_setByPragma > 0)
555 MAX_II = II_setByPragma;
556 else
557 MAX_II = MII + SwpIISearchRange;
558}
559
560/// We override the schedule function in ScheduleDAGInstrs to implement the
561/// scheduling part of the Swing Modulo Scheduling algorithm.
563 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
564 buildSchedGraph(AA);
565 addLoopCarriedDependences(AA);
566 updatePhiDependences();
568 changeDependences();
569 postProcessDAG();
570 LLVM_DEBUG(dump());
571
572 NodeSetType NodeSets;
573 findCircuits(NodeSets);
574 NodeSetType Circuits = NodeSets;
575
576 // Calculate the MII.
577 unsigned ResMII = calculateResMII();
578 unsigned RecMII = calculateRecMII(NodeSets);
579
580 fuseRecs(NodeSets);
581
582 // This flag is used for testing and can cause correctness problems.
583 if (SwpIgnoreRecMII)
584 RecMII = 0;
585
586 setMII(ResMII, RecMII);
587 setMAX_II();
588
589 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
590 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
591
592 // Can't schedule a loop without a valid MII.
593 if (MII == 0) {
594 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
595 NumFailZeroMII++;
596 Pass.ORE->emit([&]() {
598 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
599 << "Invalid Minimal Initiation Interval: 0";
600 });
601 return;
602 }
603
604 // Don't pipeline large loops.
605 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
606 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
607 << ", we don't pipeline large loops\n");
608 NumFailLargeMaxMII++;
609 Pass.ORE->emit([&]() {
611 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
612 << "Minimal Initiation Interval too large: "
613 << ore::NV("MII", (int)MII) << " > "
614 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
615 << "Refer to -pipeliner-max-mii.";
616 });
617 return;
618 }
619
620 computeNodeFunctions(NodeSets);
621
622 registerPressureFilter(NodeSets);
623
624 colocateNodeSets(NodeSets);
625
626 checkNodeSets(NodeSets);
627
628 LLVM_DEBUG({
629 for (auto &I : NodeSets) {
630 dbgs() << " Rec NodeSet ";
631 I.dump();
632 }
633 });
634
635 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
636
637 groupRemainingNodes(NodeSets);
638
639 removeDuplicateNodes(NodeSets);
640
641 LLVM_DEBUG({
642 for (auto &I : NodeSets) {
643 dbgs() << " NodeSet ";
644 I.dump();
645 }
646 });
647
648 computeNodeOrder(NodeSets);
649
650 // check for node order issues
651 checkValidNodeOrder(Circuits);
652
653 SMSchedule Schedule(Pass.MF, this);
654 Scheduled = schedulePipeline(Schedule);
655
656 if (!Scheduled){
657 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
658 NumFailNoSchedule++;
659 Pass.ORE->emit([&]() {
661 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
662 << "Unable to find schedule";
663 });
664 return;
665 }
666
667 unsigned numStages = Schedule.getMaxStageCount();
668 // No need to generate pipeline if there are no overlapped iterations.
669 if (numStages == 0) {
670 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
671 NumFailZeroStage++;
672 Pass.ORE->emit([&]() {
674 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
675 << "No need to pipeline - no overlapped iterations in schedule.";
676 });
677 return;
678 }
679 // Check that the maximum stage count is less than user-defined limit.
680 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
681 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
682 << " : too many stages, abort\n");
683 NumFailLargeMaxStage++;
684 Pass.ORE->emit([&]() {
686 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
687 << "Too many stages in schedule: "
688 << ore::NV("numStages", (int)numStages) << " > "
689 << ore::NV("SwpMaxStages", SwpMaxStages)
690 << ". Refer to -pipeliner-max-stages.";
691 });
692 return;
693 }
694
695 Pass.ORE->emit([&]() {
697 Loop.getHeader())
698 << "Pipelined succesfully!";
699 });
700
701 // Generate the schedule as a ModuloSchedule.
702 DenseMap<MachineInstr *, int> Cycles, Stages;
703 std::vector<MachineInstr *> OrderedInsts;
704 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
705 ++Cycle) {
706 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
707 OrderedInsts.push_back(SU->getInstr());
708 Cycles[SU->getInstr()] = Cycle;
709 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
710 }
711 }
713 for (auto &KV : NewMIs) {
714 Cycles[KV.first] = Cycles[KV.second];
715 Stages[KV.first] = Stages[KV.second];
716 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
717 }
718
719 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
720 std::move(Stages));
722 assert(NewInstrChanges.empty() &&
723 "Cannot serialize a schedule with InstrChanges!");
725 MSTI.annotate();
726 return;
727 }
728 // The experimental code generator can't work if there are InstChanges.
729 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
730 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
731 MSE.expand();
732 } else if (MVECodeGen && NewInstrChanges.empty() &&
733 LoopPipelinerInfo->isMVEExpanderSupported() &&
735 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
736 MSE.expand();
737 } else {
738 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
739 MSE.expand();
740 MSE.cleanup();
741 }
742 ++NumPipelined;
743}
744
745/// Clean up after the software pipeliner runs.
747 for (auto &KV : NewMIs)
748 MF.deleteMachineInstr(KV.second);
749 NewMIs.clear();
750
751 // Call the superclass.
753}
754
755/// Return the register values for the operands of a Phi instruction.
756/// This function assume the instruction is a Phi.
758 unsigned &InitVal, unsigned &LoopVal) {
759 assert(Phi.isPHI() && "Expecting a Phi.");
760
761 InitVal = 0;
762 LoopVal = 0;
763 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
764 if (Phi.getOperand(i + 1).getMBB() != Loop)
765 InitVal = Phi.getOperand(i).getReg();
766 else
767 LoopVal = Phi.getOperand(i).getReg();
768
769 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
770}
771
772/// Return the Phi register value that comes the loop block.
773static unsigned getLoopPhiReg(const MachineInstr &Phi,
774 const MachineBasicBlock *LoopBB) {
775 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
776 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
777 return Phi.getOperand(i).getReg();
778 return 0;
779}
780
781/// Return true if SUb can be reached from SUa following the chain edges.
782static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
785 Worklist.push_back(SUa);
786 while (!Worklist.empty()) {
787 const SUnit *SU = Worklist.pop_back_val();
788 for (const auto &SI : SU->Succs) {
789 SUnit *SuccSU = SI.getSUnit();
790 if (SI.getKind() == SDep::Order) {
791 if (Visited.count(SuccSU))
792 continue;
793 if (SuccSU == SUb)
794 return true;
795 Worklist.push_back(SuccSU);
796 Visited.insert(SuccSU);
797 }
798 }
799 }
800 return false;
801}
802
803/// Return true if the instruction causes a chain between memory
804/// references before and after it.
806 return MI.isCall() || MI.mayRaiseFPException() ||
807 MI.hasUnmodeledSideEffects() ||
808 (MI.hasOrderedMemoryRef() &&
809 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
810}
811
812/// Return the underlying objects for the memory references of an instruction.
813/// This function calls the code in ValueTracking, but first checks that the
814/// instruction has a memory operand.
817 if (!MI->hasOneMemOperand())
818 return;
819 MachineMemOperand *MM = *MI->memoperands_begin();
820 if (!MM->getValue())
821 return;
822 getUnderlyingObjects(MM->getValue(), Objs);
823 for (const Value *V : Objs) {
824 if (!isIdentifiedObject(V)) {
825 Objs.clear();
826 return;
827 }
828 }
829}
830
831/// Add a chain edge between a load and store if the store can be an
832/// alias of the load on a subsequent iteration, i.e., a loop carried
833/// dependence. This code is very similar to the code in ScheduleDAGInstrs
834/// but that code doesn't create loop carried dependences.
835void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
839 for (auto &SU : SUnits) {
840 MachineInstr &MI = *SU.getInstr();
842 PendingLoads.clear();
843 else if (MI.mayLoad()) {
846 if (Objs.empty())
848 for (const auto *V : Objs) {
849 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
850 SUs.push_back(&SU);
851 }
852 } else if (MI.mayStore()) {
855 if (Objs.empty())
857 for (const auto *V : Objs) {
859 PendingLoads.find(V);
860 if (I == PendingLoads.end())
861 continue;
862 for (auto *Load : I->second) {
863 if (isSuccOrder(Load, &SU))
864 continue;
865 MachineInstr &LdMI = *Load->getInstr();
866 // First, perform the cheaper check that compares the base register.
867 // If they are the same and the load offset is less than the store
868 // offset, then mark the dependence as loop carried potentially.
869 const MachineOperand *BaseOp1, *BaseOp2;
870 int64_t Offset1, Offset2;
871 bool Offset1IsScalable, Offset2IsScalable;
872 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
873 Offset1IsScalable, TRI) &&
874 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
875 Offset2IsScalable, TRI)) {
876 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
877 Offset1IsScalable == Offset2IsScalable &&
878 (int)Offset1 < (int)Offset2) {
880 "What happened to the chain edge?");
881 SDep Dep(Load, SDep::Barrier);
882 Dep.setLatency(1);
883 SU.addPred(Dep);
884 continue;
885 }
886 }
887 // Second, the more expensive check that uses alias analysis on the
888 // base registers. If they alias, and the load offset is less than
889 // the store offset, the mark the dependence as loop carried.
890 if (!AA) {
891 SDep Dep(Load, SDep::Barrier);
892 Dep.setLatency(1);
893 SU.addPred(Dep);
894 continue;
895 }
896 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
897 MachineMemOperand *MMO2 = *MI.memoperands_begin();
898 if (!MMO1->getValue() || !MMO2->getValue()) {
899 SDep Dep(Load, SDep::Barrier);
900 Dep.setLatency(1);
901 SU.addPred(Dep);
902 continue;
903 }
904 if (MMO1->getValue() == MMO2->getValue() &&
905 MMO1->getOffset() <= MMO2->getOffset()) {
906 SDep Dep(Load, SDep::Barrier);
907 Dep.setLatency(1);
908 SU.addPred(Dep);
909 continue;
910 }
911 if (!AA->isNoAlias(
914 MMO2->getAAInfo()))) {
915 SDep Dep(Load, SDep::Barrier);
916 Dep.setLatency(1);
917 SU.addPred(Dep);
918 }
919 }
920 }
921 }
922 }
923}
924
925/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
926/// processes dependences for PHIs. This function adds true dependences
927/// from a PHI to a use, and a loop carried dependence from the use to the
928/// PHI. The loop carried dependence is represented as an anti dependence
929/// edge. This function also removes chain dependences between unrelated
930/// PHIs.
931void SwingSchedulerDAG::updatePhiDependences() {
932 SmallVector<SDep, 4> RemoveDeps;
934
935 // Iterate over each DAG node.
936 for (SUnit &I : SUnits) {
937 RemoveDeps.clear();
938 // Set to true if the instruction has an operand defined by a Phi.
939 unsigned HasPhiUse = 0;
940 unsigned HasPhiDef = 0;
941 MachineInstr *MI = I.getInstr();
942 // Iterate over each operand, and we process the definitions.
943 for (const MachineOperand &MO : MI->operands()) {
944 if (!MO.isReg())
945 continue;
946 Register Reg = MO.getReg();
947 if (MO.isDef()) {
948 // If the register is used by a Phi, then create an anti dependence.
950 UI = MRI.use_instr_begin(Reg),
951 UE = MRI.use_instr_end();
952 UI != UE; ++UI) {
953 MachineInstr *UseMI = &*UI;
954 SUnit *SU = getSUnit(UseMI);
955 if (SU != nullptr && UseMI->isPHI()) {
956 if (!MI->isPHI()) {
957 SDep Dep(SU, SDep::Anti, Reg);
958 Dep.setLatency(1);
959 I.addPred(Dep);
960 } else {
961 HasPhiDef = Reg;
962 // Add a chain edge to a dependent Phi that isn't an existing
963 // predecessor.
964 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
965 I.addPred(SDep(SU, SDep::Barrier));
966 }
967 }
968 }
969 } else if (MO.isUse()) {
970 // If the register is defined by a Phi, then create a true dependence.
972 if (DefMI == nullptr)
973 continue;
974 SUnit *SU = getSUnit(DefMI);
975 if (SU != nullptr && DefMI->isPHI()) {
976 if (!MI->isPHI()) {
977 SDep Dep(SU, SDep::Data, Reg);
978 Dep.setLatency(0);
979 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
980 &SchedModel);
981 I.addPred(Dep);
982 } else {
983 HasPhiUse = Reg;
984 // Add a chain edge to a dependent Phi that isn't an existing
985 // predecessor.
986 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
987 I.addPred(SDep(SU, SDep::Barrier));
988 }
989 }
990 }
991 }
992 // Remove order dependences from an unrelated Phi.
993 if (!SwpPruneDeps)
994 continue;
995 for (auto &PI : I.Preds) {
996 MachineInstr *PMI = PI.getSUnit()->getInstr();
997 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
998 if (I.getInstr()->isPHI()) {
999 if (PMI->getOperand(0).getReg() == HasPhiUse)
1000 continue;
1001 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1002 continue;
1003 }
1004 RemoveDeps.push_back(PI);
1005 }
1006 }
1007 for (const SDep &D : RemoveDeps)
1008 I.removePred(D);
1009 }
1010}
1011
/// Iterate over each DAG node and see if we can change any dependences
/// in order to reduce the recurrence MII.
void SwingSchedulerDAG::changeDependences() {
  // See if an instruction can use a value from the previous iteration.
  // If so, we update the base and offset of the instruction and change
  // the dependences.
  for (SUnit &I : SUnits) {
    unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
    int64_t NewOffset = 0;
    // Target hook: can this instruction's address computation reuse the base
    // value produced on the previous iteration?
    if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
                               NewOffset))
      continue;

    // Get the MI and SUnit for the instruction that defines the original base.
    Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
    if (!DefMI)
      continue;
    SUnit *DefSU = getSUnit(DefMI);
    if (!DefSU)
      continue;
    // Get the MI and SUnit for the instruction that defines the new base.
    MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
    if (!LastMI)
      continue;
    SUnit *LastSU = getSUnit(LastMI);
    if (!LastSU)
      continue;

    // Bail out if redirecting the dependence would create a cycle in the DAG.
    if (Topo.IsReachable(&I, LastSU))
      continue;

    // Remove the dependence. The value now depends on a prior iteration.
    // Collect first, then remove: mutating I.Preds while iterating is unsafe.
    for (const SDep &P : I.Preds)
      if (P.getSUnit() == DefSU)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(&I, D.getSUnit());
      I.removePred(D);
    }
    // Remove the chain dependence between the instructions.
    Deps.clear();
    for (auto &P : LastSU->Preds)
      if (P.getSUnit() == &I && P.getKind() == SDep::Order)
        Deps.push_back(P);
    for (const SDep &D : Deps) {
      Topo.RemovePred(LastSU, D.getSUnit());
      LastSU->removePred(D);
    }

    // Add a dependence between the new instruction and the instruction
    // that defines the new base.
    SDep Dep(&I, SDep::Anti, NewBase);
    Topo.AddPred(LastSU, &I);
    LastSU->addPred(Dep);

    // Remember the base and offset information so that we can update the
    // instruction during code generation.
    InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
  }
}
1074
/// Create an instruction stream that represents a single iteration and stage of
/// each instruction. This function differs from SMSchedule::finalizeSchedule in
/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
/// function is an approximation of SMSchedule::finalizeSchedule with all
/// non-const operations removed.
                                  SMSchedule &Schedule,
                                  std::vector<MachineInstr *> &OrderedInsts,

  // Move all instructions to the first stage from the later stages.
  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
         Stage <= LastStage; ++Stage) {
      // Walk the stage's cycle in reverse so that push_front preserves the
      // within-cycle instruction order.
      for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
               Cycle + Stage * Schedule.getInitiationInterval()))) {
        Instrs[Cycle].push_front(SU);
      }
    }
  }

  for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
       ++Cycle) {
    std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
    // Apply the same within-cycle ordering rules used when a schedule is
    // finalized, then record each instruction and the stage it runs in.
    CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
    for (SUnit *SU : CycleInstrs) {
      MachineInstr *MI = SU->getInstr();
      OrderedInsts.push_back(MI);
      Stages[MI] = Schedule.stageScheduled(SU);
    }
  }
}
1109
1110namespace {
1111
// FuncUnitSorter - Comparison operator used to sort instructions by
// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Prefer the itinerary model when the target provides one.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        unsigned numAlternatives = llvm::popcount(funcUnits);
        // Remember the most constrained stage and the units it can use.
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // Otherwise fall back to the per-operand machine scheduling model.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        // Skip resources the instruction does not actually occupy.
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // A single set bit means this stage has exactly one unit choice.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    // Break ties by how heavily contended each critical resource is.
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    return MFUs1 > MFUs2;
  }
};
1215
/// Calculate the maximum register pressure of the scheduled instructions stream
class HighRegisterPressureDetector {
  // The single-block loop body whose schedule is being evaluated.
  MachineBasicBlock *OrigMBB;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets defined by the target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it's
  // enough to calculate them once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;



public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  // Print a pressure vector as "[p0 p1 ...]" for debug output.
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Print the pressure sets that Reg belongs to, for debug output.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to every pressure set it belongs to.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from every pressure set it belongs to.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is reserved one, for example, stack pointer
  bool isReservedRegister(Register Reg) const {
    return Reg.isPhysical() && MRI.isReserved(Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined inside the loop body.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isReservedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
  }

  // There are two patterns of last-use.
  //   - by an instruction of the current iteration
  //   - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  //   - next iteration's phi instructions in i-th stage
  //   - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    //   - live-in one
    //   - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        // For a phi only the loop-carried input is a use within the loop.
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    // Higher score means the use happens logically later across the
    // overlapped iterations (phis of the next iteration run with stage i,
    // ordinary instructions with stage i+1).
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto [Ite, Inserted] = LastUseMI.try_emplace(Reg, MI);
        if (!Inserted) {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          // Keep whichever use has the later (larger) score.
          if (InstrScore(Orig) < InstrScore(New))
            Ite->second = New;
        }
      }
    }

    // Invert the map: instruction -> set of registers it last-uses.
    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlapping.
  //
  // An example of unrolled loop where #Stage is 4..
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Mark Reg live in the given iteration's set and bump current pressure.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill Reg in the given iteration's set and drop current pressure.
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isReservedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        // The instruction's stage has not started yet at overlap step I.
        if (I < Stage)
          continue;

        // The iteration this instruction instance belongs to.
        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            // A phi's loop-carried use kills the value of the previous
            // iteration; iteration 0 has no predecessor iteration.
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      // Keep a safety margin below the hard limit when judging the schedule.
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1545
1546} // end anonymous namespace
1547
/// Calculate the resource constrained minimum initiation interval for the
/// specified loop. We use the DFA to model the resources needed for
/// each instruction, and we ignore dependences. A different DFA is created
/// for each cycle that is required. When adding a new instruction, we attempt
/// to add it to each existing DFA, until a legal space is found. If the
/// instruction cannot be reserved in an existing DFA, we create a new one.
unsigned SwingSchedulerDAG::calculateResMII() {
  LLVM_DEBUG(dbgs() << "calculateResMII:\n");
  // Delegate the actual computation to the resource manager.
  return RM.calculateResMII();
}
1559
1560/// Calculate the recurrence-constrainted minimum initiation interval.
1561/// Iterate over each circuit. Compute the delay(c) and distance(c)
1562/// for each circuit. The II needs to satisfy the inequality
1563/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1564/// II that satisfies the inequality, and the RecMII is the maximum
1565/// of those values.
1566unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1567 unsigned RecMII = 0;
1568
1569 for (NodeSet &Nodes : NodeSets) {
1570 if (Nodes.empty())
1571 continue;
1572
1573 unsigned Delay = Nodes.getLatency();
1574 unsigned Distance = 1;
1575
1576 // ii = ceil(delay / distance)
1577 unsigned CurMII = (Delay + Distance - 1) / Distance;
1578 Nodes.setRecMII(CurMII);
1579 if (CurMII > RecMII)
1580 RecMII = CurMII;
1581 }
1582
1583 return RecMII;
1584}
1585
/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
/// but we do this to find the circuits, and then change them back.
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
  // Collect all anti edges first; mutating the edge lists while iterating
  // over them would be unsafe.
  for (SUnit &SU : SUnits) {
    for (SDep &Pred : SU.Preds)
      if (Pred.getKind() == SDep::Anti)
        DepsAdded.push_back(std::make_pair(&SU, Pred));
  }
  for (std::pair<SUnit *, SDep> &P : DepsAdded) {
    // Remove this anti dependency and add one in the reverse direction.
    SUnit *SU = P.first;
    SDep &D = P.second;
    SUnit *TargetSU = D.getSUnit();
    unsigned Reg = D.getReg();
    unsigned Lat = D.getLatency();
    SU->removePred(D);
    SDep Dep(SU, SDep::Anti, Reg);
    Dep.setLatency(Lat);
    TargetSU->addPred(Dep);
  }
}
1608
/// Create the adjacency structure of the nodes in the graph.
void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
    SwingSchedulerDAG *DAG) {
  BitVector Added(SUnits.size());
  // Maps the tail of an output-dependence chain to its head, so the
  // back-edge can be added once the whole chain has been seen.
  DenseMap<int, int> OutputDeps;
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Added.reset();
    // Add any successor to the adjacency matrix and exclude duplicates.
    for (auto &SI : SUnits[i].Succs) {
      // Only create a back-edge on the first and last nodes of a dependence
      // chain. This records any chains and adds them later.
      if (SI.getKind() == SDep::Output) {
        int N = SI.getSUnit()->NodeNum;
        int BackEdge = i;
        // If i was already the tail of a recorded chain, extend the chain:
        // the new tail N maps back to the original head.
        auto Dep = OutputDeps.find(BackEdge);
        if (Dep != OutputDeps.end()) {
          BackEdge = Dep->second;
          OutputDeps.erase(Dep);
        }
        OutputDeps[N] = BackEdge;
      }
      // Do not process a boundary node, an artificial node.
      // A back-edge is processed only if it goes to a Phi.
      if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
          (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
        continue;
      int N = SI.getSUnit()->NodeNum;
      if (!Added.test(N)) {
        AdjK[i].push_back(N);
        Added.set(N);
      }
    }
    // A chain edge between a store and a load is treated as a back-edge in the
    // adjacency matrix.
    for (auto &PI : SUnits[i].Preds) {
      // Only loop-carried order dependences from a store are of interest.
      if (!SUnits[i].getInstr()->mayStore() ||
          !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
        continue;
      if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
        int N = PI.getSUnit()->NodeNum;
        if (!Added.test(N)) {
          AdjK[i].push_back(N);
          Added.set(N);
        }
      }
    }
  }
  // Add back-edges in the adjacency matrix for the output dependences.
  for (auto &OD : OutputDeps)
    if (!Added.test(OD.second)) {
      AdjK[OD.first].push_back(OD.second);
      Added.set(OD.second);
    }
}
1663
/// Identify an elementary circuit in the dependence graph starting at the
/// specified node. Implements the recursive CIRCUIT routine of Johnson's
/// elementary-circuits algorithm: V is the current node, S the start node,
/// and HasBackedge records whether the current path used a back-edge.
bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
                                          const SwingSchedulerDAG *DAG,
                                          bool HasBackedge) {
  SUnit *SV = &SUnits[V];
  bool F = false;
  Stack.insert(SV);
  Blocked.set(V);

  for (auto W : AdjK[V]) {
    // Cap the amount of searching to bound compile time.
    if (NumPaths > MaxPaths)
      break;
    // Nodes numbered below the start node belong to earlier searches.
    if (W < S)
      continue;
    if (W == S) {
      // Closed a circuit back to the start node; record it unless the
      // path only exists because of a back-edge.
      if (!HasBackedge)
        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
      F = true;
      ++NumPaths;
      break;
    }
    if (!Blocked.test(W)) {
      // An edge to a lower topological index is a back-edge.
      if (circuit(W, S, NodeSets, DAG,
                  Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
        F = true;
    }
  }

  if (F)
    unblock(V);
  else {
    // No circuit through V: defer unblocking V until one of its successors
    // is unblocked (Johnson's B-set bookkeeping).
    for (auto W : AdjK[V]) {
      if (W < S)
        continue;
      B[W].insert(SV);
    }
  }
  Stack.pop_back();
  return F;
}
1705
/// Unblock a node in the circuit finding algorithm.
void SwingSchedulerDAG::Circuits::unblock(int U) {
  Blocked.reset(U);
  // Recursively unblock every node whose search was deferred on U.
  while (!BU.empty()) {
    assert(SI != BU.end() && "Invalid B set.");
    SUnit *W = *SI;
    BU.erase(W);
    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
  }
}
1719
/// Identify all the elementary circuits in the dependence graph using
/// Johnson's circuit algorithm.
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
  // but we do this to find the circuits, and then change them back.

  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
  Cir.createAdjacencyStructure(this);
  // Run the circuit search rooted at each node in turn.
  for (int I = 0, E = SUnits.size(); I != E; ++I) {
    Cir.reset();
    Cir.circuit(I, I, NodeSets, this);
  }

  // Change the dependences back so that we've created a DAG again.
}
1738
// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
// is loop-carried to the USE in next iteration. This will help pipeliner avoid
// additional copies that are needed across iterations. An artificial dependence
// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.

// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
// PHI-------True-Dep------> USEOfPhi

// The mutation creates
// USEOfPHI -------Artificial-Dep---> SRCOfCopy

// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
// late to avoid additional copies across iterations. The possible scheduling
// order would be
// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.

void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        // Chase through PHI/REG_SEQUENCE users transitively; only real uses
        // receive the artificial edge.
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
1818
1819/// Return true for DAG nodes that we ignore when computing the cost functions.
1820/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1821/// in the calculation of the ASAP, ALAP, etc functions.
1822static bool ignoreDependence(const SDep &D, bool isPred) {
1823 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1824 return true;
1825 return D.getKind() == SDep::Anti && isPred;
1826}
1827
/// Compute several functions needed to order the nodes for scheduling.
///  ASAP - Earliest time to schedule a node.
///  ALAP - Latest time to schedule a node.
///  MOV - Mobility function, difference between ALAP and ASAP.
///  D - Depth of each node.
///  H - Height of each node.
void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
  ScheduleInfo.resize(SUnits.size());

  LLVM_DEBUG({
    for (int I : Topo) {
      const SUnit &SU = SUnits[I];
      dumpNode(SU);
    }
  });

  int maxASAP = 0;
  // Compute ASAP and ZeroLatencyDepth.
  // Walk in topological order so every predecessor is computed first.
  for (int I : Topo) {
    int asap = 0;
    int zeroLatencyDepth = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &P : SU->Preds) {
      SUnit *pred = P.getSUnit();
      if (P.getLatency() == 0)
        zeroLatencyDepth =
            std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
      if (ignoreDependence(P, true))
        continue;
      // Loop-carried edges (distance > 0) are relaxed by multiples of MII.
      asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
                                  getDistance(pred, SU, P) * MII));
    }
    maxASAP = std::max(maxASAP, asap);
    ScheduleInfo[I].ASAP = asap;
    ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
  }

  // Compute ALAP, ZeroLatencyHeight, and MOV.
  // Walk in reverse topological order so every successor is computed first.
  for (int I : llvm::reverse(Topo)) {
    int alap = maxASAP;
    int zeroLatencyHeight = 0;
    SUnit *SU = &SUnits[I];
    for (const SDep &S : SU->Succs) {
      SUnit *succ = S.getSUnit();
      if (succ->isBoundaryNode())
        continue;
      if (S.getLatency() == 0)
        zeroLatencyHeight =
            std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
      if (ignoreDependence(S, true))
        continue;
      alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
                                  getDistance(SU, succ, S) * MII));
    }

    ScheduleInfo[I].ALAP = alap;
    ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
  }

  // After computing the node functions, compute the summary for each node set.
  for (NodeSet &I : NodeSets)
    I.computeNodeSetInfo(this);

  LLVM_DEBUG({
    for (unsigned i = 0; i < SUnits.size(); i++) {
      dbgs() << "\tNode " << i << ":\n";
      dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
      dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
      dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
      dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
      dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
      dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
      dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
    }
  });
}
1904
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such predecessor exists.
                   const NodeSet *S = nullptr) {
  Preds.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Pred : SU->Preds) {
      // If node-set S is given, restrict the result to members of S.
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Pred, true))
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Preds.insert(Pred.getSUnit());
    }
    // Back-edges are predecessors with an anti-dependence.
    for (const SDep &Succ : SU->Succs) {
      if (Succ.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Preds.insert(Succ.getSUnit());
    }
  }
  return !Preds.empty();
}
1933
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such successor exists.
                   const NodeSet *S = nullptr) {
  Succs.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Succ : SU->Succs) {
      // If node-set S is given, restrict the result to members of S.
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Succ, false))
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Succs.insert(Succ.getSUnit());
    }
    // Back-edges are treated as successors via their anti-dependence.
    for (const SDep &Pred : SU->Preds) {
      if (Pred.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Succs.insert(Pred.getSUnit());
    }
  }
  return !Succs.empty();
}
1961
1962/// Return true if there is a path from the specified node to any of the nodes
1963/// in DestNodes. Keep track and return the nodes in any path.
1964static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1965 SetVector<SUnit *> &DestNodes,
1966 SetVector<SUnit *> &Exclude,
1967 SmallPtrSet<SUnit *, 8> &Visited) {
1968 if (Cur->isBoundaryNode())
1969 return false;
1970 if (Exclude.contains(Cur))
1971 return false;
1972 if (DestNodes.contains(Cur))
1973 return true;
1974 if (!Visited.insert(Cur).second)
1975 return Path.contains(Cur);
1976 bool FoundPath = false;
1977 for (auto &SI : Cur->Succs)
1978 if (!ignoreDependence(SI, false))
1979 FoundPath |=
1980 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1981 for (auto &PI : Cur->Preds)
1982 if (PI.getKind() == SDep::Anti)
1983 FoundPath |=
1984 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1985 if (FoundPath)
1986 Path.insert(Cur);
1987 return FoundPath;
1988}
1989
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
                            NodeSet &NS) {
  // Collect every register used within the node-set (phi uses are
  // loop-carried and intentionally excluded).
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
          Uses.insert(Unit);
    }
  }
  // Any non-dead definition that is never consumed inside the node-set is
  // treated as live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.push_back(
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
2029
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Visit the instructions bottom-up, in descending NodeNum order.
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      // Mark the node-set as soon as any instruction exceeds a pressure set.
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
2073
/// A heuristic to colocate node sets that have the same set of
/// successors.
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
  unsigned Colocate = 0;
  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    NodeSet &N1 = NodeSets[i];
    if (N1.empty() || !succ_L(N1, S1))
      continue;
    for (int j = i + 1; j < e; ++j) {
      NodeSet &N2 = NodeSets[j];
      // Only pair node-sets with the same recurrence MII.
      if (N1.compareRecMII(N2) != 0)
        continue;
      if (N2.empty() || !succ_L(N2, S2))
        continue;
      // Subset test plus equal size means the successor sets are identical.
      if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
        // Tag both sets with the same colocation id so the scheduler
        // tries to place them near each other.
        N1.setColocate(++Colocate);
        N2.setColocate(Colocate);
        break;
      }
    }
  }
}
2098
2099/// Check if the existing node-sets are profitable. If not, then ignore the
2100/// recurrent node-sets, and attempt to schedule all nodes together. This is
2101/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2102/// then it's best to try to schedule all instructions together instead of
2103/// starting with the recurrent node-sets.
2104void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2105 // Look for loops with a large MII.
2106 if (MII < 17)
2107 return;
2108 // Check if the node-set contains only a simple add recurrence.
2109 for (auto &NS : NodeSets) {
2110 if (NS.getRecMII() > 2)
2111 return;
2112 if (NS.getMaxDepth() > MII)
2113 return;
2114 }
2115 NodeSets.clear();
2116 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2117}
2118
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N)) {
      for (SUnit *NI : N) {
        // Each path search starts from a clean visited set.
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited);
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N)) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited);
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Everything in this (possibly grown) node-set is now accounted for.
    NodesAdded.insert(I.begin(), I.end());
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  if (succ_L(NodesAdded, N))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a node
  // in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new nodes sets with the connected nodes any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2182
2183/// Add the node to the set, and add all of its connected nodes to the set.
2184void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2185 SetVector<SUnit *> &NodesAdded) {
2186 NewSet.insert(SU);
2187 NodesAdded.insert(SU);
2188 for (auto &SI : SU->Succs) {
2189 SUnit *Successor = SI.getSUnit();
2190 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2191 NodesAdded.count(Successor) == 0)
2192 addConnectedNodes(Successor, NewSet, NodesAdded);
2193 }
2194 for (auto &PI : SU->Preds) {
2195 SUnit *Predecessor = PI.getSUnit();
2196 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2197 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2198 }
2199}
2200
/// Return true if Set1 contains elements in Set2. The elements in common
/// are returned in a different container.
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
  // Rebuild Result from scratch so it holds exactly the intersection.
  Result.clear();
  for (SUnit *SU : Set1) {
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
}
2212
2213/// Merge the recurrence node sets that have the same initial node.
2214void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2215 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2216 ++I) {
2217 NodeSet &NI = *I;
2218 for (NodeSetType::iterator J = I + 1; J != E;) {
2219 NodeSet &NJ = *J;
2220 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2221 if (NJ.compareRecMII(NI) > 0)
2222 NI.setRecMII(NJ.getRecMII());
2223 for (SUnit *SU : *J)
2224 I->insert(SU);
2225 NodeSets.erase(J);
2226 E = NodeSets.end();
2227 } else {
2228 ++J;
2229 }
2230 }
2231 }
2232}
2233
2234/// Remove nodes that have been scheduled in previous NodeSets.
2235void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2236 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2237 ++I)
2238 for (NodeSetType::iterator J = I + 1; J != E;) {
2239 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2240
2241 if (J->empty()) {
2242 NodeSets.erase(J);
2243 E = NodeSets.end();
2244 } else {
2245 ++J;
2246 }
2247 }
2248}
2249
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    // Pick the initial direction and the seed worklist R based on how this
    // node-set connects to the nodes already placed in NodeOrder.
    if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
      R.insert(N.begin(), N.end());
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
      R.insert(N.begin(), N.end());
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // A single node-set: start bottom-up from the leaf nodes.
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    // Alternate between top-down and bottom-up passes until every node of
    // the set has been placed into NodeOrder.
    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                         getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Make the selected node's unordered successors (within this
          // node-set) ready.
          for (const auto &I : maxHeight->Succs) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            if (ignoreDependence(I, false))
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxHeight->Preds) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
        }
        // Top-down pass exhausted; reseed from the ordered nodes'
        // predecessors and switch direction.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
        if (pred_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // If this node was flagged for excess register pressure, restart
          // top-down from the node-set's entry node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          for (const auto &I : maxDepth->Preds) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxDepth->Succs) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
        }
        // Bottom-up pass exhausted; reseed from the ordered nodes'
        // successors and switch direction.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
        if (succ_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2402
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

  if (NodeOrder.empty()){
    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
    return false;
  }

  bool scheduleFound = false;
  // Optional detector for schedules likely to introduce spills/fills.
  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
  if (LimitRegPressure) {
    HRPDetector =
        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
    HRPDetector->init(RegClassInfo);
  }
  // Keep increasing II until a valid schedule is found.
  for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
    Schedule.reset();
    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    SetVector<SUnit *>::iterator NI = NodeOrder.begin();
    SetVector<SUnit *>::iterator NE = NodeOrder.end();
    do {
      SUnit *SU = *NI;

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
      LLVM_DEBUG({
        dbgs() << "\n";
        dbgs() << "Inst (" << SU->NodeNum << ") ";
        SU->getInstr()->dump();
        dbgs() << "\n";
      });
      LLVM_DEBUG(
          dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));

      // Choose the search window and direction based on which bounds were
      // actually constrained by already-scheduled neighbors.
      if (EarlyStart > LateStart)
        scheduleFound = false;
      else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
        scheduleFound =
            Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
      else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
        scheduleFound =
            Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
      else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
        LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
        // When scheduling a Phi it is better to start at the late cycle and
        // go backwards. The default order may insert the Phi too far away
        // from its first dependence.
        // Also, do backward search when all scheduled predecessors are
        // loop-carried output/order dependencies. Empirically, there are also
        // cases where scheduling becomes possible with backward search.
        if (SU->getInstr()->isPHI() ||
            Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this))
          scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
        else
          scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
      } else {
        // No constraints at all: fall back to the node's ASAP slot.
        int FirstCycle = Schedule.getFirstCycle();
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
                                        FirstCycle + getASAP(SU) + II - 1, II);
      }

      // Even if we find a schedule, make sure the schedule doesn't exceed the
      // allowable number of stages. We keep trying if this happens.
      if (scheduleFound)
        if (SwpMaxStages > -1 &&
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
          scheduleFound = false;

      LLVM_DEBUG({
        if (!scheduleFound)
          dbgs() << "\tCan't schedule\n";
      });
    } while (++NI != NE && scheduleFound);

    // If a schedule is found, ensure non-pipelined instructions are in stage 0
    if (scheduleFound)
      scheduleFound =
          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);

    // If a schedule is found, check if it is a valid schedule too.
    if (scheduleFound)
      scheduleFound = Schedule.isValidSchedule(this);

    // If a schedule was found and the option is enabled, check if the schedule
    // might generate additional register spills/fills.
    if (scheduleFound && LimitRegPressure)
      scheduleFound =
          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
  }

  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
                    << " (II=" << Schedule.getInitiationInterval()
                    << ")\n");

  if (scheduleFound) {
    // Give the target a final chance to reject the computed schedule.
    scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
    if (!scheduleFound)
      LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
  }

  if (scheduleFound) {
    Schedule.finalizeSchedule(this);
    Pass.ORE->emit([&]() {
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Schedule found with Initiation Interval: "
             << ore::NV("II", Schedule.getInitiationInterval())
             << ", MaxStageCount: "
             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
    });
  } else
    Schedule.reset();

  // Only pipeline when the schedule actually overlaps iterations.
  return scheduleFound && Schedule.getMaxStageCount() > 0;
}
2525
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) const {
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  if (!BaseOp->isReg())
    return false;

  Register BaseReg = BaseOp->getReg();

  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  // NOTE(review): with D initialized to 0, the `D >= 0` conjunct means this
  // only bails out when getIncrementValue() fails and leaves D non-negative;
  // confirm the intended treatment of negative increments.
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}
2562
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
/// v1 = Phi(v0, v3)
/// v2 = load v1, 0
/// v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              unsigned &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  Register BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction.
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration.
  // The rewritten offset is checked on a throwaway clone (NewMI), which is
  // deleted again after the disjointness query.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.deleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return value once we determine that we return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}
2623
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
                                     SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
  // Look up whether an earlier analysis recorded a base/offset rewrite for
  // this instruction.
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    // Compare the stage/cycle of the base register's loop definition with
    // this instruction's placement in the final schedule.
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    if (BaseStageNum < DefStageNum) {
      // The use was scheduled in an earlier stage than its definition, so
      // adjust the offset by the per-stage increment.
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Swap the rewritten instruction in and remember the original so it
      // can be restored later.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
2659
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Guard against Phi cycles: stop if we revisit a Phi.
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs starting at index 1; follow
    // the incoming value from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
2677
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
                                        bool isSucc) const {
  // Only order and output dependences can be loop carried here; artificial
  // edges and boundary nodes never are.
  if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
    return false;

    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Dep.getKind() == SDep::Output)
    return true;

  // Orient SI as the source and DI as the destination of the dependence.
  MachineInstr *SI = Source->getInstr();
  MachineInstr *DI = Dep.getSUnit()->getInstr();
  if (!isSucc)
    std::swap(SI, DI);
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Only pairs of memory operations can carry a dependence across iterations.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  unsigned DeltaS, DeltaD;
  if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
    return true;

  const MachineOperand *BaseOpS, *BaseOpD;
  int64_t OffsetS, OffsetD;
  bool OffsetSIsScalable, OffsetDIsScalable;
  if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
                                    TRI) ||
      !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
                                    TRI))
    return true;

  assert(!OffsetSIsScalable && !OffsetDIsScalable &&
         "Expected offsets to be byte offsets");

  // Both base registers must be defined by Phis in the loop header; otherwise
  // give the conservative answer.
  MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
  MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
  if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
    return true;

  unsigned InitValS = 0;
  unsigned LoopValS = 0;
  unsigned InitValD = 0;
  unsigned LoopValD = 0;
  getPhiRegs(*DefS, BB, InitValS, LoopValS);
  getPhiRegs(*DefD, BB, InitValD, LoopValD);
  MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
  MachineInstr *InitDefD = MRI.getVRegDef(InitValD);

  // Both bases must start from the same value on loop entry.
  if (!InitDefS->isIdenticalTo(*InitDefD))
    return true;

  // Check that the base register is incremented by a constant value for each
  // iteration.
  MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
  int D = 0;
  if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
    return true;

  LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
  LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
    return true;

  // The strides must match and be large enough that each access stays within
  // its own iteration's slot.
  if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
      DeltaD < AccessSizeD.getValue())
    return true;

  return (OffsetS + (int64_t)AccessSizeS.getValue() <
          OffsetD + (int64_t)AccessSizeD.getValue());
}
2767
2768void SwingSchedulerDAG::postProcessDAG() {
2769 for (auto &M : Mutations)
2770 M->apply(this);
2771}
2772
2773/// Try to schedule the node at the specified StartCycle and continue
2774/// until the node is schedule or the EndCycle is reached. This function
2775/// returns true if the node is scheduled. This routine may search either
2776/// forward or backward for a place to insert the instruction based upon
2777/// the relative values of StartCycle and EndCycle.
2778bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2779 bool forward = true;
2780 LLVM_DEBUG({
2781 dbgs() << "Trying to insert node between " << StartCycle << " and "
2782 << EndCycle << " II: " << II << "\n";
2783 });
2784 if (StartCycle > EndCycle)
2785 forward = false;
2786
2787 // The terminating condition depends on the direction.
2788 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2789 for (int curCycle = StartCycle; curCycle != termCycle;
2790 forward ? ++curCycle : --curCycle) {
2791
2792 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2793 ProcItinResources.canReserveResources(*SU, curCycle)) {
2794 LLVM_DEBUG({
2795 dbgs() << "\tinsert at cycle " << curCycle << " ";
2796 SU->getInstr()->dump();
2797 });
2798
2799 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2800 ProcItinResources.reserveResources(*SU, curCycle);
2801 ScheduledInstrs[curCycle].push_back(SU);
2802 InstrToCycle.insert(std::make_pair(SU, curCycle));
2803 if (curCycle > LastCycle)
2804 LastCycle = curCycle;
2805 if (curCycle < FirstCycle)
2806 FirstCycle = curCycle;
2807 return true;
2808 }
2809 LLVM_DEBUG({
2810 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2811 SU->getInstr()->dump();
2812 });
2813 }
2814 return false;
2815}
2816
// Return the cycle of the earliest scheduled instruction in the chain.
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  // Worklist walk over the chain of order/output predecessors, tracking the
  // minimum scheduled cycle seen.
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSUnit();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    // Unscheduled instructions do not contribute a cycle.
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    // Only order and output dependences extend the chain.
    for (const auto &PI : PrevSU->Preds)
      if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
        Worklist.push_back(PI);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
2839
// Return the cycle of the latest scheduled instruction in the chain.
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  // Worklist walk over the chain of order/output successors, tracking the
  // maximum scheduled cycle seen. Boundary nodes are skipped.
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getSUnit();
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    // Unscheduled instructions do not contribute a cycle.
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    // Only order and output dependences extend the chain.
    for (const auto &SI : SuccSU->Succs)
      if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
        Worklist.push_back(SI);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
2862
/// If an instruction has a use that spans multiple iterations, then
/// return true. These instructions are characterized by having a back-edge
/// to a Phi, which contains a reference to another Phi.
  // Look for a back-edge predecessor that is a Phi and in turn feeds another
  // Phi through a data edge; return that predecessor Phi if found.
  for (auto &P : SU->Preds)
    if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
2874
2875/// Compute the scheduling start slot for the instruction. The start slot
2876/// depends on any predecessor or successor nodes scheduled already.
2877void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2878 int II, SwingSchedulerDAG *DAG) {
2879 // Iterate over each instruction that has been scheduled already. The start
2880 // slot computation depends on whether the previously scheduled instruction
2881 // is a predecessor or successor of the specified instruction.
2882 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2883
2884 // Iterate over each instruction in the current cycle.
2885 for (SUnit *I : getInstructions(cycle)) {
2886 // Because we're processing a DAG for the dependences, we recognize
2887 // the back-edge in recurrences by anti dependences.
2888 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2889 const SDep &Dep = SU->Preds[i];
2890 if (Dep.getSUnit() == I) {
2891 if (!DAG->isBackedge(SU, Dep)) {
2892 int EarlyStart = cycle + Dep.getLatency() -
2893 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2894 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2895 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2896 int End = earliestCycleInChain(Dep) + (II - 1);
2897 *MinLateStart = std::min(*MinLateStart, End);
2898 }
2899 } else {
2900 int LateStart = cycle - Dep.getLatency() +
2901 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2902 *MinLateStart = std::min(*MinLateStart, LateStart);
2903 }
2904 }
2905 // For instruction that requires multiple iterations, make sure that
2906 // the dependent instruction is not scheduled past the definition.
2907 SUnit *BE = multipleIterations(I, DAG);
2908 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2909 !SU->isPred(I))
2910 *MinLateStart = std::min(*MinLateStart, cycle);
2911 }
2912 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2913 if (SU->Succs[i].getSUnit() == I) {
2914 const SDep &Dep = SU->Succs[i];
2915 if (!DAG->isBackedge(SU, Dep)) {
2916 int LateStart = cycle - Dep.getLatency() +
2917 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2918 *MinLateStart = std::min(*MinLateStart, LateStart);
2919 if (DAG->isLoopCarriedDep(SU, Dep)) {
2920 int Start = latestCycleInChain(Dep) + 1 - II;
2921 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
2922 }
2923 } else {
2924 int EarlyStart = cycle + Dep.getLatency() -
2925 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2926 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2927 }
2928 }
2929 }
2930 }
2931 }
2932}
2933
2934/// Order the instructions within a cycle so that the definitions occur
2935/// before the uses. Returns true if the instruction is added to the start
2936/// of the list, or false if added to the end.
// NOTE(review): the extraction dropped original line 2937 here — the
// hyperlinked signature of SMSchedule::orderDependence(SSD, SU, Insts).
2938 std::deque<SUnit *> &Insts) const {
2939 MachineInstr *MI = SU->getInstr();
// Placement flags for SU relative to the instructions already in Insts,
// plus the deque positions of the conflicting use/def that triggered them.
2940 bool OrderBeforeUse = false;
2941 bool OrderAfterDef = false;
2942 bool OrderBeforeDef = false;
2943 unsigned MoveDef = 0;
2944 unsigned MoveUse = 0;
2945 int StageInst1 = stageScheduled(SU);
2946
// Walk every instruction already placed in this cycle and record, per
// register operand of MI, whether SU must precede or follow it.
2947 unsigned Pos = 0;
2948 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2949 ++I, ++Pos) {
2950 for (MachineOperand &MO : MI->operands()) {
2951 if (!MO.isReg() || !MO.getReg().isVirtual())
2952 continue;
2953
2954 Register Reg = MO.getReg();
2955 unsigned BasePos, OffsetPos;
// If MI's base register was rewritten by the schedule, compare against
// the replacement register instead of the original one.
2956 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2957 if (MI->getOperand(BasePos).getReg() == Reg)
2958 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2959 Reg = NewReg;
2960 bool Reads, Writes;
2961 std::tie(Reads, Writes) =
2962 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2963 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2964 OrderBeforeUse = true;
2965 if (MoveUse == 0)
2966 MoveUse = Pos;
2967 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2968 // Add the instruction after the scheduled instruction.
2969 OrderAfterDef = true;
2970 MoveDef = Pos;
2971 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2972 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2973 OrderBeforeUse = true;
2974 if (MoveUse == 0)
2975 MoveUse = Pos;
2976 } else {
2977 OrderAfterDef = true;
2978 MoveDef = Pos;
2979 }
2980 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2981 OrderBeforeUse = true;
2982 if (MoveUse == 0)
2983 MoveUse = Pos;
2984 if (MoveUse != 0) {
2985 OrderAfterDef = true;
2986 MoveDef = Pos - 1;
2987 }
2988 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2989 // Add the instruction before the scheduled instruction.
2990 OrderBeforeUse = true;
2991 if (MoveUse == 0)
2992 MoveUse = Pos;
2993 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2994 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2995 if (MoveUse == 0) {
2996 OrderBeforeDef = true;
2997 MoveUse = Pos;
2998 }
2999 }
3000 }
3001 // Check for order dependences between instructions. Make sure the source
3002 // is ordered before the destination.
3003 for (auto &S : SU->Succs) {
3004 if (S.getSUnit() != *I)
3005 continue;
3006 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3007 OrderBeforeUse = true;
3008 if (Pos < MoveUse)
3009 MoveUse = Pos;
3010 }
3011 // We did not handle HW dependences in previous for loop,
3012 // and we normally set Latency = 0 for Anti/Output deps,
3013 // so may have nodes in same cycle with Anti/Output dependent on HW regs.
3014 else if ((S.getKind() == SDep::Anti || S.getKind() == SDep::Output) &&
3015 stageScheduled(*I) == StageInst1) {
3016 OrderBeforeUse = true;
3017 if ((MoveUse == 0) || (Pos < MoveUse))
3018 MoveUse = Pos;
3019 }
3020 }
3021 for (auto &P : SU->Preds) {
3022 if (P.getSUnit() != *I)
3023 continue;
3024 if ((P.getKind() == SDep::Order || P.getKind() == SDep::Anti ||
3025 P.getKind() == SDep::Output) &&
3026 stageScheduled(*I) == StageInst1) {
3027 OrderAfterDef = true;
3028 MoveDef = Pos;
3029 }
3030 }
3031 }
3032
3033 // A circular dependence.
3034 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3035 OrderBeforeUse = false;
3036
3037 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3038 // to a loop-carried dependence.
3039 if (OrderBeforeDef)
3040 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3041
3042 // The uncommon case when the instruction order needs to be updated because
3043 // there is both a use and def.
// Remove the conflicting use and def from the deque and recursively
// re-insert them (use, then SU, then def) so all three are re-ordered.
3044 if (OrderBeforeUse && OrderAfterDef) {
3045 SUnit *UseSU = Insts.at(MoveUse);
3046 SUnit *DefSU = Insts.at(MoveDef);
// Erase the higher index first so the lower index stays valid.
3047 if (MoveUse > MoveDef) {
3048 Insts.erase(Insts.begin() + MoveUse);
3049 Insts.erase(Insts.begin() + MoveDef);
3050 } else {
3051 Insts.erase(Insts.begin() + MoveDef);
3052 Insts.erase(Insts.begin() + MoveUse);
3053 }
3054 orderDependence(SSD, UseSU, Insts);
3055 orderDependence(SSD, SU, Insts);
3056 orderDependence(SSD, DefSU, Insts);
3057 return;
3058 }
3059 // Put the new instruction first if there is a use in the list. Otherwise,
3060 // put it at the end of the list.
3061 if (OrderBeforeUse)
3062 Insts.push_front(SU);
3063 else
3064 Insts.push_back(SU);
3065}
3066
3067/// Return true if the scheduled Phi has a loop carried operand.
// NOTE(review): original line 3068 (the hyperlinked signature of
// SMSchedule::isLoopCarried(const SwingSchedulerDAG *, MachineInstr &))
// was dropped by the extraction.
3069 MachineInstr &Phi) const {
3070 if (!Phi.isPHI())
3071 return false;
3072 assert(Phi.isPHI() && "Expecting a Phi.");
// Cycle/stage where the Phi itself is scheduled.
3073 SUnit *DefSU = SSD->getSUnit(&Phi);
3074 unsigned DefCycle = cycleScheduled(DefSU);
3075 int DefStage = stageScheduled(DefSU);
3076
// Find the instruction that defines the Phi's loop-carried operand.
3077 unsigned InitVal = 0;
3078 unsigned LoopVal = 0;
3079 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3080 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
// Conservatively treat an unscheduled or Phi-defined loop value as
// loop carried.
3081 if (!UseSU)
3082 return true;
3083 if (UseSU->getInstr()->isPHI())
3084 return true;
3085 unsigned LoopCycle = cycleScheduled(UseSU);
3086 int LoopStage = stageScheduled(UseSU);
// Carried if the loop value is produced after the Phi's cycle, or not
// in a strictly later stage than the Phi.
3087 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3088}
3089
3090/// Return true if the instruction is a definition that is loop carried
3091/// and defines the use on the next iteration.
3092/// v1 = phi(v2, v3)
3093/// (Def) v3 = op v1
3094/// (MO) = v1
3095/// If MO appears before Def, then v1 and v3 may get assigned to the same
3096/// register.
// NOTE(review): original line 3097 (the hyperlinked signature of
// SMSchedule::isLoopCarriedDefOfUse(SSD, Def, MO)) was dropped by the
// extraction.
3098 MachineInstr *Def,
3099 MachineOperand &MO) const {
3100 if (!MO.isReg())
3101 return false;
3102 if (Def->isPHI())
3103 return false;
// MO must be fed by a Phi in the same block as Def, and that Phi must
// itself be loop carried.
3104 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3105 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3106 return false;
3107 if (!isLoopCarried(SSD, *Phi))
3108 return false;
// Def qualifies if one of its defs is the Phi's loop-carried input
// (v3 in the example above).
3109 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3110 for (MachineOperand &DMO : Def->all_defs()) {
3111 if (DMO.getReg() == LoopReg)
3112 return true;
3113 }
3114 return false;
3115}
3116
3117/// Return true if all scheduled predecessors are loop-carried output/order
3118/// dependencies.
// NOTE(review): original line 3119 (the hyperlinked signature) was dropped
// by the extraction.
3120 SUnit *SU, SwingSchedulerDAG *DAG) const {
// A scheduled predecessor reached by a forward (non-backedge) edge, or a
// scheduled successor reached via a backedge, disqualifies SU.
3121 for (const SDep &Pred : SU->Preds)
3122 if (InstrToCycle.count(Pred.getSUnit()) && !DAG->isBackedge(SU, Pred))
3123 return false;
3124 for (const SDep &Succ : SU->Succs)
3125 if (InstrToCycle.count(Succ.getSUnit()) && DAG->isBackedge(SU, Succ))
3126 return false;
3127 return true;
3128}
3129
3130/// Determine transitive dependences of unpipelineable instructions
// NOTE(review): original lines 3131-3132 (the hyperlinked signature of this
// static helper) were dropped by the extraction.
3133 SmallSet<SUnit *, 8> DoNotPipeline;
3134 SmallVector<SUnit *, 8> Worklist;
3135
// Seed the worklist with instructions the target asks us to ignore.
3136 for (auto &SU : SSD->SUnits)
3137 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3138 Worklist.push_back(&SU);
3139
// Transitively add all predecessors; for Phis also follow Anti successor
// edges so the whole loop-carried cycle stays unpipelined.
3140 while (!Worklist.empty()) {
3141 auto SU = Worklist.pop_back_val();
3142 if (DoNotPipeline.count(SU))
3143 continue;
3144 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3145 DoNotPipeline.insert(SU);
3146 for (auto &Dep : SU->Preds)
3147 Worklist.push_back(Dep.getSUnit());
3148 if (SU->getInstr()->isPHI())
3149 for (auto &Dep : SU->Succs)
3150 if (Dep.getKind() == SDep::Anti)
3151 Worklist.push_back(Dep.getSUnit());
3152 }
3153 return DoNotPipeline;
3154}
3155
3156// Determine all instructions upon which any unpipelineable instruction depends
3157// and ensure that they are in stage 0. If unable to do so, return false.
// NOTE(review): original lines 3158-3160 (the hyperlinked signature and the
// computation of the do-not-pipeline set `DNP`) were dropped by the
// extraction.
3161
3162 int NewLastCycle = INT_MIN;
3163 for (SUnit &SU : SSD->SUnits) {
3164 if (!SU.isInstr())
3165 continue;
// Already-pipelineable or stage-0 instructions keep their cycle.
3166 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3167 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3168 continue;
3169 }
3170
3171 // Put the non-pipelined instruction as early as possible in the schedule
3172 int NewCycle = getFirstCycle();
3173 for (auto &Dep : SU.Preds)
3174 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3175
// Move SU from its old cycle's instruction list into the new cycle.
3176 int OldCycle = InstrToCycle[&SU];
3177 if (OldCycle != NewCycle) {
3178 InstrToCycle[&SU] = NewCycle;
3179 auto &OldS = getInstructions(OldCycle);
3180 llvm::erase(OldS, &SU);
3181 getInstructions(NewCycle).emplace_back(&SU);
3182 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3183 << ") is not pipelined; moving from cycle " << OldCycle
3184 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3185 }
3186 NewLastCycle = std::max(NewLastCycle, NewCycle);
3187 }
3188 LastCycle = NewLastCycle;
3189 return true;
3190}
3191
3192// Check if the generated schedule is valid. This function checks if
3193// an instruction that uses a physical register is scheduled in a
3194// different stage than the definition. The pipeliner does not handle
3195// physical register values that may cross a basic block boundary.
3196// Furthermore, if a physical def/use pair is assigned to the same
3197// cycle, orderDependence does not guarantee def/use ordering, so that
3198// case should be considered invalid. (The test checks for both
3199// earlier and same-cycle use to be more robust.)
// NOTE(review): original line 3200 (the hyperlinked signature) was dropped
// by the extraction.
3201 for (SUnit &SU : SSD->SUnits) {
3202 if (!SU.hasPhysRegDefs)
3203 continue;
3204 int StageDef = stageScheduled(&SU);
3205 int CycleDef = InstrToCycle[&SU];
3206 assert(StageDef != -1 && "Instruction should have been scheduled.");
// Every physical-register successor must be in the same stage and a
// strictly later cycle than the definition.
3207 for (auto &SI : SU.Succs)
3208 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3209 if (Register::isPhysicalRegister(SI.getReg())) {
3210 if (stageScheduled(SI.getSUnit()) != StageDef)
3211 return false;
3212 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3213 return false;
3214 }
3215 }
3216 return true;
3217}
3218
3219/// A property of the node order in swing-modulo-scheduling is
3220/// that for nodes outside circuits the following holds:
3221/// none of them is scheduled after both a successor and a
3222/// predecessor.
3223/// The method below checks whether the property is met.
3224/// If not, debug information is printed and statistics information updated.
3225/// Note that we do not use an assert statement.
3226/// The reason is that although an invalid node oder may prevent
3227/// the pipeliner from finding a pipelined schedule for arbitrary II,
3228/// it does not lead to the generation of incorrect code.
3229void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3230
3231 // a sorted vector that maps each SUnit to its index in the NodeOrder
3232 typedef std::pair<SUnit *, unsigned> UnitIndex;
3233 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3234
3235 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3236 Indices.push_back(std::make_pair(NodeOrder[i], i));
3237
3238 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3239 return std::get<0>(i1) < std::get<0>(i2);
3240 };
3241
3242 // sort, so that we can perform a binary search
3243 llvm::sort(Indices, CompareKey);
3244
3245 bool Valid = true;
3246 (void)Valid;
3247 // for each SUnit in the NodeOrder, check whether
3248 // it appears after both a successor and a predecessor
3249 // of the SUnit. If this is the case, and the SUnit
3250 // is not part of circuit, then the NodeOrder is not
3251 // valid.
3252 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3253 SUnit *SU = NodeOrder[i];
3254 unsigned Index = i;
3255
3256 bool PredBefore = false;
3257 bool SuccBefore = false;
3258
3259 SUnit *Succ;
3260 SUnit *Pred;
3261 (void)Succ;
3262 (void)Pred;
3263
3264 for (SDep &PredEdge : SU->Preds) {
3265 SUnit *PredSU = PredEdge.getSUnit();
3266 unsigned PredIndex = std::get<1>(
3267 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3268 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3269 PredBefore = true;
3270 Pred = PredSU;
3271 break;
3272 }
3273 }
3274
3275 for (SDep &SuccEdge : SU->Succs) {
3276 SUnit *SuccSU = SuccEdge.getSUnit();
3277 // Do not process a boundary node, it was not included in NodeOrder,
3278 // hence not in Indices either, call to std::lower_bound() below will
3279 // return Indices.end().
3280 if (SuccSU->isBoundaryNode())
3281 continue;
3282 unsigned SuccIndex = std::get<1>(
3283 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3284 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3285 SuccBefore = true;
3286 Succ = SuccSU;
3287 break;
3288 }
3289 }
3290
3291 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3292 // instructions in circuits are allowed to be scheduled
3293 // after both a successor and predecessor.
3294 bool InCircuit = llvm::any_of(
3295 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3296 if (InCircuit)
3297 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3298 else {
3299 Valid = false;
3300 NumNodeOrderIssues++;
3301 LLVM_DEBUG(dbgs() << "Predecessor ");
3302 }
3303 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3304 << " are scheduled before node " << SU->NodeNum
3305 << "\n");
3306 }
3307 }
3308
3309 LLVM_DEBUG({
3310 if (!Valid)
3311 dbgs() << "Invalid node order found!\n";
3312 });
3313}
3314
3315/// Attempt to fix the degenerate cases when the instruction serialization
3316/// causes the register lifetimes to overlap. For example,
3317/// p' = store_pi(p, b)
3318/// = load p, offset
3319/// In this case p and p' overlap, which means that two registers are needed.
3320/// Instead, this function changes the load to use p' and updates the offset.
3321void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
// OverlapReg/NewBaseReg track the most recent p/p' pair seen in this
// serialized cycle; 0 means "no pending pair".
3322 unsigned OverlapReg = 0;
3323 unsigned NewBaseReg = 0;
3324 for (SUnit *SU : Instrs) {
3325 MachineInstr *MI = SU->getInstr();
3326 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3327 const MachineOperand &MO = MI->getOperand(i);
3328 // Look for an instruction that uses p. The instruction occurs in the
3329 // same cycle but occurs later in the serialized order.
3330 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3331 // Check that the instruction appears in the InstrChanges structure,
3332 // which contains instructions that can have the offset updated.
// NOTE(review): original line 3333 (the hyperlinked declaration of the
// iterator `It` from the InstrChanges lookup) was dropped by the
// extraction.
3334 InstrChanges.find(SU);
3335 if (It != InstrChanges.end()) {
3336 unsigned BasePos, OffsetPos;
3337 // Update the base register and adjust the offset.
3338 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
// NOTE(review): original line 3339 (the hyperlinked creation of `NewMI`,
// a clone of MI) was dropped by the extraction.
3340 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3341 int64_t NewOffset =
3342 MI->getOperand(OffsetPos).getImm() - It->second.second;
3343 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3344 SU->setInstr(NewMI);
3345 MISUnitMap[NewMI] = SU;
3346 NewMIs[MI] = NewMI;
3347 }
3348 }
// The pending overlap pair has been handled (or cannot be); reset it.
3349 OverlapReg = 0;
3350 NewBaseReg = 0;
3351 break;
3352 }
3353 // Look for an instruction of the form p' = op(p), which uses and defines
3354 // two virtual registers that get allocated to the same physical register.
3355 unsigned TiedUseIdx = 0;
3356 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3357 // OverlapReg is p in the example above.
3358 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3359 // NewBaseReg is p' in the example above.
3360 NewBaseReg = MI->getOperand(i).getReg();
3361 break;
3362 }
3363 }
3364 }
3365}
3366
// Build a new ordering for one cycle: all Phis first (in their original
// order), followed by the non-Phi instructions ordered so that definitions
// precede uses (via orderDependence).
3367std::deque<SUnit *>
// NOTE(review): original line 3368 (the hyperlinked
// SMSchedule::reorderInstructions(...) part of the signature) was dropped
// by the extraction.
3369 const std::deque<SUnit *> &Instrs) const {
3370 std::deque<SUnit *> NewOrderPhi;
3371 for (SUnit *SU : Instrs) {
3372 if (SU->getInstr()->isPHI())
3373 NewOrderPhi.push_back(SU);
3374 }
3375 std::deque<SUnit *> NewOrderI;
3376 for (SUnit *SU : Instrs) {
3377 if (!SU->getInstr()->isPHI())
3378 orderDependence(SSD, SU, NewOrderI);
3379 }
3380 llvm::append_range(NewOrderPhi, NewOrderI);
3381 return NewOrderPhi;
3382}
3383
3384/// After the schedule has been formed, call this function to combine
3385/// the instructions from the different stages/cycles. That is, this
3386/// function creates a schedule that represents a single iteration.
// NOTE(review): original line 3387 (the hyperlinked signature of
// SMSchedule::finalizeSchedule) was dropped by the extraction.
3388 // Move all instructions to the first stage from later stages.
3389 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3390 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3391 ++stage) {
3392 std::deque<SUnit *> &cycleInstrs =
3393 ScheduledInstrs[cycle + (stage * InitiationInterval)];
// Reverse iteration + push_front preserves the stage's internal order
// while placing later stages ahead in the combined list.
3394 for (SUnit *SU : llvm::reverse(cycleInstrs))
3395 ScheduledInstrs[cycle].push_front(SU);
3396 }
3397 }
3398
3399 // Erase all the elements in the later stages. Only one iteration should
3400 // remain in the scheduled list, and it contains all the instructions.
3401 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3402 ScheduledInstrs.erase(cycle);
3403
3404 // Change the registers in instruction as specified in the InstrChanges
3405 // map. We need to use the new registers to create the correct order.
3406 for (const SUnit &SU : SSD->SUnits)
3407 SSD->applyInstrChange(SU.getInstr(), *this);
3408
3409 // Reorder the instructions in each cycle to fix and improve the
3410 // generated code.
3411 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3412 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3413 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3414 SSD->fixupRegisterOverlaps(cycleInstrs);
3415 }
3416
3417 LLVM_DEBUG(dump(););
3418}
3419
// Print a NodeSet summary (size, RecMII, max register movement, max depth,
// colocation) followed by one line per contained SUnit.
// NOTE(review): original line 3420 (the hyperlinked signature of
// NodeSet::print(raw_ostream &os)) was dropped by the extraction.
3421 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3422 << " depth " << MaxDepth << " col " << Colocate << "\n";
3423 for (const auto &I : Nodes)
3424 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3425 os << "\n";
3426}
3427
3428#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3429/// Print the schedule information to the given output.
// NOTE(review): original line 3430 (the hyperlinked signature of
// SMSchedule::print(raw_ostream &os)) was dropped by the extraction.
3431 // Iterate over each cycle.
3432 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3433 // Iterate over each instruction in the cycle.
3434 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
// One line per instruction: cycle, stage, node number, then the MI text.
3435 for (SUnit *CI : cycleInstrs->second) {
3436 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3437 os << "(" << CI->NodeNum << ") ";
3438 CI->getInstr()->print(os);
3439 os << "\n";
3440 }
3441 }
3442}
3443
3444/// Utility function used for debugging to print the schedule.
3447
3448void ResourceManager::dumpMRT() const {
3449 LLVM_DEBUG({
3450 if (UseDFA)
3451 return;
3452 std::stringstream SS;
3453 SS << "MRT:\n";
3454 SS << std::setw(4) << "Slot";
3455 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3456 SS << std::setw(3) << I;
3457 SS << std::setw(7) << "#Mops"
3458 << "\n";
3459 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3460 SS << std::setw(4) << Slot;
3461 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3462 SS << std::setw(3) << MRT[Slot][I];
3463 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3464 }
3465 dbgs() << SS.str();
3466 });
3467}
3468#endif
3469
// Assign a unique one-hot (units) or multi-bit (groups) mask to every
// processor resource kind in the scheduling model.
// NOTE(review): original line 3470 (the hyperlinked
// `void ResourceManager::initProcResourceVectors(` part of the signature)
// was dropped by the extraction.
3471 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3472 unsigned ProcResourceID = 0;
3473
3474 // We currently limit the resource kinds to 64 and below so that we can use
3475 // uint64_t for Masks
3476 assert(SM.getNumProcResourceKinds() < 64 &&
3477 "Too many kinds of resources, unsupported");
3478 // Create a unique bitmask for every processor resource unit.
3479 // Skip resource at index 0, since it always references 'InvalidUnit'.
3480 Masks.resize(SM.getNumProcResourceKinds());
3481 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): original line 3482 (the hyperlinked declaration of `Desc`
// from SM.getProcResource(I)) was dropped by the extraction.
3483 if (Desc.SubUnitsIdxBegin)
3484 continue;
3485 Masks[I] = 1ULL << ProcResourceID;
3486 ProcResourceID++;
3487 }
3488 // Create a unique bitmask for every processor resource group.
3489 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): original line 3490 (the hyperlinked declaration of `Desc`)
// was dropped by the extraction.
3491 if (!Desc.SubUnitsIdxBegin)
3492 continue;
// A group's mask is its own bit OR'ed with the masks of all its sub-units.
3493 Masks[I] = 1ULL << ProcResourceID;
3494 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3495 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3496 ProcResourceID++;
3497 }
3498 LLVM_DEBUG({
3499 if (SwpShowResMask) {
3500 dbgs() << "ProcResourceDesc:\n";
3501 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3502 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3503 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3504 ProcResource->Name, I, Masks[I],
3505 ProcResource->NumUnits);
3506 }
3507 dbgs() << " -----------------\n";
3508 }
3509 });
3510}
3511
// Return true if SU's resources can be booked at Cycle without overbooking.
// NOTE(review): original line 3512 (the hyperlinked signature of
// ResourceManager::canReserveResources(SUnit &SU, int Cycle)) was dropped
// by the extraction.
3513 LLVM_DEBUG({
3514 if (SwpDebugResource)
3515 dbgs() << "canReserveResources:\n";
3516 });
// DFA model: delegate to the packetizer for the II-wrapped cycle.
3517 if (UseDFA)
3518 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3519 ->canReserveResources(&SU.getInstr()->getDesc());
3520
3521 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
// Without a valid sched class we cannot model the cost; optimistically
// allow the placement.
3522 if (!SCDesc->isValid()) {
3523 LLVM_DEBUG({
3524 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3525 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3526 });
3527 return true;
3528 }
3529
// Trial reservation: book, test for overbooking, then roll back.
3530 reserveResources(SCDesc, Cycle);
3531 bool Result = !isOverbooked();
3532 unreserveResources(SCDesc, Cycle);
3533
3534 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3535 return Result;
3536}
3537
3538void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3539 LLVM_DEBUG({
3540 if (SwpDebugResource)
3541 dbgs() << "reserveResources:\n";
3542 });
3543 if (UseDFA)
3544 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3545 ->reserveResources(&SU.getInstr()->getDesc());
3546
3547 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3548 if (!SCDesc->isValid()) {
3549 LLVM_DEBUG({
3550 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3551 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3552 });
3553 return;
3554 }
3555
3556 reserveResources(SCDesc, Cycle);
3557
3558 LLVM_DEBUG({
3559 if (SwpDebugResource) {
3560 dumpMRT();
3561 dbgs() << "reserveResources: done!\n\n";
3562 }
3563 });
3564}
3565
3566void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3567 int Cycle) {
3568 assert(!UseDFA);
3569 for (const MCWriteProcResEntry &PRE : make_range(
3570 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3571 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3572 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3573
3574 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3575 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3576}
3577
3578void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3579 int Cycle) {
3580 assert(!UseDFA);
3581 for (const MCWriteProcResEntry &PRE : make_range(
3582 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3583 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3584 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3585
3586 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3587 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3588}
3589
// Return true if any slot of the modulo reservation table exceeds either a
// resource kind's unit count or the machine's issue width.
3590bool ResourceManager::isOverbooked() const {
3591 assert(!UseDFA);
3592 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3593 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): original line 3594 (the hyperlinked declaration of `Desc`
// from SM.getProcResource(I)) was dropped by the extraction.
3595 if (MRT[Slot][I] > Desc->NumUnits)
3596 return true;
3597 }
3598 if (NumScheduledMops[Slot] > IssueWidth)
3599 return true;
3600 }
3601 return false;
3602}
3603
3604int ResourceManager::calculateResMIIDFA() const {
3605 assert(UseDFA);
3606
3607 // Sort the instructions by the number of available choices for scheduling,
3608 // least to most. Use the number of critical resources as the tie breaker.
3609 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3610 for (SUnit &SU : DAG->SUnits)
3611 FUS.calcCriticalResources(*SU.getInstr());
// NOTE(review): original line 3612 (the hyperlinked PriorityQueue
// declaration that begins `FuncUnitOrder`) was dropped by the extraction.
3613 FuncUnitOrder(FUS);
3614
3615 for (SUnit &SU : DAG->SUnits)
3616 FuncUnitOrder.push(SU.getInstr());
3617
// NOTE(review): original line 3618 (the hyperlinked declaration of the
// `Resources` vector of DFAPacketizers) was dropped by the extraction.
3619 Resources.push_back(
3620 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3621
// Greedily pack each instruction, in priority order, into the existing
// per-cycle DFAs; grow the DFA list when an instruction does not fit.
3622 while (!FuncUnitOrder.empty()) {
3623 MachineInstr *MI = FuncUnitOrder.top();
3624 FuncUnitOrder.pop();
3625 if (TII->isZeroCost(MI->getOpcode()))
3626 continue;
3627
3628 // Attempt to reserve the instruction in an existing DFA. At least one
3629 // DFA is needed for each cycle.
3630 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3631 unsigned ReservedCycles = 0;
3632 auto *RI = Resources.begin();
3633 auto *RE = Resources.end();
3634 LLVM_DEBUG({
3635 dbgs() << "Trying to reserve resource for " << NumCycles
3636 << " cycles for \n";
3637 MI->dump();
3638 });
3639 for (unsigned C = 0; C < NumCycles; ++C)
3640 while (RI != RE) {
3641 if ((*RI)->canReserveResources(*MI)) {
3642 (*RI)->reserveResources(*MI);
3643 ++ReservedCycles;
3644 break;
3645 }
3646 RI++;
3647 }
3648 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3649 << ", NumCycles:" << NumCycles << "\n");
3650 // Add new DFAs, if needed, to reserve resources.
3651 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
// NOTE(review): original line 3652 (the hyperlinked start of the
// LLVM_DEBUG statement whose continuation follows) was dropped by the
// extraction.
3653 << "NewResource created to reserve resources"
3654 << "\n");
3655 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3656 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3657 NewResource->reserveResources(*MI);
3658 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3659 }
3660 }
3661
// The minimum II for resources is the number of DFAs (cycles) needed.
3662 int Resmii = Resources.size();
3663 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3664 return Resmii;
3665}
3666
// Compute the resource-constrained minimum initiation interval.
// NOTE(review): original line 3667 (the hyperlinked signature of
// ResourceManager::calculateResMII()) was dropped by the extraction.
3668 if (UseDFA)
3669 return calculateResMIIDFA();
3670
3671 // Count each resource consumption and divide it by the number of units.
3672 // ResMII is the max value among them.
3673
3674 int NumMops = 0;
// NOTE(review): original line 3675 (the hyperlinked declaration of the
// per-kind `ResourceCount` vector) was dropped by the extraction.
3676 for (SUnit &SU : DAG->SUnits) {
3677 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3678 continue;
3679
3680 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3681 if (!SCDesc->isValid())
3682 continue;
3683
3684 LLVM_DEBUG({
3685 if (SwpDebugResource) {
3686 DAG->dumpNode(SU);
3687 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3688 << " WriteProcRes: ";
3689 }
3690 });
// Accumulate micro-ops and per-resource-kind usage across the loop body.
3691 NumMops += SCDesc->NumMicroOps;
3692 for (const MCWriteProcResEntry &PRE :
3693 make_range(STI->getWriteProcResBegin(SCDesc),
3694 STI->getWriteProcResEnd(SCDesc))) {
3695 LLVM_DEBUG({
3696 if (SwpDebugResource) {
3697 const MCProcResourceDesc *Desc =
3698 SM.getProcResource(PRE.ProcResourceIdx);
3699 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3700 }
3701 });
3702 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3703 }
3704 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3705 }
3706
// Lower bound from the issue width (ceiling division of total mops).
3707 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3708 LLVM_DEBUG({
3709 if (SwpDebugResource)
3710 dbgs() << "#Mops: " << NumMops << ", "
3711 << "IssueWidth: " << IssueWidth << ", "
3712 << "Cycles: " << Result << "\n";
3713 });
3714
3715 LLVM_DEBUG({
3716 if (SwpDebugResource) {
3717 std::stringstream SS;
3718 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3719 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3720 << "\n";
3721 dbgs() << SS.str();
3722 }
3723 });
// Raise the bound to the most contended resource kind.
3724 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
// NOTE(review): original line 3725 (the hyperlinked declaration of `Desc`
// from SM.getProcResource(I)) was dropped by the extraction.
3726 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3727 LLVM_DEBUG({
3728 if (SwpDebugResource) {
3729 std::stringstream SS;
3730 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3731 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3732 << std::setw(10) << Cycles << "\n";
3733 dbgs() << SS.str();
3734 }
3735 });
3736 if (Cycles > Result)
3737 Result = Cycles;
3738 }
3739 return Result;
3740}
3741
// Reset the resource manager for a new initiation interval: rebuild one
// DFA per II slot and clear/resize the MRT bookkeeping.
// NOTE(review): original line 3742 (the hyperlinked signature of
// ResourceManager::init(int II)) was dropped by the extraction.
3743 InitiationInterval = II;
3744 DFAResources.clear();
3745 DFAResources.resize(II);
3746 for (auto &I : DFAResources)
3747 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3748 MRT.clear();
// NOTE(review): original line 3749 (the hyperlinked MRT.resize(...) call)
// was dropped by the extraction.
3750 NumScheduledMops.clear();
3751 NumScheduledMops.resize(II);
3752}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
Remove Loads Into Fake Uses
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:390
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1436
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:891
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:616
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:572
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:806
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:936
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:762
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
void dump() const
Definition: Pass.cpp:136
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:498
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:504
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, SwingSchedulerDAG *DAG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true) const
Return true for an order or output dependence that is loop carried potentially.
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which is the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction may be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should be used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1978
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions.
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:34
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:121
bool isValid() const
Definition: MCSchedule.h:139
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:256
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:363
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:352
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:337
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:356
Identify one of the processor resource kinds consumed by a particular scheduling class for the specified number of cycles.
Definition: MCSchedule.h:66
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instantiate a scheduler.
const TargetPassConfig * PassConfig
const MachineDominatorTree * MDT
RegisterClassInfo * RegClassInfo
const MachineLoopInfo * MLI
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.