LLVM 19.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
74#include "llvm/Config/llvm-config.h"
75#include "llvm/IR/Attributes.h"
76#include "llvm/IR/Function.h"
77#include "llvm/MC/LaneBitmask.h"
78#include "llvm/MC/MCInstrDesc.h"
81#include "llvm/Pass.h"
84#include "llvm/Support/Debug.h"
87#include <algorithm>
88#include <cassert>
89#include <climits>
90#include <cstdint>
91#include <deque>
92#include <functional>
93#include <iomanip>
94#include <iterator>
95#include <map>
96#include <memory>
97#include <sstream>
98#include <tuple>
99#include <utility>
100#include <vector>
101
102using namespace llvm;
103
104#define DEBUG_TYPE "pipeliner"
105
// Pass-level statistics; each failure counter matches one early-exit path in
// canPipelineLoop() / SwingSchedulerDAG::schedule() below.
106STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
107STATISTIC(NumPipelined, "Number of loops software pipelined");
108STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
109STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
110STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
111STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
112STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
113STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
114STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
115STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
116STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
117
118/// A command line option to turn software pipelining on or off.
119static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
120 cl::desc("Enable Software Pipelining"));
121
122/// A command line option to enable SWP at -Os.
123static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
124 cl::desc("Enable SWP at Os."), cl::Hidden,
125 cl::init(false));
126
127/// A command line argument to limit minimum initial interval for pipelining.
128static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
129 cl::desc("Size limit for the MII."),
130 cl::Hidden, cl::init(27));
131
132/// A command line argument to force pipeliner to use specified initial
133/// interval.
134static cl::opt<int> SwpForceII("pipeliner-force-ii",
135 cl::desc("Force pipeliner to use specified II."),
136 cl::Hidden, cl::init(-1));
137
138/// A command line argument to limit the number of stages in the pipeline.
139static cl::opt<int>
140 SwpMaxStages("pipeliner-max-stages",
141 cl::desc("Maximum stages allowed in the generated scheduled."),
142 cl::Hidden, cl::init(3));
143
144/// A command line option to disable the pruning of chain dependences due to
145/// an unrelated Phi.
146static cl::opt<bool>
147 SwpPruneDeps("pipeliner-prune-deps",
148 cl::desc("Prune dependences between unrelated Phi nodes."),
149 cl::Hidden, cl::init(true));
150
151/// A command line option to disable the pruning of loop carried order
152/// dependences.
153static cl::opt<bool>
154 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
155 cl::desc("Prune loop carried order dependences."),
156 cl::Hidden, cl::init(true));
157
158#ifndef NDEBUG
// Debug-only cap on how many loops the pass will attempt (see scheduleLoop).
159static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
160#endif
161
/// A command line option to ignore the recurrence-constrained MII; used for
/// testing in SwingSchedulerDAG::schedule() and can cause correctness issues.
162static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
// NOTE(review): a source line (163) is missing here (extraction artifact) —
// verify the full declaration against upstream.
164 cl::desc("Ignore RecMII"));
165
166static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
167 cl::init(false));
168static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
169 cl::init(false));
170
// NOTE(review): the cl::opt declarator line (171) is missing here
// (extraction artifact) — verify against upstream.
172 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
173 cl::desc("Instead of emitting the pipelined code, annotate instructions "
174 "with the generated schedule for feeding into the "
175 "-modulo-schedule-test pass"));
176
// NOTE(review): the cl::opt declarator line (177) is missing here —
// presumably declares ExperimentalCodeGen (used later in schedule()); verify.
178 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
179 cl::desc(
180 "Use the experimental peeling code generator for software pipelining"));
181
/// A command line argument for the range above MII to search for a valid II.
182static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
183 cl::desc("Range to search for II"),
184 cl::Hidden, cl::init(10));
185
186static cl::opt<bool>
187 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
188 cl::desc("Limit register pressure of scheduled loop"));
189
190static cl::opt<int>
191 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
192 cl::init(5),
193 cl::desc("Margin representing the unused percentage of "
194 "the register pressure limit"));
195
196static cl::opt<bool>
197 MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
198 cl::desc("Use the MVE code generator for software pipelining"));
199
200namespace llvm {
201
202// A command line option to enable the CopyToPhi DAG mutation.
// NOTE(review): the cl::opt declarator line (203) is missing here — verify.
204 cl::init(true),
205 cl::desc("Enable CopyToPhi DAG Mutation"));
206
207/// A command line argument to force pipeliner to use specified issue
208/// width.
// NOTE(review): the cl::opt declarator line (209) is missing here — verify.
210 "pipeliner-force-issue-width",
211 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
212 cl::init(-1));
213
214/// A command line argument to set the window scheduling option.
// NOTE(review): the declarator and several clEnumValN lines (215-216, 218,
// 220, 222) are missing here (extraction artifact) — verify upstream.
217 cl::desc("Set how to use window scheduling algorithm."),
219 "Turn off window algorithm."),
221 "Use window algorithm after SMS algorithm fails."),
223 "Use window algorithm instead of SMS algorithm.")));
224
225} // end namespace llvm
226
227unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
228char MachinePipeliner::ID = 0;
229#ifndef NDEBUG
// NOTE(review): the declaration on line 230 is missing — presumably the
// NumTries counter referenced in scheduleLoop(); verify against upstream.
231#endif
233
// NOTE(review): the INITIALIZE_PASS_BEGIN / INITIALIZE_PASS_DEPENDENCY /
// INITIALIZE_PASS_END macro lines appear truncated by extraction — verify.
235 "Modulo Software Pipelining", false, false)
241 "Modulo Software Pipelining", false, false)
242
243/// The "main" function for implementing Swing Modulo Scheduling.
244bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
245 if (skipFunction(mf.getFunction()))
246 return false;
247
248 if (!EnableSWP)
249 return false;
250
251 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
252 !EnableSWPOptSize.getPosition())
253 return false;
254
255 if (!mf.getSubtarget().enableMachinePipeliner())
256 return false;
257
258 // Cannot pipeline loops without instruction itineraries if we are using
259 // DFA for the pipeliner.
260 if (mf.getSubtarget().useDFAforSMS() &&
261 (!mf.getSubtarget().getInstrItineraryData() ||
262 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
263 return false;
264
265 MF = &mf;
266 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
267 MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
268 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
269 TII = MF->getSubtarget().getInstrInfo();
270 RegClassInfo.runOnMachineFunction(*MF);
271
272 for (const auto &L : *MLI)
273 scheduleLoop(*L);
274
275 return false;
276}
277
278/// Attempt to perform the SMS algorithm on the specified loop. This function is
279/// the main entry point for the algorithm. The function identifies candidate
280/// loops, calculates the minimum initiation interval, and attempts to schedule
281/// the loop.
282bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
283 bool Changed = false;
284 for (const auto &InnerLoop : L)
285 Changed |= scheduleLoop(*InnerLoop);
286
287#ifndef NDEBUG
288 // Stop trying after reaching the limit (if any).
289 int Limit = SwpLoopLimit;
290 if (Limit >= 0) {
291 if (NumTries >= SwpLoopLimit)
292 return Changed;
293 NumTries++;
294 }
295#endif
296
297 setPragmaPipelineOptions(L);
298 if (!canPipelineLoop(L)) {
299 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
300 ORE->emit([&]() {
301 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
302 L.getStartLoc(), L.getHeader())
303 << "Failed to pipeline loop";
304 });
305
306 LI.LoopPipelinerInfo.reset();
307 return Changed;
308 }
309
310 ++NumTrytoPipeline;
311 if (useSwingModuloScheduler())
312 Changed = swingModuloScheduler(L);
313
314 if (useWindowScheduler(Changed))
315 Changed = runWindowScheduler(L);
316
317 LI.LoopPipelinerInfo.reset();
318 return Changed;
319}
320
/// Read the llvm.loop metadata attached to the loop's latch terminator and
/// record any pipeline pragmas: a forced initiation interval
/// (llvm.loop.pipeline.initiationinterval) or a disable request
/// (llvm.loop.pipeline.disable). State is reset per loop.
321void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
322 // Reset the pragma for the next loop in iteration.
323 disabledByPragma = false;
324 II_setByPragma = 0;
325
326 MachineBasicBlock *LBLK = L.getTopBlock();
327
328 if (LBLK == nullptr)
329 return;
330
331 const BasicBlock *BBLK = LBLK->getBasicBlock();
332 if (BBLK == nullptr)
333 return;
334
335 const Instruction *TI = BBLK->getTerminator();
336 if (TI == nullptr)
337 return;
338
339 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
340 if (LoopID == nullptr)
341 return;
342
343 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
344 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
345
// Operand 0 is the self-reference; the remaining operands are the hints.
346 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
347 MDNode *MD = dyn_cast<MDNode>(MDO);
348
349 if (MD == nullptr)
350 continue;
351
352 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
353
354 if (S == nullptr)
355 continue;
356
357 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
358 assert(MD->getNumOperands() == 2 &&
359 "Pipeline initiation interval hint metadata should have two operands.");
// NOTE(review): source line 360 is missing here (extraction artifact) —
// presumably "II_setByPragma =", given the assert below; verify upstream.
361 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
362 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
363 } else if (S->getString() == "llvm.loop.pipeline.disable") {
364 disabledByPragma = true;
365 }
366 }
367}
368
369/// Return true if the loop can be software pipelined. The algorithm is
370/// restricted to loops with a single basic block. Make sure that the
371/// branch in the loop can be analyzed.
372bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
// Each rejection path below bumps the matching NumFail* statistic and emits
// an optimization-remark explaining why the loop was not pipelined.
373 if (L.getNumBlocks() != 1) {
374 ORE->emit([&]() {
375 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
376 L.getStartLoc(), L.getHeader())
377 << "Not a single basic block: "
378 << ore::NV("NumBlocks", L.getNumBlocks());
379 });
380 return false;
381 }
382
383 if (disabledByPragma) {
384 ORE->emit([&]() {
385 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
386 L.getStartLoc(), L.getHeader())
387 << "Disabled by Pragma.";
388 });
389 return false;
390 }
391
392 // Check if the branch can't be understood because we can't do pipelining
393 // if that's the case.
394 LI.TBB = nullptr;
395 LI.FBB = nullptr;
396 LI.BrCond.clear();
397 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
398 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
399 NumFailBranch++;
400 ORE->emit([&]() {
401 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
402 L.getStartLoc(), L.getHeader())
403 << "The branch can't be understood";
404 });
405 return false;
406 }
407
408 LI.LoopInductionVar = nullptr;
409 LI.LoopCompare = nullptr;
// NOTE(review): source line 410 is missing here (extraction artifact) —
// presumably the assignment of LI.LoopPipelinerInfo from a target hook,
// given the null-check below; verify against upstream.
411 if (!LI.LoopPipelinerInfo) {
412 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
413 NumFailLoop++;
414 ORE->emit([&]() {
415 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
416 L.getStartLoc(), L.getHeader())
417 << "The loop structure is not supported";
418 });
419 return false;
420 }
421
422 if (!L.getLoopPreheader()) {
423 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
424 NumFailPreheader++;
425 ORE->emit([&]() {
426 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
427 L.getStartLoc(), L.getHeader())
428 << "No loop preheader found";
429 });
430 return false;
431 }
432
433 // Remove any subregisters from inputs to phi nodes.
434 preprocessPhiNodes(*L.getHeader());
435 return true;
436}
437
/// Rewrite phi inputs that carry a subregister index: each such input is
/// replaced by a fresh full register defined by a COPY inserted in the
/// corresponding predecessor block, so phis never reference subregisters.
438void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
// NOTE(review): source line 439 is missing here (extraction artifact) —
// presumably obtains MRI (used below, e.g. MRI.getRegClass); verify upstream.
440 SlotIndexes &Slots =
441 *getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
442
443 for (MachineInstr &PI : B.phis()) {
444 MachineOperand &DefOp = PI.getOperand(0);
445 assert(DefOp.getSubReg() == 0);
446 auto *RC = MRI.getRegClass(DefOp.getReg());
447
// Phi operands come in (value, predecessor-block) pairs starting at index 1.
448 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
449 MachineOperand &RegOp = PI.getOperand(i);
450 if (RegOp.getSubReg() == 0)
451 continue;
452
453 // If the operand uses a subregister, replace it with a new register
454 // without subregisters, and generate a copy to the new register.
455 Register NewReg = MRI.createVirtualRegister(RC);
456 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
// NOTE(review): source line 457 is missing here — presumably defines the
// insertion point 'At' used below; verify against upstream.
458 const DebugLoc &DL = PredB.findDebugLoc(At);
459 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
460 .addReg(RegOp.getReg(), getRegState(RegOp),
461 RegOp.getSubReg());
// Keep the SlotIndexes analysis consistent with the newly inserted COPY.
462 Slots.insertMachineInstrInMaps(*Copy);
463 RegOp.setReg(NewReg);
464 RegOp.setSubReg(0);
465 }
466 }
467}
468
469/// The SMS algorithm consists of the following main steps:
470/// 1. Computation and analysis of the dependence graph.
471/// 2. Ordering of the nodes (instructions).
472/// 3. Attempt to Schedule the loop.
473bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
474 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
475
// NOTE(review): source line 476 is missing here (extraction artifact) —
// presumably the start of the SwingSchedulerDAG SMS(...) construction whose
// argument list continues below; verify against upstream.
477 *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
479
480 MachineBasicBlock *MBB = L.getHeader();
481 // The kernel should not include any terminator instructions. These
482 // will be added back later.
483 SMS.startBlock(MBB);
484
485 // Compute the number of 'real' instructions in the basic block by
486 // ignoring terminators.
487 unsigned size = MBB->size();
// NOTE(review): source line 488 (the for-loop header that walks the
// terminators while decrementing 'size') is missing here — verify upstream.
489 E = MBB->instr_end();
490 I != E; ++I, --size)
491 ;
492
493 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
494 SMS.schedule();
495 SMS.exitRegion();
496
497 SMS.finishBlock();
// True if the scheduler produced a new (pipelined) schedule for the loop.
498 return SMS.hasNewSchedule();
499}
500
510}
511
/// Build a MachineSchedContext from this pass's cached analyses and run the
/// window scheduler on the loop; returns whether it changed the function.
512bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
513 MachineSchedContext Context;
514 Context.MF = MF;
515 Context.MLI = MLI;
516 Context.MDT = MDT;
517 Context.PassConfig = &getAnalysis<TargetPassConfig>();
518 Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
519 Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
// NOTE(review): source line 520 is missing here (extraction artifact) —
// likely another Context field assignment; verify against upstream.
521 WindowScheduler WS(&Context, L);
522 return WS.run();
523}
524
525bool MachinePipeliner::useSwingModuloScheduler() {
526 // SwingModuloScheduler does not work when WindowScheduler is forced.
// NOTE(review): the return statement (source line 527) is missing here
// (extraction artifact) — verify against upstream.
528}
529
530bool MachinePipeliner::useWindowScheduler(bool Changed) {
531 // WindowScheduler does not work when it is off or when SwingModuloScheduler
532 // is successfully scheduled.
// NOTE(review): the return statement (source lines 533-534) is missing here
// (extraction artifact) — verify against upstream.
535}
536
537void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
538 if (SwpForceII > 0)
539 MII = SwpForceII;
540 else if (II_setByPragma > 0)
541 MII = II_setByPragma;
542 else
543 MII = std::max(ResMII, RecMII);
544}
545
546void SwingSchedulerDAG::setMAX_II() {
547 if (SwpForceII > 0)
548 MAX_II = SwpForceII;
549 else if (II_setByPragma > 0)
550 MAX_II = II_setByPragma;
551 else
552 MAX_II = MII + SwpIISearchRange;
553}
554
555/// We override the schedule function in ScheduleDAGInstrs to implement the
556/// scheduling part of the Swing Modulo Scheduling algorithm.
// NOTE(review): the function signature (source line 557) is missing here
// (extraction artifact) — verify against upstream. Also note that inside the
// ORE->emit lambdas below, several "return MachineOptimizationRemark..."
// constructor lines were dropped by the same extraction.
558 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
// Step 1: build the dependence graph and add the loop-carried/Phi edges that
// ScheduleDAGInstrs does not create.
559 buildSchedGraph(AA);
560 addLoopCarriedDependences(AA);
561 updatePhiDependences();
// NOTE(review): source line 562 is missing here — verify upstream.
563 changeDependences();
564 postProcessDAG();
565 LLVM_DEBUG(dump());
566
567 NodeSetType NodeSets;
568 findCircuits(NodeSets);
569 NodeSetType Circuits = NodeSets;
570
571 // Calculate the MII.
572 unsigned ResMII = calculateResMII();
573 unsigned RecMII = calculateRecMII(NodeSets);
574
575 fuseRecs(NodeSets);
576
577 // This flag is used for testing and can cause correctness problems.
578 if (SwpIgnoreRecMII)
579 RecMII = 0;
580
581 setMII(ResMII, RecMII);
582 setMAX_II();
583
584 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
585 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
586
587 // Can't schedule a loop without a valid MII.
588 if (MII == 0) {
589 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
590 NumFailZeroMII++;
591 Pass.ORE->emit([&]() {
593 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
594 << "Invalid Minimal Initiation Interval: 0";
595 });
596 return;
597 }
598
599 // Don't pipeline large loops.
600 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
601 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
602 << ", we don't pipeline large loops\n");
603 NumFailLargeMaxMII++;
604 Pass.ORE->emit([&]() {
606 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
607 << "Minimal Initiation Interval too large: "
608 << ore::NV("MII", (int)MII) << " > "
609 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
610 << "Refer to -pipeliner-max-mii.";
611 });
612 return;
613 }
614
// Step 2: analyze and order the node sets before scheduling.
615 computeNodeFunctions(NodeSets);
616
617 registerPressureFilter(NodeSets);
618
619 colocateNodeSets(NodeSets);
620
621 checkNodeSets(NodeSets);
622
623 LLVM_DEBUG({
624 for (auto &I : NodeSets) {
625 dbgs() << " Rec NodeSet ";
626 I.dump();
627 }
628 });
629
630 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
631
632 groupRemainingNodes(NodeSets);
633
634 removeDuplicateNodes(NodeSets);
635
636 LLVM_DEBUG({
637 for (auto &I : NodeSets) {
638 dbgs() << " NodeSet ";
639 I.dump();
640 }
641 });
642
643 computeNodeOrder(NodeSets);
644
645 // check for node order issues
646 checkValidNodeOrder(Circuits);
647
// Step 3: attempt to schedule the loop with the computed node order.
648 SMSchedule Schedule(Pass.MF, this);
649 Scheduled = schedulePipeline(Schedule);
650
651 if (!Scheduled){
652 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
653 NumFailNoSchedule++;
654 Pass.ORE->emit([&]() {
656 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
657 << "Unable to find schedule";
658 });
659 return;
660 }
661
662 unsigned numStages = Schedule.getMaxStageCount();
663 // No need to generate pipeline if there are no overlapped iterations.
664 if (numStages == 0) {
665 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
666 NumFailZeroStage++;
667 Pass.ORE->emit([&]() {
669 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
670 << "No need to pipeline - no overlapped iterations in schedule.";
671 });
672 return;
673 }
674 // Check that the maximum stage count is less than user-defined limit.
675 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
676 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
677 << " : too many stages, abort\n");
678 NumFailLargeMaxStage++;
679 Pass.ORE->emit([&]() {
681 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
682 << "Too many stages in schedule: "
683 << ore::NV("numStages", (int)numStages) << " > "
684 << ore::NV("SwpMaxStages", SwpMaxStages)
685 << ". Refer to -pipeliner-max-stages.";
686 });
687 return;
688 }
689
690 Pass.ORE->emit([&]() {
692 Loop.getHeader())
693 << "Pipelined succesfully!";
694 });
695
696 // Generate the schedule as a ModuloSchedule.
697 DenseMap<MachineInstr *, int> Cycles, Stages;
698 std::vector<MachineInstr *> OrderedInsts;
699 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
700 ++Cycle) {
701 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
702 OrderedInsts.push_back(SU->getInstr());
703 Cycles[SU->getInstr()] = Cycle;
704 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
705 }
706 }
// NOTE(review): source line 707 is missing here — presumably declares
// NewInstrChanges (populated below); verify against upstream.
708 for (auto &KV : NewMIs) {
// New instructions inherit the cycle/stage of the instruction they replace.
709 Cycles[KV.first] = Cycles[KV.second];
710 Stages[KV.first] = Stages[KV.second];
711 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
712 }
713
714 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
715 std::move(Stages));
// NOTE(review): source line 716 (the condition guarding the test-annotation
// path, and 719 constructing MSTI) is missing here — verify upstream.
717 assert(NewInstrChanges.empty() &&
718 "Cannot serialize a schedule with InstrChanges!");
720 MSTI.annotate();
721 return;
722 }
723 // The experimental code generator can't work if there are InstChanges.
724 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
725 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
726 MSE.expand();
727 } else if (MVECodeGen && NewInstrChanges.empty() &&
728 LoopPipelinerInfo->isMVEExpanderSupported() &&
// NOTE(review): source line 729 (rest of this condition) is missing here.
730 ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
731 MSE.expand();
732 } else {
733 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
734 MSE.expand();
735 MSE.cleanup();
736 }
737 ++NumPipelined;
738}
739
740/// Clean up after the software pipeliner runs.
// NOTE(review): the function signature (source line 741) is missing here
// (extraction artifact) — verify against upstream.
// Delete the copies of instructions that were kept in NewMIs, then defer to
// the base class for the remaining cleanup.
742 for (auto &KV : NewMIs)
743 MF.deleteMachineInstr(KV.second);
744 NewMIs.clear();
745
746 // Call the superclass.
// NOTE(review): the superclass call (source line 747) is missing here.
748}
749
750/// Return the register values for the operands of a Phi instruction.
751/// This function assume the instruction is a Phi.
// NOTE(review): the first signature line (source line 752) is missing here
// (extraction artifact); this continuation takes the two out-parameters.
753 unsigned &InitVal, unsigned &LoopVal) {
754 assert(Phi.isPHI() && "Expecting a Phi.");
755
756 InitVal = 0;
757 LoopVal = 0;
// Phi operands come in (value, block) pairs: the value from outside the loop
// is the init value, the value from the loop block is the loop-carried value.
758 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
759 if (Phi.getOperand(i + 1).getMBB() != Loop)
760 InitVal = Phi.getOperand(i).getReg();
761 else
762 LoopVal = Phi.getOperand(i).getReg();
763
764 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
765}
766
767/// Return the Phi register value that comes the loop block.
768static unsigned getLoopPhiReg(const MachineInstr &Phi,
769 const MachineBasicBlock *LoopBB) {
770 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
771 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
772 return Phi.getOperand(i).getReg();
773 return 0;
774}
775
776/// Return true if SUb can be reached from SUa following the chain edges.
// Iterative DFS over SDep::Order successor edges only.
777static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
// NOTE(review): source lines 778-779 are missing here (extraction artifact)
// — presumably the Worklist and Visited container declarations used below;
// verify against upstream.
780 Worklist.push_back(SUa);
781 while (!Worklist.empty()) {
782 const SUnit *SU = Worklist.pop_back_val();
783 for (const auto &SI : SU->Succs) {
784 SUnit *SuccSU = SI.getSUnit();
785 if (SI.getKind() == SDep::Order) {
786 if (Visited.count(SuccSU))
787 continue;
788 if (SuccSU == SUb)
789 return true;
790 Worklist.push_back(SuccSU);
791 Visited.insert(SuccSU);
792 }
793 }
794 }
795 return false;
796}
797
798/// Return true if the instruction causes a chain between memory
799/// references before and after it.
// NOTE(review): the function signature (source line 800) is missing here
// (extraction artifact) — verify against upstream.
// Calls, FP-exception-raising instructions, instructions with unmodeled side
// effects, and ordered memory references (other than invariant loads) all
// act as memory ordering barriers.
801 return MI.isCall() || MI.mayRaiseFPException() ||
802 MI.hasUnmodeledSideEffects() ||
803 (MI.hasOrderedMemoryRef() &&
804 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
805}
806
807/// Return the underlying objects for the memory references of an instruction.
808/// This function calls the code in ValueTracking, but first checks that the
809/// instruction has a memory operand.
// NOTE(review): the function signature (source lines 810-811) is missing
// here (extraction artifact) — verify against upstream.
812 if (!MI->hasOneMemOperand())
813 return;
814 MachineMemOperand *MM = *MI->memoperands_begin();
815 if (!MM->getValue())
816 return;
817 getUnderlyingObjects(MM->getValue(), Objs);
// If any object is not identified, the whole result is unusable: clear it.
818 for (const Value *V : Objs) {
819 if (!isIdentifiedObject(V)) {
820 Objs.clear();
821 return;
822 }
823 }
824}
825
826/// Add a chain edge between a load and store if the store can be an
827/// alias of the load on a subsequent iteration, i.e., a loop carried
828/// dependence. This code is very similar to the code in ScheduleDAGInstrs
829/// but that code doesn't create loop carried dependences.
830void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
// NOTE(review): source lines 831-833 are missing here (extraction artifact)
// — presumably the PendingLoads map declaration used below; verify upstream.
834 for (auto &SU : SUnits) {
835 MachineInstr &MI = *SU.getInstr();
// NOTE(review): source line 836 (the condition guarding the clear, likely a
// dependence-barrier check) is missing here — verify upstream.
837 PendingLoads.clear();
838 else if (MI.mayLoad()) {
// NOTE(review): source lines 839-840 and 842 are missing here — presumably
// the Objs declaration, the getUnderlyingObjects call, and the fallback for
// an empty object list; verify against upstream.
841 if (Objs.empty())
843 for (const auto *V : Objs) {
// Record this load under each underlying object so later stores can find it.
844 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
845 SUs.push_back(&SU);
846 }
847 } else if (MI.mayStore()) {
// NOTE(review): source lines 848-849, 851, and 853 are missing here —
// presumably the store's underlying-object computation and the find() call
// whose continuation appears below; verify against upstream.
850 if (Objs.empty())
852 for (const auto *V : Objs) {
854 PendingLoads.find(V);
855 if (I == PendingLoads.end())
856 continue;
857 for (auto *Load : I->second) {
// A load already ordered before this store needs no extra edge.
858 if (isSuccOrder(Load, &SU))
859 continue;
860 MachineInstr &LdMI = *Load->getInstr();
861 // First, perform the cheaper check that compares the base register.
862 // If they are the same and the load offset is less than the store
863 // offset, then mark the dependence as loop carried potentially.
864 const MachineOperand *BaseOp1, *BaseOp2;
865 int64_t Offset1, Offset2;
866 bool Offset1IsScalable, Offset2IsScalable;
867 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
868 Offset1IsScalable, TRI) &&
869 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
870 Offset2IsScalable, TRI)) {
871 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
872 Offset1IsScalable == Offset2IsScalable &&
873 (int)Offset1 < (int)Offset2) {
// NOTE(review): source line 874 (start of an assert whose message string
// appears below) is missing here — verify against upstream.
875 "What happened to the chain edge?");
876 SDep Dep(Load, SDep::Barrier);
877 Dep.setLatency(1);
878 SU.addPred(Dep);
879 continue;
880 }
881 }
882 // Second, the more expensive check that uses alias analysis on the
883 // base registers. If they alias, and the load offset is less than
884 // the store offset, the mark the dependence as loop carried.
// Without alias analysis, conservatively assume a loop-carried dependence.
885 if (!AA) {
886 SDep Dep(Load, SDep::Barrier);
887 Dep.setLatency(1);
888 SU.addPred(Dep);
889 continue;
890 }
891 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
892 MachineMemOperand *MMO2 = *MI.memoperands_begin();
893 if (!MMO1->getValue() || !MMO2->getValue()) {
894 SDep Dep(Load, SDep::Barrier);
895 Dep.setLatency(1);
896 SU.addPred(Dep);
897 continue;
898 }
899 if (MMO1->getValue() == MMO2->getValue() &&
900 MMO1->getOffset() <= MMO2->getOffset()) {
901 SDep Dep(Load, SDep::Barrier);
902 Dep.setLatency(1);
903 SU.addPred(Dep);
904 continue;
905 }
// NOTE(review): source lines 907-908 (the MemoryLocation arguments to
// isNoAlias) are missing here — verify against upstream.
906 if (!AA->isNoAlias(
909 MMO2->getAAInfo()))) {
910 SDep Dep(Load, SDep::Barrier);
911 Dep.setLatency(1);
912 SU.addPred(Dep);
913 }
914 }
915 }
916 }
917 }
918}
919
920/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
921/// processes dependences for PHIs. This function adds true dependences
922/// from a PHI to a use, and a loop carried dependence from the use to the
923/// PHI. The loop carried dependence is represented as an anti dependence
924/// edge. This function also removes chain dependences between unrelated
925/// PHIs.
926void SwingSchedulerDAG::updatePhiDependences() {
927 SmallVector<SDep, 4> RemoveDeps;
// NOTE(review): source line 928 is missing here (extraction artifact) —
// presumably obtains the subtarget 'ST' used in adjustSchedDependency below;
// verify against upstream.
929
930 // Iterate over each DAG node.
931 for (SUnit &I : SUnits) {
932 RemoveDeps.clear();
933 // Set to true if the instruction has an operand defined by a Phi.
934 unsigned HasPhiUse = 0;
935 unsigned HasPhiDef = 0;
936 MachineInstr *MI = I.getInstr();
937 // Iterate over each operand, and we process the definitions.
938 for (const MachineOperand &MO : MI->operands()) {
939 if (!MO.isReg())
940 continue;
941 Register Reg = MO.getReg();
942 if (MO.isDef()) {
943 // If the register is used by a Phi, then create an anti dependence.
// NOTE(review): source line 944 (the use_instr_iterator for-loop header) is
// missing here — verify against upstream.
945 UI = MRI.use_instr_begin(Reg),
946 UE = MRI.use_instr_end();
947 UI != UE; ++UI) {
948 MachineInstr *UseMI = &*UI;
949 SUnit *SU = getSUnit(UseMI);
950 if (SU != nullptr && UseMI->isPHI()) {
951 if (!MI->isPHI()) {
// Non-Phi def feeding a Phi: loop-carried, modeled as an anti edge.
952 SDep Dep(SU, SDep::Anti, Reg);
953 Dep.setLatency(1);
954 I.addPred(Dep);
955 } else {
956 HasPhiDef = Reg;
957 // Add a chain edge to a dependent Phi that isn't an existing
958 // predecessor.
959 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
960 I.addPred(SDep(SU, SDep::Barrier));
961 }
962 }
963 }
964 } else if (MO.isUse()) {
965 // If the register is defined by a Phi, then create a true dependence.
// NOTE(review): source line 966 is missing here — presumably looks up
// 'DefMI', the defining instruction of Reg; verify against upstream.
967 if (DefMI == nullptr)
968 continue;
969 SUnit *SU = getSUnit(DefMI);
970 if (SU != nullptr && DefMI->isPHI()) {
971 if (!MI->isPHI()) {
972 SDep Dep(SU, SDep::Data, Reg);
973 Dep.setLatency(0);
// Let the target adjust the latency of the Phi -> use data edge.
974 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep,
975 &SchedModel);
976 I.addPred(Dep);
977 } else {
978 HasPhiUse = Reg;
979 // Add a chain edge to a dependent Phi that isn't an existing
980 // predecessor.
981 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
982 I.addPred(SDep(SU, SDep::Barrier));
983 }
984 }
985 }
986 }
987 // Remove order dependences from an unrelated Phi.
988 if (!SwpPruneDeps)
989 continue;
990 for (auto &PI : I.Preds) {
991 MachineInstr *PMI = PI.getSUnit()->getInstr();
992 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
993 if (I.getInstr()->isPHI()) {
// Keep the edge if the predecessor Phi is actually related to this one.
994 if (PMI->getOperand(0).getReg() == HasPhiUse)
995 continue;
996 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
997 continue;
998 }
999 RemoveDeps.push_back(PI);
1000 }
1001 }
1002 for (const SDep &D : RemoveDeps)
1003 I.removePred(D);
1004 }
1005}
1006
1007/// Iterate over each DAG node and see if we can change any dependences
1008/// in order to reduce the recurrence MII.
1009void SwingSchedulerDAG::changeDependences() {
1010 // See if an instruction can use a value from the previous iteration.
1011 // If so, we update the base and offset of the instruction and change
1012 // the dependences.
1013 for (SUnit &I : SUnits) {
1014 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1015 int64_t NewOffset = 0;
1016 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1017 NewOffset))
1018 continue;
1019
1020 // Get the MI and SUnit for the instruction that defines the original base.
1021 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
// NOTE(review): source line 1022 is missing here (extraction artifact) —
// presumably looks up 'DefMI', the definition of OrigBase; verify upstream.
1023 if (!DefMI)
1024 continue;
1025 SUnit *DefSU = getSUnit(DefMI);
1026 if (!DefSU)
1027 continue;
1028 // Get the MI and SUnit for the instruction that defines the new base.
1029 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1030 if (!LastMI)
1031 continue;
1032 SUnit *LastSU = getSUnit(LastMI);
1033 if (!LastSU)
1034 continue;
1035
// Bail if rewiring would create a cycle in the dependence graph.
1036 if (Topo.IsReachable(&I, LastSU))
1037 continue;
1038
1039 // Remove the dependence. The value now depends on a prior iteration.
// NOTE(review): source line 1040 is missing here — presumably declares the
// 'Deps' vector populated below; verify against upstream.
1041 for (const SDep &P : I.Preds)
1042 if (P.getSUnit() == DefSU)
1043 Deps.push_back(P);
1044 for (const SDep &D : Deps) {
1045 Topo.RemovePred(&I, D.getSUnit());
1046 I.removePred(D);
1047 }
1048 // Remove the chain dependence between the instructions.
1049 Deps.clear();
1050 for (auto &P : LastSU->Preds)
1051 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1052 Deps.push_back(P);
1053 for (const SDep &D : Deps) {
1054 Topo.RemovePred(LastSU, D.getSUnit());
1055 LastSU->removePred(D);
1056 }
1057
1058 // Add a dependence between the new instruction and the instruction
1059 // that defines the new base.
1060 SDep Dep(&I, SDep::Anti, NewBase);
1061 Topo.AddPred(LastSU, &I);
1062 LastSU->addPred(Dep);
1063
1064 // Remember the base and offset information so that we can update the
1065 // instruction during code generation.
1066 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1067 }
1068}
1069
1070/// Create an instruction stream that represents a single iteration and stage of
1071/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1072/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1073/// function is an approximation of SMSchedule::finalizeSchedule with all
1074/// non-const operations removed.
// NOTE(review): the first signature line (source line 1075) and lines
// 1078-1079 (end of the parameter list and the 'Instrs' map declaration used
// below) are missing here (extraction artifact) — verify against upstream.
1076 SMSchedule &Schedule,
1077 std::vector<MachineInstr *> &OrderedInsts,
1080
1081 // Move all instructions to the first stage from the later stages.
1082 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1083 ++Cycle) {
1084 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1085 Stage <= LastStage; ++Stage) {
// Instructions of later stages occur II*Stage cycles further along; fold
// them back into the first iteration's cycle slot.
1086 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1087 Cycle + Stage * Schedule.getInitiationInterval()))) {
1088 Instrs[Cycle].push_front(SU);
1089 }
1090 }
1091 }
1092
1093 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1094 ++Cycle) {
1095 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1096 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1097 for (SUnit *SU : CycleInstrs) {
1098 MachineInstr *MI = SU->getInstr();
1099 OrderedInsts.push_back(MI);
1100 Stages[MI] = Schedule.stageScheduled(SU);
1101 }
1102 }
1103}
1104
1105namespace {
1106
1107// FuncUnitSorter - Comparison operator used to sort instructions by
1108// the number of functional unit choices.
struct FuncUnitSorter {
  const InstrItineraryData *InstrItins;
  const MCSubtargetInfo *STI;
  // NOTE(review): the declaration of the `Resources` map (functional unit /
  // proc-resource index -> use count, referenced below and in operator()())
  // appears to have been lost in this copy -- TODO confirm/restore.

  FuncUnitSorter(const TargetSubtargetInfo &TSI)
      : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}

  // Compute the number of functional unit alternatives needed
  // at each stage, and take the minimum value. We prioritize the
  // instructions by the least number of choices first.
  // On return, F holds the unit (or proc-resource index) with that minimum.
  unsigned minFuncUnits(const MachineInstr *Inst,
                        InstrStage::FuncUnits &F) const {
    unsigned SchedClass = Inst->getDesc().getSchedClass();
    unsigned min = UINT_MAX;
    // Prefer the itinerary model when the target provides one.
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits funcUnits = IS.getUnits();
        // Each set bit is one functional-unit alternative for this stage.
        unsigned numAlternatives = llvm::popcount(funcUnits);
        if (numAlternatives < min) {
          min = numAlternatives;
          F = funcUnits;
        }
      }
      return min;
    }
    // Otherwise fall back to the per-operand machine model.
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return min;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        const MCProcResourceDesc *ProcResource =
            STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
        unsigned NumUnits = ProcResource->NumUnits;
        if (NumUnits < min) {
          min = NumUnits;
          F = PRE.ProcResourceIdx;
        }
      }
      return min;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  // Compute the critical resources needed by the instruction. This
  // function records the functional units needed by instructions that
  // must use only one functional unit. We use this as a tie breaker
  // for computing the resource MII. The instructions that require
  // the same, highly used, functional unit have high priority.
  void calcCriticalResources(MachineInstr &MI) {
    unsigned SchedClass = MI.getDesc().getSchedClass();
    if (InstrItins && !InstrItins->isEmpty()) {
      for (const InstrStage &IS :
           make_range(InstrItins->beginStage(SchedClass),
                      InstrItins->endStage(SchedClass))) {
        InstrStage::FuncUnits FuncUnits = IS.getUnits();
        // Exactly one alternative: this unit is critical for MI.
        if (llvm::popcount(FuncUnits) == 1)
          Resources[FuncUnits]++;
      }
      return;
    }
    if (STI && STI->getSchedModel().hasInstrSchedModel()) {
      const MCSchedClassDesc *SCDesc =
          STI->getSchedModel().getSchedClassDesc(SchedClass);
      if (!SCDesc->isValid())
        // No valid Schedule Class Desc for schedClass, should be
        // Pseudo/PostRAPseudo
        return;

      for (const MCWriteProcResEntry &PRE :
           make_range(STI->getWriteProcResBegin(SCDesc),
                      STI->getWriteProcResEnd(SCDesc))) {
        if (!PRE.ReleaseAtCycle)
          continue;
        Resources[PRE.ProcResourceIdx]++;
      }
      return;
    }
    llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
  }

  /// Return true if IS1 has less priority than IS2.
  bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
    InstrStage::FuncUnits F1 = 0, F2 = 0;
    unsigned MFUs1 = minFuncUnits(IS1, F1);
    unsigned MFUs2 = minFuncUnits(IS2, F2);
    // Tie-break equal choice counts by how contended the chosen unit is.
    if (MFUs1 == MFUs2)
      return Resources.lookup(F1) < Resources.lookup(F2);
    // Fewer choices means higher priority (sorted first).
    return MFUs1 > MFUs2;
  }
};
1210
/// Calculate the maximum register pressure of the scheduled instructions stream
class HighRegisterPressureDetector {
  // Single-block loop body whose schedule is being evaluated.
  MachineBasicBlock *OrigMBB;
  const MachineFunction &MF;
  const MachineRegisterInfo &MRI;
  const TargetRegisterInfo *TRI;

  // Number of register pressure sets defined by the target.
  const unsigned PSetNum;

  // Indexed by PSet ID
  // InitSetPressure takes into account the register pressure of live-in
  // registers. It does not depend on how the loop is scheduled, so it is
  // enough to calculate it once at the beginning.
  std::vector<unsigned> InitSetPressure;

  // Indexed by PSet ID
  // Upper limit for each register pressure set
  std::vector<unsigned> PressureSetLimit;

  // NOTE(review): two private member declarations referenced below appear to
  // have been lost in this copy: the per-instruction RegisterOperands map
  // `ROMap` (filled in init()) and the `Instr2LastUsesTy` alias returned by
  // computeLastUses() -- TODO confirm/restore.

public:
  using OrderedInstsTy = std::vector<MachineInstr *>;
  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;

private:
  // Debug helper: print a pressure vector as "[p0 p1 ...]".
  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
    if (Pressures.size() == 0) {
      dbgs() << "[]";
    } else {
      char Prefix = '[';
      for (unsigned P : Pressures) {
        dbgs() << Prefix << P;
        Prefix = ' ';
      }
      dbgs() << ']';
    }
  }

  // Debug helper: print every pressure set that Reg belongs to.
  void dumpPSet(Register Reg) const {
    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
         ++PSetIter) {
      dbgs() << *PSetIter << ' ';
    }
    dbgs() << '\n';
  }

  // Add Reg's weight to each pressure set that contains it.
  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter)
      Pressure[*PSetIter] += Weight;
  }

  // Subtract Reg's weight from each pressure set that contains it.
  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
                                Register Reg) const {
    auto PSetIter = MRI.getPressureSets(Reg);
    unsigned Weight = PSetIter.getWeight();
    for (; PSetIter.isValid(); ++PSetIter) {
      auto &P = Pressure[*PSetIter];
      assert(P >= Weight &&
             "register pressure must be greater than or equal weight");
      P -= Weight;
    }
  }

  // Return true if Reg is a fixed register, for example, the stack pointer.
  bool isFixedRegister(Register Reg) const {
    return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
  }

  // Return true if Reg is a virtual register defined inside the loop body.
  bool isDefinedInThisLoop(Register Reg) const {
    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
  }

  // Search for live-in variables. They are factored into the register pressure
  // from the beginning. Live-in variables used by every iteration should be
  // considered as alive throughout the loop. For example, the variable `c` in
  // following code. \code
  //   int c = ...;
  //   for (int i = 0; i < n; i++)
  //     a[i] += b[i] + c;
  // \endcode
  void computeLiveIn() {
    // NOTE(review): the declaration of the local register set `Used` appears
    // to have been lost in this copy -- TODO confirm/restore.
    for (auto &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      for (auto &Use : ROMap[&MI].Uses) {
        auto Reg = Use.RegUnit;
        // Ignore the variable that appears only on one side of phi instruction
        // because it's used only at the first iteration.
        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
          continue;
        if (isFixedRegister(Reg))
          continue;
        if (isDefinedInThisLoop(Reg))
          continue;
        Used.insert(Reg);
      }
    }

    for (auto LiveIn : Used)
      increaseRegisterPressure(InitSetPressure, LiveIn);
  }

  // Calculate the upper limit of each pressure set
  void computePressureSetLimit(const RegisterClassInfo &RCI) {
    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
      PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);

    // We assume fixed registers, such as stack pointer, are already in use.
    // Therefore subtracting the weight of the fixed registers from the limit of
    // each pressure set in advance.
    // NOTE(review): the declaration of the local register set `FixedRegs`
    // appears to have been lost in this copy -- TODO confirm/restore.
    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
      for (const MCPhysReg Reg : *TRC)
        if (isFixedRegister(Reg))
          FixedRegs.insert(Reg);
    }

    LLVM_DEBUG({
      for (auto Reg : FixedRegs) {
        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
        const int *Sets = TRI->getRegUnitPressureSets(Reg);
        for (; *Sets != -1; Sets++) {
          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
        }
        dbgs() << "]\n";
      }
    });

    for (auto Reg : FixedRegs) {
      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
                        << "\n");
      auto PSetIter = MRI.getPressureSets(Reg);
      unsigned Weight = PSetIter.getWeight();
      for (; PSetIter.isValid(); ++PSetIter) {
        unsigned &Limit = PressureSetLimit[*PSetIter];
        assert(Limit >= Weight &&
               "register pressure limit must be greater than or equal weight");
        Limit -= Weight;
        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
                          << " (decreased by " << Weight << ")\n");
      }
    }
  }

  // There are two patterns of last-use.
  //   - by an instruction of the current iteration
  //   - by a phi instruction of the next iteration (loop carried value)
  //
  // Furthermore, following two groups of instructions are executed
  // simultaneously
  //   - next iteration's phi instructions in i-th stage
  //   - current iteration's instructions in i+1-th stage
  //
  // This function calculates the last-use of each register while taking into
  // account the above two patterns.
  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
                                   Instr2StageTy &Stages) const {
    // We treat virtual registers that are defined and used in this loop.
    // Following virtual register will be ignored
    //   - live-in one
    //   - defined but not used in the loop (potentially live-out)
    DenseSet<Register> TargetRegs;
    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
      if (isDefinedInThisLoop(Reg))
        TargetRegs.insert(Reg);
    };
    for (MachineInstr *MI : OrderedInsts) {
      if (MI->isPHI()) {
        Register Reg = getLoopPhiReg(*MI, OrigMBB);
        UpdateTargetRegs(Reg);
      } else {
        for (auto &Use : ROMap.find(MI)->getSecond().Uses)
          UpdateTargetRegs(Use.RegUnit);
      }
    }

    // A higher score means the use happens later in the overlapped kernel; a
    // phi's use of the loop-carried value belongs to the next iteration, so
    // it is biased by one stage.
    const auto InstrScore = [&Stages](MachineInstr *MI) {
      return Stages[MI] + MI->isPHI();
    };

    // NOTE(review): the declaration of the local map `LastUseMI`
    // (Register -> last using MachineInstr) appears to have been lost in this
    // copy -- TODO confirm/restore.
    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
      for (auto &Use : ROMap.find(MI)->getSecond().Uses) {
        auto Reg = Use.RegUnit;
        if (!TargetRegs.contains(Reg))
          continue;
        auto Ite = LastUseMI.find(Reg);
        if (Ite == LastUseMI.end()) {
          LastUseMI[Reg] = MI;
        } else {
          MachineInstr *Orig = Ite->second;
          MachineInstr *New = MI;
          // Keep the use with the highest score as the last one.
          if (InstrScore(Orig) < InstrScore(New))
            LastUseMI[Reg] = New;
        }
      }
    }

    // Invert the map: instruction -> set of registers it last-uses.
    Instr2LastUsesTy LastUses;
    for (auto &Entry : LastUseMI)
      LastUses[Entry.second].insert(Entry.first);
    return LastUses;
  }

  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
  // iterations and check the register pressure at the point where all stages
  // overlapping.
  //
  // An example of unrolled loop where #Stage is 4.
  // Iter   i+0 i+1 i+2 i+3
  // ------------------------
  // Stage   0
  // Stage   1   0
  // Stage   2   1   0
  // Stage   3   2   1   0  <- All stages overlap
  //
  std::vector<unsigned>
  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
                        Instr2StageTy &Stages,
                        const unsigned StageCount) const {
    using RegSetTy = SmallDenseSet<Register, 16>;

    // Indexed by #Iter. To treat "local" variables of each stage separately, we
    // manage the liveness of the registers independently by iterations.
    SmallVector<RegSetTy> LiveRegSets(StageCount);

    auto CurSetPressure = InitSetPressure;
    auto MaxSetPressure = InitSetPressure;
    auto LastUses = computeLastUses(OrderedInsts, Stages);

    LLVM_DEBUG({
      dbgs() << "Ordered instructions:\n";
      for (MachineInstr *MI : OrderedInsts) {
        dbgs() << "Stage " << Stages[MI] << ": ";
        MI->dump();
      }
    });

    // Mark Reg live in the given iteration's set and raise the pressure,
    // unless it is already live there.
    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                   Register Reg) {
      if (!Reg.isValid() || isFixedRegister(Reg))
        return;

      bool Inserted = RegSet.insert(Reg).second;
      if (!Inserted)
        return;

      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
      increaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    // Kill Reg in the given iteration's set and lower the pressure. Live-in
    // registers (never inserted into any set) are deliberately left alone.
    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
                                                  Register Reg) {
      if (!Reg.isValid() || isFixedRegister(Reg))
        return;

      // live-in register
      if (!RegSet.contains(Reg))
        return;

      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
      RegSet.erase(Reg);
      decreaseRegisterPressure(CurSetPressure, Reg);
      LLVM_DEBUG(dumpPSet(Reg));
    };

    for (unsigned I = 0; I < StageCount; I++) {
      for (MachineInstr *MI : OrderedInsts) {
        const auto Stage = Stages[MI];
        // Instructions of stage S first execute in round S; earlier rounds
        // have no live iteration for them yet.
        if (I < Stage)
          continue;

        const unsigned Iter = I - Stage;

        for (auto &Def : ROMap.find(MI)->getSecond().Defs)
          InsertReg(LiveRegSets[Iter], Def.RegUnit);

        for (auto LastUse : LastUses[MI]) {
          if (MI->isPHI()) {
            // A phi's loop-carried operand was defined in the previous
            // iteration.
            if (Iter != 0)
              EraseReg(LiveRegSets[Iter - 1], LastUse);
          } else {
            EraseReg(LiveRegSets[Iter], LastUse);
          }
        }

        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
          MaxSetPressure[PSet] =
              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);

        LLVM_DEBUG({
          dbgs() << "CurSetPressure=";
          dumpRegisterPressures(CurSetPressure);
          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
          MI->dump();
        });
      }
    }

    return MaxSetPressure;
  }

public:
  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
                               const MachineFunction &MF)
      : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
        TRI(MF.getSubtarget().getRegisterInfo()),
        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
        PressureSetLimit(PSetNum, 0) {}

  // Used to calculate register pressure, which is independent of loop
  // scheduling.
  void init(const RegisterClassInfo &RCI) {
    for (MachineInstr &MI : *OrigMBB) {
      if (MI.isDebugInstr())
        continue;
      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
    }

    computeLiveIn();
    computePressureSetLimit(RCI);
  }

  // Calculate the maximum register pressures of the loop and check if they
  // exceed the limit
  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
              const unsigned MaxStage) const {
    // NOTE(review): the first line of the assert validating RegPressureMargin
    // (whose message string follows) appears to have been lost in this copy --
    // TODO confirm/restore.
           "the percentage of the margin must be between 0 to 100");

    OrderedInstsTy OrderedInsts;
    Instr2StageTy Stages;
    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
    const auto MaxSetPressure =
        computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);

    LLVM_DEBUG({
      dbgs() << "Dump MaxSetPressure:\n";
      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
      }
      dbgs() << '\n';
    });

    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
      unsigned Limit = PressureSetLimit[PSet];
      unsigned Margin = Limit * RegPressureMargin / 100;
      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
                        << " Margin=" << Margin << "\n");
      // Reject when the maximum pressure would eat into the safety margin.
      if (Limit < MaxSetPressure[PSet] + Margin) {
        LLVM_DEBUG(
            dbgs()
            << "Rejected the schedule because of too high register pressure\n");
        return true;
      }
    }
    return false;
  }
};
1579
1580} // end anonymous namespace
1581
1582/// Calculate the resource constrained minimum initiation interval for the
1583/// specified loop. We use the DFA to model the resources needed for
1584/// each instruction, and we ignore dependences. A different DFA is created
1585/// for each cycle that is required. When adding a new instruction, we attempt
1586/// to add it to each existing DFA, until a legal space is found. If the
1587/// instruction cannot be reserved in an existing DFA, we create a new one.
unsigned SwingSchedulerDAG::calculateResMII() {
  LLVM_DEBUG(dbgs() << "calculateResMII:\n");
  // NOTE(review): the construction of the ResourceManager `RM` appears to
  // have been lost in this copy -- TODO confirm/restore.
  return RM.calculateResMII();
}
1593
1594/// Calculate the recurrence-constrainted minimum initiation interval.
1595/// Iterate over each circuit. Compute the delay(c) and distance(c)
1596/// for each circuit. The II needs to satisfy the inequality
1597/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1598/// II that satisfies the inequality, and the RecMII is the maximum
1599/// of those values.
1600unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1601 unsigned RecMII = 0;
1602
1603 for (NodeSet &Nodes : NodeSets) {
1604 if (Nodes.empty())
1605 continue;
1606
1607 unsigned Delay = Nodes.getLatency();
1608 unsigned Distance = 1;
1609
1610 // ii = ceil(delay / distance)
1611 unsigned CurMII = (Delay + Distance - 1) / Distance;
1612 Nodes.setRecMII(CurMII);
1613 if (CurMII > RecMII)
1614 RecMII = CurMII;
1615 }
1616
1617 return RecMII;
1618}
1619
1620/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1621/// but we do this to find the circuits, and then change them back.
static void swapAntiDependences(std::vector<SUnit> &SUnits) {
  // NOTE(review): the declaration of `DepsAdded` (a vector of
  // (SUnit *, SDep) pairs) appears to have been lost in this copy -- TODO
  // confirm/restore.
  // Collect the anti dependences first; edges cannot be modified while
  // iterating over the predecessor lists.
  for (SUnit &SU : SUnits) {
    for (SDep &Pred : SU.Preds)
      if (Pred.getKind() == SDep::Anti)
        DepsAdded.push_back(std::make_pair(&SU, Pred));
  }
  for (std::pair<SUnit *, SDep> &P : DepsAdded) {
    // Remove this anti dependency and add one in the reverse direction.
    SUnit *SU = P.first;
    SDep &D = P.second;
    SUnit *TargetSU = D.getSUnit();
    unsigned Reg = D.getReg();
    unsigned Lat = D.getLatency();
    SU->removePred(D);
    SDep Dep(SU, SDep::Anti, Reg);
    Dep.setLatency(Lat);
    TargetSU->addPred(Dep);
  }
}
1642
1643/// Create the adjacency structure of the nodes in the graph.
1644void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1645 SwingSchedulerDAG *DAG) {
1646 BitVector Added(SUnits.size());
1647 DenseMap<int, int> OutputDeps;
1648 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1649 Added.reset();
1650 // Add any successor to the adjacency matrix and exclude duplicates.
1651 for (auto &SI : SUnits[i].Succs) {
1652 // Only create a back-edge on the first and last nodes of a dependence
1653 // chain. This records any chains and adds them later.
1654 if (SI.getKind() == SDep::Output) {
1655 int N = SI.getSUnit()->NodeNum;
1656 int BackEdge = i;
1657 auto Dep = OutputDeps.find(BackEdge);
1658 if (Dep != OutputDeps.end()) {
1659 BackEdge = Dep->second;
1660 OutputDeps.erase(Dep);
1661 }
1662 OutputDeps[N] = BackEdge;
1663 }
1664 // Do not process a boundary node, an artificial node.
1665 // A back-edge is processed only if it goes to a Phi.
1666 if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1667 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1668 continue;
1669 int N = SI.getSUnit()->NodeNum;
1670 if (!Added.test(N)) {
1671 AdjK[i].push_back(N);
1672 Added.set(N);
1673 }
1674 }
1675 // A chain edge between a store and a load is treated as a back-edge in the
1676 // adjacency matrix.
1677 for (auto &PI : SUnits[i].Preds) {
1678 if (!SUnits[i].getInstr()->mayStore() ||
1679 !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1680 continue;
1681 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1682 int N = PI.getSUnit()->NodeNum;
1683 if (!Added.test(N)) {
1684 AdjK[i].push_back(N);
1685 Added.set(N);
1686 }
1687 }
1688 }
1689 }
1690 // Add back-edges in the adjacency matrix for the output dependences.
1691 for (auto &OD : OutputDeps)
1692 if (!Added.test(OD.second)) {
1693 AdjK[OD.first].push_back(OD.second);
1694 Added.set(OD.second);
1695 }
1696}
1697
1698/// Identify an elementary circuit in the dependence graph starting at the
1699/// specified node.
1700bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1701 bool HasBackedge) {
1702 SUnit *SV = &SUnits[V];
1703 bool F = false;
1704 Stack.insert(SV);
1705 Blocked.set(V);
1706
1707 for (auto W : AdjK[V]) {
1708 if (NumPaths > MaxPaths)
1709 break;
1710 if (W < S)
1711 continue;
1712 if (W == S) {
1713 if (!HasBackedge)
1714 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1715 F = true;
1716 ++NumPaths;
1717 break;
1718 } else if (!Blocked.test(W)) {
1719 if (circuit(W, S, NodeSets,
1720 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1721 F = true;
1722 }
1723 }
1724
1725 if (F)
1726 unblock(V);
1727 else {
1728 for (auto W : AdjK[V]) {
1729 if (W < S)
1730 continue;
1731 B[W].insert(SV);
1732 }
1733 }
1734 Stack.pop_back();
1735 return F;
1736}
1737
1738/// Unblock a node in the circuit finding algorithm.
void SwingSchedulerDAG::Circuits::unblock(int U) {
  Blocked.reset(U);
  // NOTE(review): the declarations of `BU` (a reference to the set B[U]) and
  // the iterator `SI` (presumably BU.begin()) appear to have been lost in
  // this copy -- TODO confirm/restore.
  while (!BU.empty()) {
    assert(SI != BU.end() && "Invalid B set.");
    SUnit *W = *SI;
    BU.erase(W);
    // Recursively unblock every node that was blocked while waiting on U.
    if (Blocked.test(W->NodeNum))
      unblock(W->NodeNum);
  }
}
1751
1752/// Identify all the elementary circuits in the dependence graph using
1753/// Johnson's circuit algorithm.
void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
  // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
  // but we do this to find the circuits, and then change them back.
  // NOTE(review): the swapAntiDependences(SUnits) call appears to have been
  // lost in this copy -- TODO confirm/restore.

  Circuits Cir(SUnits, Topo);
  // Create the adjacency structure.
  Cir.createAdjacencyStructure(this);
  // Run the circuit search once rooted at every node.
  for (int i = 0, e = SUnits.size(); i != e; ++i) {
    Cir.reset();
    Cir.circuit(i, i, NodeSets);
  }

  // Change the dependences back so that we've created a DAG again.
  // NOTE(review): the second swapAntiDependences(SUnits) call that restores
  // the edges appears to have been lost in this copy -- TODO confirm/restore.
}
1770
1771// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1772// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1773// additional copies that are needed across iterations. An artificial dependence
1774// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1775
1776// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1777// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1778// PHI-------True-Dep------> USEOfPhi
1779
1780// The mutation creates
1781// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1782
1783// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1784// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1785// late to avoid additional copies across iterations. The possible scheduling
1786// order would be
1787// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1788
void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Find the COPY/REG_SEQUENCE instruction.
    if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
      continue;

    // Record the loop carried PHIs.
    // NOTE(review): the declarations of the `PHISUs` and `SrcSUs` containers
    // (small vectors of SUnit *) appear to have been lost in this copy --
    // TODO confirm/restore.
    // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.

    for (auto &Dep : SU.Preds) {
      SUnit *TmpSU = Dep.getSUnit();
      MachineInstr *TmpMI = TmpSU->getInstr();
      SDep::Kind DepKind = Dep.getKind();
      // Save the loop carried PHI.
      if (DepKind == SDep::Anti && TmpMI->isPHI())
        PHISUs.push_back(TmpSU);
      // Save the source of COPY/REG_SEQUENCE.
      // If the source has no predecessors, we will end up creating cycles.
      else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
        SrcSUs.push_back(TmpSU);
    }

    if (PHISUs.size() == 0 || SrcSUs.size() == 0)
      continue;

    // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
    // SUnit to the container.
    // NOTE(review): the declaration of the `UseSUs` container appears to have
    // been lost in this copy -- TODO confirm/restore.
    // Do not use iterator based loop here as we are updating the container.
    for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
      for (auto &Dep : PHISUs[Index]->Succs) {
        if (Dep.getKind() != SDep::Data)
          continue;

        SUnit *TmpSU = Dep.getSUnit();
        MachineInstr *TmpMI = TmpSU->getInstr();
        // A PHI/REG_SEQUENCE user just forwards the value; keep chasing its
        // own users by appending it to the worklist.
        if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
          PHISUs.push_back(TmpSU);
          continue;
        }
        UseSUs.push_back(TmpSU);
      }
    }

    if (UseSUs.size() == 0)
      continue;

    SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
    // Add the artificial dependencies if it does not form a cycle.
    for (auto *I : UseSUs) {
      for (auto *Src : SrcSUs) {
        if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
          Src->addPred(SDep(I, SDep::Artificial));
          SDAG->Topo.AddPred(Src, I);
        }
      }
    }
  }
}
1850
1851/// Return true for DAG nodes that we ignore when computing the cost functions.
1852/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1853/// in the calculation of the ASAP, ALAP, etc functions.
1854static bool ignoreDependence(const SDep &D, bool isPred) {
1855 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1856 return true;
1857 return D.getKind() == SDep::Anti && isPred;
1858}
1859
1860/// Compute several functions need to order the nodes for scheduling.
1861/// ASAP - Earliest time to schedule a node.
1862/// ALAP - Latest time to schedule a node.
1863/// MOV - Mobility function, difference between ALAP and ASAP.
1864/// D - Depth of each node.
1865/// H - Height of each node.
1866void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1867 ScheduleInfo.resize(SUnits.size());
1868
1869 LLVM_DEBUG({
1870 for (int I : Topo) {
1871 const SUnit &SU = SUnits[I];
1872 dumpNode(SU);
1873 }
1874 });
1875
1876 int maxASAP = 0;
1877 // Compute ASAP and ZeroLatencyDepth.
1878 for (int I : Topo) {
1879 int asap = 0;
1880 int zeroLatencyDepth = 0;
1881 SUnit *SU = &SUnits[I];
1882 for (const SDep &P : SU->Preds) {
1883 SUnit *pred = P.getSUnit();
1884 if (P.getLatency() == 0)
1885 zeroLatencyDepth =
1886 std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1887 if (ignoreDependence(P, true))
1888 continue;
1889 asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1890 getDistance(pred, SU, P) * MII));
1891 }
1892 maxASAP = std::max(maxASAP, asap);
1893 ScheduleInfo[I].ASAP = asap;
1894 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1895 }
1896
1897 // Compute ALAP, ZeroLatencyHeight, and MOV.
1898 for (int I : llvm::reverse(Topo)) {
1899 int alap = maxASAP;
1900 int zeroLatencyHeight = 0;
1901 SUnit *SU = &SUnits[I];
1902 for (const SDep &S : SU->Succs) {
1903 SUnit *succ = S.getSUnit();
1904 if (succ->isBoundaryNode())
1905 continue;
1906 if (S.getLatency() == 0)
1907 zeroLatencyHeight =
1908 std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1909 if (ignoreDependence(S, true))
1910 continue;
1911 alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1912 getDistance(SU, succ, S) * MII));
1913 }
1914
1915 ScheduleInfo[I].ALAP = alap;
1916 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1917 }
1918
1919 // After computing the node functions, compute the summary for each node set.
1920 for (NodeSet &I : NodeSets)
1921 I.computeNodeSetInfo(this);
1922
1923 LLVM_DEBUG({
1924 for (unsigned i = 0; i < SUnits.size(); i++) {
1925 dbgs() << "\tNode " << i << ":\n";
1926 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1927 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1928 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1929 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1930 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1931 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1932 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1933 }
1934 });
1935}
1936
/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
/// as the predecessors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such predecessor exists.
// NOTE(review): the first two lines of this static helper's signature (taking
// the NodeOrder set and the Preds result set) appear to have been lost in
// this copy -- TODO confirm/restore.
                   const NodeSet *S = nullptr) {
  Preds.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Pred : SU->Preds) {
      // When a node-set S is supplied, restrict candidates to members of S.
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Pred, true))
        continue;
      // Only nodes outside NodeOrder qualify.
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Preds.insert(Pred.getSUnit());
    }
    // Back-edges are predecessors with an anti-dependence.
    for (const SDep &Succ : SU->Succs) {
      if (Succ.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Preds.insert(Succ.getSUnit());
    }
  }
  return !Preds.empty();
}
1965
/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
/// as the successors of the elements of NodeOrder that are not also in
/// NodeOrder. Returns true if any such successor exists.
// NOTE(review): the first two lines of this static helper's signature (taking
// the NodeOrder set and the Succs result set) appear to have been lost in
// this copy -- TODO confirm/restore.
                   const NodeSet *S = nullptr) {
  Succs.clear();
  for (const SUnit *SU : NodeOrder) {
    for (const SDep &Succ : SU->Succs) {
      // When a node-set S is supplied, restrict candidates to members of S.
      if (S && S->count(Succ.getSUnit()) == 0)
        continue;
      if (ignoreDependence(Succ, false))
        continue;
      // Only nodes outside NodeOrder qualify.
      if (NodeOrder.count(Succ.getSUnit()) == 0)
        Succs.insert(Succ.getSUnit());
    }
    // Back-edges (anti dependences) act as successors in the reverse
    // direction.
    for (const SDep &Pred : SU->Preds) {
      if (Pred.getKind() != SDep::Anti)
        continue;
      if (S && S->count(Pred.getSUnit()) == 0)
        continue;
      if (NodeOrder.count(Pred.getSUnit()) == 0)
        Succs.insert(Pred.getSUnit());
    }
  }
  return !Succs.empty();
}
1993
1994/// Return true if there is a path from the specified node to any of the nodes
1995/// in DestNodes. Keep track and return the nodes in any path.
1996static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1997 SetVector<SUnit *> &DestNodes,
1998 SetVector<SUnit *> &Exclude,
1999 SmallPtrSet<SUnit *, 8> &Visited) {
2000 if (Cur->isBoundaryNode())
2001 return false;
2002 if (Exclude.contains(Cur))
2003 return false;
2004 if (DestNodes.contains(Cur))
2005 return true;
2006 if (!Visited.insert(Cur).second)
2007 return Path.contains(Cur);
2008 bool FoundPath = false;
2009 for (auto &SI : Cur->Succs)
2010 if (!ignoreDependence(SI, false))
2011 FoundPath |=
2012 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
2013 for (auto &PI : Cur->Preds)
2014 if (PI.getKind() == SDep::Anti)
2015 FoundPath |=
2016 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
2017 if (FoundPath)
2018 Path.insert(Cur);
2019 return FoundPath;
2020}
2021
/// Compute the live-out registers for the instructions in a node-set.
/// The live-out registers are those that are defined in the node-set,
/// but not used. Except for use operands of Phis.
// NOTE(review): several lines appear to have been lost in this copy: the
// first line of the signature (static helper taking MachineFunction &MF and
// RegPressureTracker &RPTracker), the local declarations (TRI/MRI and the
// Uses/LiveOutRegs containers), and the continuation lines of the two
// LiveOutRegs.push_back(...) calls (lane-mask arguments) -- TODO
// confirm/restore before building.
                            NodeSet &NS) {
  // Collect every virtual register (and allocatable physical register unit)
  // used by a non-Phi instruction in the node-set.
  for (SUnit *SU : NS) {
    const MachineInstr *MI = SU->getInstr();
    if (MI->isPHI())
      continue;
    for (const MachineOperand &MO : MI->all_uses()) {
      Register Reg = MO.getReg();
      if (Reg.isVirtual())
        Uses.insert(Reg);
      else if (MRI.isAllocatable(Reg))
        for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
          Uses.insert(Unit);
    }
  }
  // Any non-dead def that is never used inside the set is live-out.
  for (SUnit *SU : NS)
    for (const MachineOperand &MO : SU->getInstr()->all_defs())
      if (!MO.isDead()) {
        Register Reg = MO.getReg();
        if (Reg.isVirtual()) {
          if (!Uses.count(Reg))
            LiveOutRegs.push_back(RegisterMaskPair(Reg,
        } else if (MRI.isAllocatable(Reg)) {
          for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
            if (!Uses.count(Unit))
              LiveOutRegs.push_back(
        }
      }
  RPTracker.addLiveRegs(LiveOutRegs);
}
2061
/// A heuristic to filter nodes in recurrent node-sets if the register
/// pressure of a set is too high.
void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
  for (auto &NS : NodeSets) {
    // Skip small node-sets since they won't cause register pressure problems.
    if (NS.size() <= 2)
      continue;
    // Track register pressure over just this node-set, scanning bottom-up
    // from the end of the block.
    IntervalPressure RecRegPressure;
    RegPressureTracker RecRPTracker(RecRegPressure);
    RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
    computeLiveOuts(MF, RecRPTracker, NS);
    RecRPTracker.closeBottom();

    // Visit the SUnits in reverse program order (highest NodeNum first).
    std::vector<SUnit *> SUnits(NS.begin(), NS.end());
    llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
      return A->NodeNum > B->NodeNum;
    });

    for (auto &SU : SUnits) {
      // Since we're computing the register pressure for a subset of the
      // instructions in a block, we need to set the tracker for each
      // instruction in the node-set. The tracker is set to the instruction
      // just after the one we're interested in.
      // NOTE(review): the declaration of CurInstI is missing from this view
      // of the file.
      RecRPTracker.setPos(std::next(CurInstI));

      RegPressureDelta RPDelta;
      ArrayRef<PressureChange> CriticalPSets;
      RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
                                             CriticalPSets,
                                             RecRegPressure.MaxSetPressure);
      // If any pressure set would exceed its limit, mark the offending SU in
      // the node-set and stop scanning this set.
      // NOTE(review): part of the debug-stream expression below is missing
      // from this view of the file.
      if (RPDelta.Excess.isValid()) {
        LLVM_DEBUG(
            dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
                   << ":" << RPDelta.Excess.getUnitInc() << "\n");
        NS.setExceedPressure(SU);
        break;
      }
      RecRPTracker.recede();
    }
  }
}
2105
/// A heuristic to colocate node sets that have the same set of
/// successors.
void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
  unsigned Colocate = 0;
  for (int i = 0, e = NodeSets.size(); i < e; ++i) {
    NodeSet &N1 = NodeSets[i];
    // NOTE(review): the declarations of the successor sets S1/S2 are missing
    // from this view of the file.
    if (N1.empty() || !succ_L(N1, S1))
      continue;
    for (int j = i + 1; j < e; ++j) {
      NodeSet &N2 = NodeSets[j];
      // Only colocate node-sets with equal recurrence MII.
      if (N1.compareRecMII(N2) != 0)
        continue;
      if (N2.empty() || !succ_L(N2, S2))
        continue;
      // Equal-size subset check => identical successor sets: tag both sets
      // with the same colocation id and move on to the next N1.
      if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
        N1.setColocate(++Colocate);
        N2.setColocate(Colocate);
        break;
      }
    }
  }
}
2130
2131/// Check if the existing node-sets are profitable. If not, then ignore the
2132/// recurrent node-sets, and attempt to schedule all nodes together. This is
2133/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2134/// then it's best to try to schedule all instructions together instead of
2135/// starting with the recurrent node-sets.
2136void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2137 // Look for loops with a large MII.
2138 if (MII < 17)
2139 return;
2140 // Check if the node-set contains only a simple add recurrence.
2141 for (auto &NS : NodeSets) {
2142 if (NS.getRecMII() > 2)
2143 return;
2144 if (NS.getMaxDepth() > MII)
2145 return;
2146 }
2147 NodeSets.clear();
2148 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2149}
2150
/// Add the nodes that do not belong to a recurrence set into groups
/// based upon connected components.
/// NOTE(review): the declarations of the locals N, Path, and Visited are
/// missing from this view of the file, as is a statement before each
/// computePath loop (likely clearing Path).
void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
  SetVector<SUnit *> NodesAdded;
  // Add the nodes that are on a path between the previous node sets and
  // the current node set.
  for (NodeSet &I : NodeSets) {
    // Add the nodes from the current node set to the previous node set.
    if (succ_L(I, N)) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, NodesAdded, I, Visited);
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    // Add the nodes from the previous node set to the current node set.
    N.clear();
    if (succ_L(NodesAdded, N)) {
      for (SUnit *NI : N) {
        Visited.clear();
        computePath(NI, Path, I, NodesAdded, Visited);
      }
      if (!Path.empty())
        I.insert(Path.begin(), Path.end());
    }
    NodesAdded.insert(I.begin(), I.end());
  }

  // Create a new node set with the connected nodes of any successor of a node
  // in a recurrent set.
  NodeSet NewSet;
  if (succ_L(NodesAdded, N))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create a new node set with the connected nodes of any predecessor of a
  // node in a recurrent set.
  NewSet.clear();
  if (pred_L(NodesAdded, N))
    for (SUnit *I : N)
      addConnectedNodes(I, NewSet, NodesAdded);
  if (!NewSet.empty())
    NodeSets.push_back(NewSet);

  // Create new node sets with the connected nodes of any remaining node that
  // has no predecessor.
  for (SUnit &SU : SUnits) {
    if (NodesAdded.count(&SU) == 0) {
      NewSet.clear();
      addConnectedNodes(&SU, NewSet, NodesAdded);
      if (!NewSet.empty())
        NodeSets.push_back(NewSet);
    }
  }
}
2214
2215/// Add the node to the set, and add all of its connected nodes to the set.
2216void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2217 SetVector<SUnit *> &NodesAdded) {
2218 NewSet.insert(SU);
2219 NodesAdded.insert(SU);
2220 for (auto &SI : SU->Succs) {
2221 SUnit *Successor = SI.getSUnit();
2222 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2223 NodesAdded.count(Successor) == 0)
2224 addConnectedNodes(Successor, NewSet, NodesAdded);
2225 }
2226 for (auto &PI : SU->Preds) {
2227 SUnit *Predecessor = PI.getSUnit();
2228 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2229 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2230 }
2231}
2232
/// Return true if Set1 contains elements in Set2. The elements in common
/// are returned in a different container.
/// NOTE(review): the parameter line declaring the output container Result is
/// missing from this view of the file.
static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
  // Result holds exactly the intersection of Set1 and Set2.
  Result.clear();
  for (SUnit *SU : Set1) {
    if (Set2.count(SU) != 0)
      Result.insert(SU);
  }
  return !Result.empty();
}
2244
2245/// Merge the recurrence node sets that have the same initial node.
2246void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2247 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2248 ++I) {
2249 NodeSet &NI = *I;
2250 for (NodeSetType::iterator J = I + 1; J != E;) {
2251 NodeSet &NJ = *J;
2252 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2253 if (NJ.compareRecMII(NI) > 0)
2254 NI.setRecMII(NJ.getRecMII());
2255 for (SUnit *SU : *J)
2256 I->insert(SU);
2257 NodeSets.erase(J);
2258 E = NodeSets.end();
2259 } else {
2260 ++J;
2261 }
2262 }
2263 }
2264}
2265
2266/// Remove nodes that have been scheduled in previous NodeSets.
2267void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2268 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2269 ++I)
2270 for (NodeSetType::iterator J = I + 1; J != E;) {
2271 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2272
2273 if (J->empty()) {
2274 NodeSets.erase(J);
2275 E = NodeSets.end();
2276 } else {
2277 ++J;
2278 }
2279 }
2280}
2281
/// Compute an ordered list of the dependence graph nodes, which
/// indicates the order that the nodes will be scheduled. This is a
/// two-level algorithm. First, a partial order is created, which
/// consists of a list of sets ordered from highest to lowest priority.
/// NOTE(review): the declarations of the work sets R and N, part of each
/// tie-break condition, and a statement before the first pred_L call after
/// the top-down loop are missing from this view of the file.
void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
  NodeOrder.clear();

  for (auto &Nodes : NodeSets) {
    LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
    OrderKind Order;
    // Pick the starting direction and seed set R from how this node-set
    // connects to the nodes already ordered.
    if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
      R.insert(N.begin(), N.end());
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
    } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
      R.insert(N.begin(), N.end());
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (succs) ");
    } else if (isIntersect(N, Nodes, R)) {
      // If some of the successors are in the existing node-set, then use the
      // top-down ordering.
      Order = TopDown;
      LLVM_DEBUG(dbgs() << " Top down (intersect) ");
    } else if (NodeSets.size() == 1) {
      // Single node-set: seed with the leaf nodes (no successors).
      for (const auto &N : Nodes)
        if (N->Succs.size() == 0)
          R.insert(N);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (all) ");
    } else {
      // Find the node with the highest ASAP.
      SUnit *maxASAP = nullptr;
      for (SUnit *SU : Nodes) {
        if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
            (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
          maxASAP = SU;
      }
      R.insert(maxASAP);
      Order = BottomUp;
      LLVM_DEBUG(dbgs() << " Bottom up (default) ");
    }

    while (!R.empty()) {
      if (Order == TopDown) {
        // Choose the node with the maximum height. If more than one, choose
        // the node with the maximum ZeroLatencyHeight. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxHeight = nullptr;
          for (SUnit *I : R) {
            if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
              maxHeight = I;
            else if (getHeight(I) == getHeight(maxHeight) &&
                     getZeroLatencyHeight(maxHeight) &&
                     getMOV(I) < getMOV(maxHeight))
              maxHeight = I;
          }
          NodeOrder.insert(maxHeight);
          LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
          R.remove(maxHeight);
          // Grow the frontier with unordered successors inside this node-set.
          for (const auto &I : maxHeight->Succs) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            if (ignoreDependence(I, false))
              continue;
            R.insert(I.getSUnit());
          }
          // Back-edges are predecessors with an anti-dependence.
          for (const auto &I : maxHeight->Preds) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
        }
        // Top-down frontier exhausted: flip direction and reseed.
        Order = BottomUp;
        LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
        if (pred_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      } else {
        // Choose the node with the maximum depth. If more than one, choose
        // the node with the maximum ZeroLatencyDepth. If still more than one,
        // choose the node with the lowest MOV.
        while (!R.empty()) {
          SUnit *maxDepth = nullptr;
          for (SUnit *I : R) {
            if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
              maxDepth = I;
            else if (getDepth(I) == getDepth(maxDepth) &&
                     getMOV(I) < getMOV(maxDepth))
              maxDepth = I;
          }
          NodeOrder.insert(maxDepth);
          LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
          R.remove(maxDepth);
          // A node flagged for excess register pressure restarts this
          // node-set top-down from its entry node.
          if (Nodes.isExceedSU(maxDepth)) {
            Order = TopDown;
            R.clear();
            R.insert(Nodes.getNode(0));
            break;
          }
          for (const auto &I : maxDepth->Preds) {
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
          // In the bottom-up direction, back-edges are successors with an
          // anti-dependence.
          for (const auto &I : maxDepth->Succs) {
            if (I.getKind() != SDep::Anti)
              continue;
            if (Nodes.count(I.getSUnit()) == 0)
              continue;
            if (NodeOrder.contains(I.getSUnit()))
              continue;
            R.insert(I.getSUnit());
          }
        }
        // Bottom-up frontier exhausted: flip direction and reseed.
        Order = TopDown;
        LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
        if (succ_L(NodeOrder, N, &Nodes))
          R.insert(N.begin(), N.end());
      }
    }
    LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
  }

  LLVM_DEBUG({
    dbgs() << "Node order: ";
    for (SUnit *I : NodeOrder)
      dbgs() << " " << I->NodeNum << " ";
    dbgs() << "\n";
  });
}
2434
/// Process the nodes in the computed order and create the pipelined schedule
/// of the instructions, if possible. Return true if a schedule is found.
bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {

  if (NodeOrder.empty()){
    LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
    return false;
  }

  bool scheduleFound = false;
  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
  if (LimitRegPressure) {
    HRPDetector =
        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
    HRPDetector->init(RegClassInfo);
  }
  // Keep increasing II until a valid schedule is found.
  for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
    Schedule.reset();
    Schedule.setInitiationInterval(II);
    LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");

    SetVector<SUnit *>::iterator NI = NodeOrder.begin();
    SetVector<SUnit *>::iterator NE = NodeOrder.end();
    do {
      SUnit *SU = *NI;

      // Compute the schedule time for the instruction, which is based
      // upon the scheduled time for any predecessors/successors.
      int EarlyStart = INT_MIN;
      int LateStart = INT_MAX;
      Schedule.computeStart(SU, &EarlyStart, &LateStart, II, this);
      LLVM_DEBUG({
        dbgs() << "\n";
        dbgs() << "Inst (" << SU->NodeNum << ") ";
        SU->getInstr()->dump();
        dbgs() << "\n";
      });
      LLVM_DEBUG(
          dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));

      // Pick the search window/direction from which bounds were constrained:
      // only an early bound => search forward; only a late bound => search
      // backward; both => a window of at most II cycles; neither => start
      // from the instruction's ASAP time.
      if (EarlyStart > LateStart)
        scheduleFound = false;
      else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
        scheduleFound =
            Schedule.insert(SU, EarlyStart, EarlyStart + (int)II - 1, II);
      else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
        scheduleFound =
            Schedule.insert(SU, LateStart, LateStart - (int)II + 1, II);
      else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
        LateStart = std::min(LateStart, EarlyStart + (int)II - 1);
        // When scheduling a Phi it is better to start at the late cycle and
        // go backwards. The default order may insert the Phi too far away
        // from its first dependence.
        // Also, do backward search when all scheduled predecessors are
        // loop-carried output/order dependencies. Empirically, there are also
        // cases where scheduling becomes possible with backward search.
        if (SU->getInstr()->isPHI() ||
            Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, this))
          scheduleFound = Schedule.insert(SU, LateStart, EarlyStart, II);
        else
          scheduleFound = Schedule.insert(SU, EarlyStart, LateStart, II);
      } else {
        int FirstCycle = Schedule.getFirstCycle();
        scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
                                        FirstCycle + getASAP(SU) + II - 1, II);
      }

      // Even if we find a schedule, make sure the schedule doesn't exceed the
      // allowable number of stages. We keep trying if this happens.
      if (scheduleFound)
        if (SwpMaxStages > -1 &&
            Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
          scheduleFound = false;

      LLVM_DEBUG({
        if (!scheduleFound)
          dbgs() << "\tCan't schedule\n";
      });
    } while (++NI != NE && scheduleFound);

    // If a schedule is found, ensure non-pipelined instructions are in stage 0
    if (scheduleFound)
      scheduleFound =
          Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);

    // If a schedule is found, check if it is a valid schedule too.
    if (scheduleFound)
      scheduleFound = Schedule.isValidSchedule(this);

    // If a schedule was found and the option is enabled, check if the schedule
    // might generate additional register spills/fills.
    if (scheduleFound && LimitRegPressure)
      scheduleFound =
          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
  }

  LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
                    << " (II=" << Schedule.getInitiationInterval()
                    << ")\n");

  // Give the target a final veto on the schedule.
  if (scheduleFound) {
    scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
    if (!scheduleFound)
      LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
  }

  if (scheduleFound) {
    Schedule.finalizeSchedule(this);
    Pass.ORE->emit([&]() {
      // NOTE(review): the line constructing the optimization-remark object
      // is missing from this view of the file.
          DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
             << "Schedule found with Initiation Interval: "
             << ore::NV("II", Schedule.getInitiationInterval())
             << ", MaxStageCount: "
             << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
    });
  } else
    Schedule.reset();

  // Pipelining only pays off when the kernel spans more than one stage.
  return scheduleFound && Schedule.getMaxStageCount() > 0;
}
2557
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
  // NOTE(review): a line is missing from this view of the file here (TRI is
  // used below without a visible local declaration).
  const MachineOperand *BaseOp;
  int64_t Offset;
  bool OffsetIsScalable;
  if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
    return false;

  // FIXME: This algorithm assumes instructions have fixed-size offsets.
  if (OffsetIsScalable)
    return false;

  if (!BaseOp->isReg())
    return false;

  Register BaseReg = BaseOp->getReg();

  // Check if there is a Phi. If so, get the definition in the loop.
  MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
  if (BaseDef && BaseDef->isPHI()) {
    BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
    BaseDef = MRI.getVRegDef(BaseReg);
  }
  if (!BaseDef)
    return false;

  int D = 0;
  // NOTE(review): the `&& D >= 0` here is suspicious. D starts at 0, so this
  // condition only rejects when getIncrementValue fails; a successful call
  // that yields a negative D falls through and the negative value is stored
  // into the unsigned Delta. The intent reads like `|| D < 0` — confirm
  // against upstream before changing.
  if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
    return false;

  Delta = D;
  return true;
}
2594
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
/// v1 = Phi(v0, v3)
/// v2 = load v1, 0
/// v3 = post_store v1, 4, x
/// This function enables the load to be rewritten as v2 = load v3, 4.
bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
                                              unsigned &BasePos,
                                              unsigned &OffsetPos,
                                              unsigned &NewBase,
                                              int64_t &Offset) {
  // Get the load instruction. It must not itself be post-increment.
  if (TII->isPostIncrement(*MI))
    return false;
  unsigned BasePosLd, OffsetPosLd;
  if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
    return false;
  Register BaseReg = MI->getOperand(BasePosLd).getReg();

  // Look for the Phi instruction.
  MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
  MachineInstr *Phi = MRI.getVRegDef(BaseReg);
  if (!Phi || !Phi->isPHI())
    return false;
  // Get the register defined in the loop block.
  unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
  if (!PrevReg)
    return false;

  // Check for the post-increment load/store instruction.
  MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
  if (!PrevDef || PrevDef == MI)
    return false;

  if (!TII->isPostIncrement(*PrevDef))
    return false;

  unsigned BasePos1 = 0, OffsetPos1 = 0;
  if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
    return false;

  // Make sure that the instructions do not access the same memory location in
  // the next iteration.
  int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
  int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
  // NOTE(review): the line cloning MI into the temporary NewMI is missing
  // from this view of the file; NewMI is created, probed for disjointness,
  // and deleted below.
  NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
  bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
  MF.deleteMachineInstr(NewMI);
  if (!Disjoint)
    return false;

  // Set the return value once we determine that we return true.
  BasePos = BasePosLd;
  OffsetPos = OffsetPosLd;
  NewBase = PrevReg;
  Offset = StoreOffset;
  return true;
}
2655
/// Apply changes to the instruction if needed. The changes are needed
/// to improve the scheduling and depend upon the final schedule.
/// NOTE(review): the first signature line, the iterator declaration before
/// InstrChanges.find, and the line cloning MI into NewMI are missing from
/// this view of the file.
                                     SMSchedule &Schedule) {
  SUnit *SU = getSUnit(MI);
      InstrChanges.find(SU);
  if (It != InstrChanges.end()) {
    // RegAndOffset = (replacement base register, per-stage offset delta).
    std::pair<unsigned, int64_t> RegAndOffset = It->second;
    unsigned BasePos, OffsetPos;
    if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
      return;
    Register BaseReg = MI->getOperand(BasePos).getReg();
    MachineInstr *LoopDef = findDefInLoop(BaseReg);
    int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
    int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
    int BaseStageNum = Schedule.stageScheduled(SU);
    int BaseCycleNum = Schedule.cycleScheduled(SU);
    // Only rewrite when the base is defined in a later stage than its use.
    if (BaseStageNum < DefStageNum) {
      int OffsetDiff = DefStageNum - BaseStageNum;
      if (DefCycleNum < BaseCycleNum) {
        NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
        if (OffsetDiff > 0)
          --OffsetDiff;
      }
      // Compensate the offset by the per-stage delta times the stage gap.
      int64_t NewOffset =
          MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
      NewMI->getOperand(OffsetPos).setImm(NewOffset);
      // Swap the SUnit over to the rewritten instruction and remember the
      // mapping so the change can be undone/looked up later.
      SU->setInstr(NewMI);
      MISUnitMap[NewMI] = SU;
      NewMIs[MI] = NewMI;
    }
  }
}
2691
/// Return the instruction in the loop that defines the register.
/// If the definition is a Phi, then follow the Phi operand to
/// the instruction in the loop.
/// NOTE(review): the declaration of Visited is missing from this view of
/// the file.
MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
  MachineInstr *Def = MRI.getVRegDef(Reg);
  while (Def->isPHI()) {
    // Stop on a Phi cycle (no non-Phi definition reachable).
    if (!Visited.insert(Def).second)
      break;
    // Phi operands come in (value, block) pairs; follow the incoming value
    // from the loop block BB.
    for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
      if (Def->getOperand(i + 1).getMBB() == BB) {
        Def = MRI.getVRegDef(Def->getOperand(i).getReg());
        break;
      }
  }
  return Def;
}
2709
/// Return true for an order or output dependence that is loop carried
/// potentially. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
/// NOTE(review): the function signature line and a guard statement (the
/// condition that precedes the bare `return true;` below) are missing from
/// this view of the file.
                                         bool isSucc) {
  // Only order/output dependences between real (non-boundary, non-artificial)
  // nodes can be loop carried.
  if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
      Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
    return false;

    return true;

  // Output dependences are conservatively treated as loop carried.
  if (Dep.getKind() == SDep::Output)
    return true;

  MachineInstr *SI = Source->getInstr();
  MachineInstr *DI = Dep.getSUnit()->getInstr();
  if (!isSucc)
    std::swap(SI, DI);
  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");

  // Assume ordered loads and stores may have a loop carried dependence.
  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
    return true;

  // Non-memory order dependences cannot be loop carried past this point.
  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
    return false;

  // The conservative assumption is that a dependence between memory operations
  // may be loop carried. The following code checks when it can be proved that
  // there is no loop carried dependence.
  unsigned DeltaS, DeltaD;
  if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
    return true;

  const MachineOperand *BaseOpS, *BaseOpD;
  int64_t OffsetS, OffsetD;
  bool OffsetSIsScalable, OffsetDIsScalable;
  if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
                                    TRI) ||
      !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
                                    TRI))
    return true;

  assert(!OffsetSIsScalable && !OffsetDIsScalable &&
         "Expected offsets to be byte offsets");

  // Both base registers must come from Phis so the per-iteration behavior
  // is analyzable.
  MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
  MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
  if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
    return true;

  unsigned InitValS = 0;
  unsigned LoopValS = 0;
  unsigned InitValD = 0;
  unsigned LoopValD = 0;
  getPhiRegs(*DefS, BB, InitValS, LoopValS);
  getPhiRegs(*DefD, BB, InitValD, LoopValD);
  MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
  MachineInstr *InitDefD = MRI.getVRegDef(InitValD);

  // Both bases must start from identical initial values.
  if (!InitDefS->isIdenticalTo(*InitDefD))
    return true;

  // Check that the base register is incremented by a constant value for each
  // iteration.
  MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
  int D = 0;
  if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
    return true;

  LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
  LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();

  // This is the main test, which checks the offset values and the loop
  // increment value to determine if the accesses may be loop carried.
  if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
    return true;

  if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
      DeltaD < AccessSizeD.getValue())
    return true;

  return (OffsetS + (int64_t)AccessSizeS.getValue() <
          OffsetD + (int64_t)AccessSizeD.getValue());
}
2799
2800void SwingSchedulerDAG::postProcessDAG() {
2801 for (auto &M : Mutations)
2802 M->apply(this);
2803}
2804
2805/// Try to schedule the node at the specified StartCycle and continue
2806/// until the node is schedule or the EndCycle is reached. This function
2807/// returns true if the node is scheduled. This routine may search either
2808/// forward or backward for a place to insert the instruction based upon
2809/// the relative values of StartCycle and EndCycle.
2810bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2811 bool forward = true;
2812 LLVM_DEBUG({
2813 dbgs() << "Trying to insert node between " << StartCycle << " and "
2814 << EndCycle << " II: " << II << "\n";
2815 });
2816 if (StartCycle > EndCycle)
2817 forward = false;
2818
2819 // The terminating condition depends on the direction.
2820 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2821 for (int curCycle = StartCycle; curCycle != termCycle;
2822 forward ? ++curCycle : --curCycle) {
2823
2824 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2825 ProcItinResources.canReserveResources(*SU, curCycle)) {
2826 LLVM_DEBUG({
2827 dbgs() << "\tinsert at cycle " << curCycle << " ";
2828 SU->getInstr()->dump();
2829 });
2830
2831 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2832 ProcItinResources.reserveResources(*SU, curCycle);
2833 ScheduledInstrs[curCycle].push_back(SU);
2834 InstrToCycle.insert(std::make_pair(SU, curCycle));
2835 if (curCycle > LastCycle)
2836 LastCycle = curCycle;
2837 if (curCycle < FirstCycle)
2838 FirstCycle = curCycle;
2839 return true;
2840 }
2841 LLVM_DEBUG({
2842 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2843 SU->getInstr()->dump();
2844 });
2845 }
2846 return false;
2847}
2848
// Return the cycle of the earliest scheduled instruction in the chain.
// NOTE(review): the function signature line and the declaration of Visited
// are missing from this view of the file.
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int EarlyCycle = INT_MAX;
  // Walk backward through the chain of order/output dependences, recording
  // the minimum scheduled cycle of any visited node.
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *PrevSU = Cur.getSUnit();
    if (Visited.count(PrevSU))
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
    // Unscheduled nodes contribute nothing and are not expanded.
    if (it == InstrToCycle.end())
      continue;
    EarlyCycle = std::min(EarlyCycle, it->second);
    for (const auto &PI : PrevSU->Preds)
      if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
        Worklist.push_back(PI);
    Visited.insert(PrevSU);
  }
  return EarlyCycle;
}
2871
// Return the cycle of the latest scheduled instruction in the chain.
// NOTE(review): the function signature line and the declaration of Visited
// are missing from this view of the file.
  SmallVector<SDep, 8> Worklist;
  Worklist.push_back(Dep);
  int LateCycle = INT_MIN;
  // Walk forward through the chain of order/output dependences, recording
  // the maximum scheduled cycle of any visited node.
  while (!Worklist.empty()) {
    const SDep &Cur = Worklist.pop_back_val();
    SUnit *SuccSU = Cur.getSUnit();
    if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
      continue;
    std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
    // Unscheduled nodes contribute nothing and are not expanded.
    if (it == InstrToCycle.end())
      continue;
    LateCycle = std::max(LateCycle, it->second);
    for (const auto &SI : SuccSU->Succs)
      if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
        Worklist.push_back(SI);
    Visited.insert(SuccSU);
  }
  return LateCycle;
}
2894
/// If an instruction has a use that spans multiple iterations, then return
/// the Phi on the back-edge (nullptr otherwise). These instructions are
/// characterized by having a back-edge to a Phi, which contains a reference
/// to another Phi.
/// NOTE(review): the function signature line is missing from this view of
/// the file.
  for (auto &P : SU->Preds)
    if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
      for (auto &S : P.getSUnit()->Succs)
        if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
          return P.getSUnit();
  return nullptr;
}
2906
2907/// Compute the scheduling start slot for the instruction. The start slot
2908/// depends on any predecessor or successor nodes scheduled already.
2909void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2910 int II, SwingSchedulerDAG *DAG) {
2911 // Iterate over each instruction that has been scheduled already. The start
2912 // slot computation depends on whether the previously scheduled instruction
2913 // is a predecessor or successor of the specified instruction.
2914 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2915
2916 // Iterate over each instruction in the current cycle.
2917 for (SUnit *I : getInstructions(cycle)) {
2918 // Because we're processing a DAG for the dependences, we recognize
2919 // the back-edge in recurrences by anti dependences.
2920 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2921 const SDep &Dep = SU->Preds[i];
2922 if (Dep.getSUnit() == I) {
2923 if (!DAG->isBackedge(SU, Dep)) {
2924 int EarlyStart = cycle + Dep.getLatency() -
2925 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2926 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2927 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2928 int End = earliestCycleInChain(Dep) + (II - 1);
2929 *MinLateStart = std::min(*MinLateStart, End);
2930 }
2931 } else {
2932 int LateStart = cycle - Dep.getLatency() +
2933 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2934 *MinLateStart = std::min(*MinLateStart, LateStart);
2935 }
2936 }
2937 // For instruction that requires multiple iterations, make sure that
2938 // the dependent instruction is not scheduled past the definition.
2939 SUnit *BE = multipleIterations(I, DAG);
2940 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2941 !SU->isPred(I))
2942 *MinLateStart = std::min(*MinLateStart, cycle);
2943 }
2944 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2945 if (SU->Succs[i].getSUnit() == I) {
2946 const SDep &Dep = SU->Succs[i];
2947 if (!DAG->isBackedge(SU, Dep)) {
2948 int LateStart = cycle - Dep.getLatency() +
2949 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2950 *MinLateStart = std::min(*MinLateStart, LateStart);
2951 if (DAG->isLoopCarriedDep(SU, Dep)) {
2952 int Start = latestCycleInChain(Dep) + 1 - II;
2953 *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
2954 }
2955 } else {
2956 int EarlyStart = cycle + Dep.getLatency() -
2957 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2958 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2959 }
2960 }
2961 }
2962 }
2963 }
2964}
2965
/// Order the instructions within a cycle so that the definitions occur
/// before the uses. Returns true if the instruction is added to the start
/// of the list, or false if added to the end.
// NOTE(review): the first line of this signature is missing from this
// excerpt; the recursive calls below show it is orderDependence(SSD, SU,
// Insts) — presumably a member of SMSchedule. Confirm against the header.
    std::deque<SUnit *> &Insts) const {
  MachineInstr *MI = SU->getInstr();

  // Placement decisions accumulated while scanning the existing order:
  //  - OrderBeforeUse: SU must precede some instruction already in Insts.
  //  - OrderAfterDef:  SU must follow some instruction already in Insts.
  //  - OrderBeforeDef: SU must precede a loop-carried def (weakest flag).
  bool OrderBeforeUse = false;
  bool OrderAfterDef = false;
  bool OrderBeforeDef = false;
  // Positions within Insts of the conflicting def/use that set the flags.
  unsigned MoveDef = 0;
  unsigned MoveUse = 0;
  int StageInst1 = stageScheduled(SU);

  unsigned Pos = 0;
  for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
       ++I, ++Pos) {
    for (MachineOperand &MO : MI->operands()) {
      if (!MO.isReg() || !MO.getReg().isVirtual())
        continue;

      Register Reg = MO.getReg();
      unsigned BasePos, OffsetPos;
      // If the pipeliner rewrote this instruction's base register, compare
      // against the new base register instead of the original operand.
      if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
        if (MI->getOperand(BasePos).getReg() == Reg)
          if (unsigned NewReg = SSD->getInstrBaseReg(SU))
            Reg = NewReg;
      // Does the already-placed instruction (*I) read and/or write Reg?
      bool Reads, Writes;
      std::tie(Reads, Writes) =
          (*I)->getInstr()->readsWritesVirtualRegister(Reg);
      if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
        // SU defines a value that an earlier-or-same-stage instruction
        // reads: the def must come first.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
        // Add the instruction after the scheduled instruction.
        OrderAfterDef = true;
        MoveDef = Pos;
      } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
        if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
          OrderBeforeUse = true;
          if (MoveUse == 0)
            MoveUse = Pos;
        } else {
          OrderAfterDef = true;
          MoveDef = Pos;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
        if (MoveUse != 0) {
          OrderAfterDef = true;
          MoveDef = Pos - 1;
        }
      } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
        // Add the instruction before the scheduled instruction.
        OrderBeforeUse = true;
        if (MoveUse == 0)
          MoveUse = Pos;
      } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
                 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
        // Loop-carried def of this use: prefer placing SU first, but this
        // is overridden below if a stronger OrderAfterDef was also seen.
        if (MoveUse == 0) {
          OrderBeforeDef = true;
          MoveUse = Pos;
        }
      }
    }
    // Check for order dependences between instructions. Make sure the source
    // is ordered before the destination.
    for (auto &S : SU->Succs) {
      if (S.getSUnit() != *I)
        continue;
      if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if (Pos < MoveUse)
          MoveUse = Pos;
      }
      // We did not handle HW dependences in the previous for loop, and we
      // normally set Latency = 0 for Anti deps, so we may have nodes in the
      // same cycle that are Anti dependent on HW regs.
      else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
        OrderBeforeUse = true;
        if ((MoveUse == 0) || (Pos < MoveUse))
          MoveUse = Pos;
      }
    }
    for (auto &P : SU->Preds) {
      if (P.getSUnit() != *I)
        continue;
      if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
        OrderAfterDef = true;
        MoveDef = Pos;
      }
    }
  }

  // A circular dependence.
  if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
    OrderBeforeUse = false;

  // OrderAfterDef takes precedence over OrderBeforeDef. The latter is due
  // to a loop-carried dependence.
  if (OrderBeforeDef)
    OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);

  // The uncommon case when the instruction order needs to be updated because
  // there is both a use and def: pull out the conflicting instructions and
  // re-insert them (and SU) recursively so each lands in a legal position.
  if (OrderBeforeUse && OrderAfterDef) {
    SUnit *UseSU = Insts.at(MoveUse);
    SUnit *DefSU = Insts.at(MoveDef);
    if (MoveUse > MoveDef) {
      // Erase the higher index first so the lower index stays valid.
      Insts.erase(Insts.begin() + MoveUse);
      Insts.erase(Insts.begin() + MoveDef);
    } else {
      Insts.erase(Insts.begin() + MoveDef);
      Insts.erase(Insts.begin() + MoveUse);
    }
    orderDependence(SSD, UseSU, Insts);
    orderDependence(SSD, SU, Insts);
    orderDependence(SSD, DefSU, Insts);
    return;
  }
  // Put the new instruction first if there is a use in the list. Otherwise,
  // put it at the end of the list.
  if (OrderBeforeUse)
    Insts.push_front(SU);
  else
    Insts.push_back(SU);
}
3095
/// Return true if the scheduled Phi has a loop carried operand.
// NOTE(review): the first line of this signature is missing from this
// excerpt; the call sites below show the function is isLoopCarried(SSD, Phi).
                                MachineInstr &Phi) const {
  if (!Phi.isPHI())
    return false;
  // NOTE(review): this assert is redundant — the early return above already
  // guarantees Phi.isPHI() holds here.
  assert(Phi.isPHI() && "Expecting a Phi.");
  SUnit *DefSU = SSD->getSUnit(&Phi);
  unsigned DefCycle = cycleScheduled(DefSU);
  int DefStage = stageScheduled(DefSU);

  // Find the instruction that produces the phi's loop-carried value.
  unsigned InitVal = 0;
  unsigned LoopVal = 0;
  getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
  SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
  // Conservatively report loop-carried when the producer has no SUnit or is
  // itself a phi.
  if (!UseSU)
    return true;
  if (UseSU->getInstr()->isPHI())
    return true;
  // Loop-carried if the producer is scheduled after the phi's cycle, or not
  // in a strictly later stage.
  unsigned LoopCycle = cycleScheduled(UseSU);
  int LoopStage = stageScheduled(UseSU);
  return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
}
3118
/// Return true if the instruction is a definition that is loop carried
/// and defines the use on the next iteration.
///        v1 = phi(v2, v3)
///  (Def) v3 = op v1
///  (MO)   = v1
/// If MO appears before Def, then v1 and v3 may get assigned to the same
/// register.
// NOTE(review): the first line of this signature is missing from this
// excerpt; only the trailing parameters are visible.
                                       MachineInstr *Def,
                                       MachineOperand &MO) const {
  if (!MO.isReg())
    return false;
  // A phi cannot be the loop-carried definition itself.
  if (Def->isPHI())
    return false;
  // MO must be produced by a phi in the same block as Def.
  MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
  if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
    return false;
  if (!isLoopCarried(SSD, *Phi))
    return false;
  // Loop-carried only if Def writes the phi's loop-carried (back-edge) value.
  unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
  for (MachineOperand &DMO : Def->all_defs()) {
    if (DMO.getReg() == LoopReg)
      return true;
  }
  return false;
}
3145
/// Return true if all scheduled predecessors are loop-carried output/order
/// dependencies.
// NOTE(review): the first line of this signature is missing from this
// excerpt; only the trailing parameters are visible.
    SUnit *SU, SwingSchedulerDAG *DAG) const {
  // Any already-scheduled predecessor reached through a forward
  // (non-backedge) dependence disqualifies SU.
  for (const SDep &Pred : SU->Preds)
    if (InstrToCycle.count(Pred.getSUnit()) && !DAG->isBackedge(SU, Pred))
      return false;
  // Likewise any already-scheduled successor reached through a backedge.
  for (const SDep &Succ : SU->Succs)
    if (InstrToCycle.count(Succ.getSUnit()) && DAG->isBackedge(SU, Succ))
      return false;
  return true;
}
3158
/// Determine transitive dependences of unpipelineable instructions
// NOTE(review): the signature line appears to have been elided from this
// excerpt; the function computes and returns the DoNotPipeline set below,
// using a DAG (SSD) and the target's PipelinerLoopInfo (PLI).
  SmallSet<SUnit *, 8> DoNotPipeline;
  SmallVector<SUnit *, 8> Worklist;

  // Seed the worklist with the instructions the target asked to exclude.
  for (auto &SU : SSD->SUnits)
    if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
      Worklist.push_back(&SU);

  // Transitively mark every predecessor — and, for phis, every
  // anti-dependent successor — as not pipelineable.
  while (!Worklist.empty()) {
    auto SU = Worklist.pop_back_val();
    if (DoNotPipeline.count(SU))
      continue;
    LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
    DoNotPipeline.insert(SU);
    for (auto &Dep : SU->Preds)
      Worklist.push_back(Dep.getSUnit());
    if (SU->getInstr()->isPHI())
      for (auto &Dep : SU->Succs)
        if (Dep.getKind() == SDep::Anti)
          Worklist.push_back(Dep.getSUnit());
  }
  return DoNotPipeline;
}
3184
// Determine all instructions upon which any unpipelineable instruction depends
// and ensure that they are in stage 0. If unable to do so, return false.
// NOTE(review): the signature and the computation of 'DNP' (the set of
// do-not-pipeline nodes, presumably from computeUnpipelineableNodes) appear
// to have been elided from this excerpt.

  int NewLastCycle = INT_MIN;
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.isInstr())
      continue;
    // Pipelineable nodes, or nodes already in stage 0, keep their cycle;
    // just track the latest cycle seen.
    if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
      NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
      continue;
    }

    // Put the non-pipelined instruction as early as possible in the schedule
    int NewCycle = getFirstCycle();
    for (auto &Dep : SU.Preds)
      NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);

    // Move the instruction between the per-cycle lists if its cycle changed.
    int OldCycle = InstrToCycle[&SU];
    if (OldCycle != NewCycle) {
      InstrToCycle[&SU] = NewCycle;
      auto &OldS = getInstructions(OldCycle);
      llvm::erase(OldS, &SU);
      getInstructions(NewCycle).emplace_back(&SU);
      LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
                        << ") is not pipelined; moving from cycle " << OldCycle
                        << " to " << NewCycle << " Instr:" << *SU.getInstr());
    }
    NewLastCycle = std::max(NewLastCycle, NewCycle);
  }
  LastCycle = NewLastCycle;
  return true;
}
3220
// Check if the generated schedule is valid. This function checks if
// an instruction that uses a physical register is scheduled in a
// different stage than the definition. The pipeliner does not handle
// physical register values that may cross a basic block boundary.
// Furthermore, if a physical def/use pair is assigned to the same
// cycle, orderDependence does not guarantee def/use ordering, so that
// case should be considered invalid. (The test checks for both
// earlier and same-cycle use to be more robust.)
// NOTE(review): the signature line appears to have been elided from this
// excerpt; the body below is a validity check over SSD->SUnits returning
// bool.
  for (SUnit &SU : SSD->SUnits) {
    if (!SU.hasPhysRegDefs)
      continue;
    int StageDef = stageScheduled(&SU);
    int CycleDef = InstrToCycle[&SU];
    assert(StageDef != -1 && "Instruction should have been scheduled.");
    for (auto &SI : SU.Succs)
      if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
        if (Register::isPhysicalRegister(SI.getReg())) {
          // Physical-register def and use must be in the same stage...
          if (stageScheduled(SI.getSUnit()) != StageDef)
            return false;
          // ...and the use must be in a strictly later cycle than the def.
          if (InstrToCycle[SI.getSUnit()] <= CycleDef)
            return false;
        }
  }
  return true;
}
3247
3248/// A property of the node order in swing-modulo-scheduling is
3249/// that for nodes outside circuits the following holds:
3250/// none of them is scheduled after both a successor and a
3251/// predecessor.
3252/// The method below checks whether the property is met.
3253/// If not, debug information is printed and statistics information updated.
3254/// Note that we do not use an assert statement.
3255/// The reason is that although an invalid node oder may prevent
3256/// the pipeliner from finding a pipelined schedule for arbitrary II,
3257/// it does not lead to the generation of incorrect code.
3258void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3259
3260 // a sorted vector that maps each SUnit to its index in the NodeOrder
3261 typedef std::pair<SUnit *, unsigned> UnitIndex;
3262 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3263
3264 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3265 Indices.push_back(std::make_pair(NodeOrder[i], i));
3266
3267 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3268 return std::get<0>(i1) < std::get<0>(i2);
3269 };
3270
3271 // sort, so that we can perform a binary search
3272 llvm::sort(Indices, CompareKey);
3273
3274 bool Valid = true;
3275 (void)Valid;
3276 // for each SUnit in the NodeOrder, check whether
3277 // it appears after both a successor and a predecessor
3278 // of the SUnit. If this is the case, and the SUnit
3279 // is not part of circuit, then the NodeOrder is not
3280 // valid.
3281 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3282 SUnit *SU = NodeOrder[i];
3283 unsigned Index = i;
3284
3285 bool PredBefore = false;
3286 bool SuccBefore = false;
3287
3288 SUnit *Succ;
3289 SUnit *Pred;
3290 (void)Succ;
3291 (void)Pred;
3292
3293 for (SDep &PredEdge : SU->Preds) {
3294 SUnit *PredSU = PredEdge.getSUnit();
3295 unsigned PredIndex = std::get<1>(
3296 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3297 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3298 PredBefore = true;
3299 Pred = PredSU;
3300 break;
3301 }
3302 }
3303
3304 for (SDep &SuccEdge : SU->Succs) {
3305 SUnit *SuccSU = SuccEdge.getSUnit();
3306 // Do not process a boundary node, it was not included in NodeOrder,
3307 // hence not in Indices either, call to std::lower_bound() below will
3308 // return Indices.end().
3309 if (SuccSU->isBoundaryNode())
3310 continue;
3311 unsigned SuccIndex = std::get<1>(
3312 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3313 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3314 SuccBefore = true;
3315 Succ = SuccSU;
3316 break;
3317 }
3318 }
3319
3320 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3321 // instructions in circuits are allowed to be scheduled
3322 // after both a successor and predecessor.
3323 bool InCircuit = llvm::any_of(
3324 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3325 if (InCircuit)
3326 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3327 else {
3328 Valid = false;
3329 NumNodeOrderIssues++;
3330 LLVM_DEBUG(dbgs() << "Predecessor ";);
3331 }
3332 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3333 << " are scheduled before node " << SU->NodeNum
3334 << "\n";);
3335 }
3336 }
3337
3338 LLVM_DEBUG({
3339 if (!Valid)
3340 dbgs() << "Invalid node order found!\n";
3341 });
3342}
3343
/// Attempt to fix the degenerate cases when the instruction serialization
/// causes the register lifetimes to overlap. For example,
///   p' = store_pi(p, b)
///      = load p, offset
/// In this case p and p' overlap, which means that two registers are needed.
/// Instead, this function changes the load to use p' and updates the offset.
void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
  // OverlapReg/NewBaseReg carry the tied use/def pair (p / p') found in a
  // previous instruction, to be matched against a later use of p.
  unsigned OverlapReg = 0;
  unsigned NewBaseReg = 0;
  for (SUnit *SU : Instrs) {
    MachineInstr *MI = SU->getInstr();
    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      // Look for an instruction that uses p. The instruction occurs in the
      // same cycle but occurs later in the serialized order.
      if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
        // Check that the instruction appears in the InstrChanges structure,
        // which contains instructions that can have the offset updated.
        // NOTE(review): the declaration of iterator 'It' appears to have
        // been elided from this excerpt; only the initializer remains.
            InstrChanges.find(SU);
        if (It != InstrChanges.end()) {
          unsigned BasePos, OffsetPos;
          // Update the base register and adjust the offset.
          if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
            // NOTE(review): the line creating 'NewMI' (a clone of MI)
            // appears to have been elided from this excerpt.
            NewMI->getOperand(BasePos).setReg(NewBaseReg);
            int64_t NewOffset =
                MI->getOperand(OffsetPos).getImm() - It->second.second;
            NewMI->getOperand(OffsetPos).setImm(NewOffset);
            SU->setInstr(NewMI);
            MISUnitMap[NewMI] = SU;
            NewMIs[MI] = NewMI;
          }
        }
        // Reset after the first matching use; only one fixup per pair.
        OverlapReg = 0;
        NewBaseReg = 0;
        break;
      }
      // Look for an instruction of the form p' = op(p), which uses and defines
      // two virtual registers that get allocated to the same physical register.
      unsigned TiedUseIdx = 0;
      if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
        // OverlapReg is p in the example above.
        OverlapReg = MI->getOperand(TiedUseIdx).getReg();
        // NewBaseReg is p' in the example above.
        NewBaseReg = MI->getOperand(i).getReg();
        break;
      }
    }
  }
}
3395
std::deque<SUnit *>
// NOTE(review): the line naming this function and its first parameter
// appears to have been elided from this excerpt; the result is a reordered
// copy of Instrs with phis first.
    const std::deque<SUnit *> &Instrs) const {
  // Phis come first, keeping their existing relative order.
  std::deque<SUnit *> NewOrderPhi;
  for (SUnit *SU : Instrs) {
    if (SU->getInstr()->isPHI())
      NewOrderPhi.push_back(SU);
  }
  // Non-phis are inserted one at a time via orderDependence so that
  // definitions are placed before their uses within the cycle.
  std::deque<SUnit *> NewOrderI;
  for (SUnit *SU : Instrs) {
    if (!SU->getInstr()->isPHI())
      orderDependence(SSD, SU, NewOrderI);
  }
  llvm::append_range(NewOrderPhi, NewOrderI);
  return NewOrderPhi;
}
3412
/// After the schedule has been formed, call this function to combine
/// the instructions from the different stages/cycles. That is, this
/// function creates a schedule that represents a single iteration.
// NOTE(review): the signature line appears to have been elided from this
// excerpt.
  // Move all instructions to the first stage from later stages.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
         ++stage) {
      std::deque<SUnit *> &cycleInstrs =
          ScheduledInstrs[cycle + (stage * InitiationInterval)];
      // Prepend in reverse so the stage's internal order is preserved.
      for (SUnit *SU : llvm::reverse(cycleInstrs))
        ScheduledInstrs[cycle].push_front(SU);
    }
  }

  // Erase all the elements in the later stages. Only one iteration should
  // remain in the scheduled list, and it contains all the instructions.
  for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
    ScheduledInstrs.erase(cycle);

  // Change the registers in instruction as specified in the InstrChanges
  // map. We need to use the new registers to create the correct order.
  for (const SUnit &SU : SSD->SUnits)
    SSD->applyInstrChange(SU.getInstr(), *this);

  // Reorder the instructions in each cycle to fix and improve the
  // generated code.
  for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
    std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
    cycleInstrs = reorderInstructions(SSD, cycleInstrs);
    SSD->fixupRegisterOverlaps(cycleInstrs);
  }

  LLVM_DEBUG(dump(););
}
3448
// Print a summary line (node count, RecMII, MaxMOV, MaxDepth, Colocate)
// followed by one line per node in the set.
// NOTE(review): the signature line appears to have been elided from this
// excerpt; the body writes to an output stream 'os'.
  os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
     << " depth " << MaxDepth << " col " << Colocate << "\n";
  for (const auto &I : Nodes)
    os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
  os << "\n";
}
3456
3457#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the schedule information to the given output.
// NOTE(review): the signature line appears to have been elided from this
// excerpt; the body writes to an output stream 'os'.
  // Iterate over each cycle.
  for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
    // Iterate over each instruction in the cycle.
    // NOTE(review): cycleInstrs->second is dereferenced without checking
    // against ScheduledInstrs.end() — presumably every cycle in the range
    // has an entry; confirm against the container's invariants.
    const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
    for (SUnit *CI : cycleInstrs->second) {
      os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
      os << "(" << CI->NodeNum << ") ";
      CI->getInstr()->print(os);
      os << "\n";
    }
  }
}
3472
3473/// Utility function used for debugging to print the schedule.
3476
3477void ResourceManager::dumpMRT() const {
3478 LLVM_DEBUG({
3479 if (UseDFA)
3480 return;
3481 std::stringstream SS;
3482 SS << "MRT:\n";
3483 SS << std::setw(4) << "Slot";
3484 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3485 SS << std::setw(3) << I;
3486 SS << std::setw(7) << "#Mops"
3487 << "\n";
3488 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3489 SS << std::setw(4) << Slot;
3490 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3491 SS << std::setw(3) << MRT[Slot][I];
3492 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3493 }
3494 dbgs() << SS.str();
3495 });
3496}
3497#endif
3498
// Build a bitmask for every processor resource: units get a single bit each;
// groups get their own bit OR'ed with the bits of all their sub-units.
// NOTE(review): the first line of this signature is missing from this
// excerpt; only the trailing parameters are visible.
    const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
  unsigned ProcResourceID = 0;

  // We currently limit the resource kinds to 64 and below so that we can use
  // uint64_t for Masks
  assert(SM.getNumProcResourceKinds() < 64 &&
         "Too many kinds of resources, unsupported");
  // Create a unique bitmask for every processor resource unit.
  // Skip resource at index 0, since it always references 'InvalidUnit'.
  Masks.resize(SM.getNumProcResourceKinds());
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    // NOTE(review): the declaration of 'Desc' appears to have been elided
    // from this excerpt — presumably *SM.getProcResource(I).
    if (Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    ProcResourceID++;
  }
  // Create a unique bitmask for every processor resource group.
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    // NOTE(review): the declaration of 'Desc' appears elided here as well.
    if (!Desc.SubUnitsIdxBegin)
      continue;
    Masks[I] = 1ULL << ProcResourceID;
    // A group's mask includes the masks of all its sub-units.
    for (unsigned U = 0; U < Desc.NumUnits; ++U)
      Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
    ProcResourceID++;
  }
  // Optional dump, guarded by the -pipeliner-show-mask option.
  LLVM_DEBUG({
    if (SwpShowResMask) {
      dbgs() << "ProcResourceDesc:\n";
      for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
        const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
        dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
                         ProcResource->Name, I, Masks[I],
                         ProcResource->NumUnits);
      }
      dbgs() << " -----------------\n";
    }
  });
}
3540
// Return true if SU's resources can be booked at the given cycle without
// overbooking any processor resource.
// NOTE(review): the signature line appears to have been elided from this
// excerpt (presumably canReserveResources(SUnit &SU, int Cycle)).
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "canReserveResources:\n";
  });
  // DFA model: ask the packetizer state for this cycle's slot.
  if (UseDFA)
    return DFAResources[positiveModulo(Cycle, InitiationInterval)]
        ->canReserveResources(&SU.getInstr()->getDesc());

  const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
  if (!SCDesc->isValid()) {
    LLVM_DEBUG({
      dbgs() << "No valid Schedule Class Desc for schedClass!\n";
      dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
    });
    // Without scheduling information, optimistically report schedulable.
    return true;
  }

  // Tentatively reserve, check for overbooking, then undo the reservation.
  reserveResources(SCDesc, Cycle);
  bool Result = !isOverbooked();
  unreserveResources(SCDesc, Cycle);

  LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
  return Result;
}
3566
3567void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3568 LLVM_DEBUG({
3569 if (SwpDebugResource)
3570 dbgs() << "reserveResources:\n";
3571 });
3572 if (UseDFA)
3573 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3574 ->reserveResources(&SU.getInstr()->getDesc());
3575
3576 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3577 if (!SCDesc->isValid()) {
3578 LLVM_DEBUG({
3579 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3580 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3581 });
3582 return;
3583 }
3584
3585 reserveResources(SCDesc, Cycle);
3586
3587 LLVM_DEBUG({
3588 if (SwpDebugResource) {
3589 dumpMRT();
3590 dbgs() << "reserveResources: done!\n\n";
3591 }
3592 });
3593}
3594
3595void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3596 int Cycle) {
3597 assert(!UseDFA);
3598 for (const MCWriteProcResEntry &PRE : make_range(
3599 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3600 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3601 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3602
3603 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3604 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3605}
3606
3607void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3608 int Cycle) {
3609 assert(!UseDFA);
3610 for (const MCWriteProcResEntry &PRE : make_range(
3611 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3612 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3613 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3614
3615 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3616 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3617}
3618
// Return true if any slot of the modulo reservation table books more of a
// resource than the machine provides, or issues more micro-ops than the
// issue width allows. Non-DFA model only.
bool ResourceManager::isOverbooked() const {
  assert(!UseDFA);
  for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
    for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
      // NOTE(review): the declaration of 'Desc' appears to have been elided
      // from this excerpt — presumably SM.getProcResource(I).
      if (MRT[Slot][I] > Desc->NumUnits)
        return true;
    }
    // A slot is also overbooked when it issues more micro-ops than the
    // machine's issue width.
    if (NumScheduledMops[Slot] > IssueWidth)
      return true;
  }
  return false;
}
3632
// Compute the resource-constrained minimum initiation interval using the
// DFA packetizer model: greedily pack every instruction's cycles into DFAs
// (one per cycle), adding DFAs as needed; the DFA count is the ResMII.
int ResourceManager::calculateResMIIDFA() const {
  assert(UseDFA);

  // Sort the instructions by the number of available choices for scheduling,
  // least to most. Use the number of critical resources as the tie breaker.
  FuncUnitSorter FUS = FuncUnitSorter(*ST);
  for (SUnit &SU : DAG->SUnits)
    FUS.calcCriticalResources(*SU.getInstr());
  // NOTE(review): the declaration of the 'FuncUnitOrder' priority queue
  // appears to have been elided from this excerpt; only its initializer
  // remains.
      FuncUnitOrder(FUS);

  for (SUnit &SU : DAG->SUnits)
    FuncUnitOrder.push(SU.getInstr());

  // NOTE(review): the declaration of 'Resources' (a container of
  // DFAPacketizer instances, one per cycle) appears elided here.
  Resources.push_back(
      std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));

  while (!FuncUnitOrder.empty()) {
    MachineInstr *MI = FuncUnitOrder.top();
    FuncUnitOrder.pop();
    // Zero-cost instructions consume no functional units; skip them.
    if (TII->isZeroCost(MI->getOpcode()))
      continue;

    // Attempt to reserve the instruction in an existing DFA. At least one
    // DFA is needed for each cycle.
    unsigned NumCycles = DAG->getSUnit(MI)->Latency;
    unsigned ReservedCycles = 0;
    auto *RI = Resources.begin();
    auto *RE = Resources.end();
    LLVM_DEBUG({
      dbgs() << "Trying to reserve resource for " << NumCycles
             << " cycles for \n";
      MI->dump();
    });
    for (unsigned C = 0; C < NumCycles; ++C)
      while (RI != RE) {
        if ((*RI)->canReserveResources(*MI)) {
          (*RI)->reserveResources(*MI);
          ++ReservedCycles;
          break;
        }
        RI++;
      }
    LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
                      << ", NumCycles:" << NumCycles << "\n");
    // Add new DFAs, if needed, to reserve resources.
    for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
      // NOTE(review): the opening of this LLVM_DEBUG statement appears to
      // have been elided from this excerpt.
                << "NewResource created to reserve resources"
                << "\n");
      auto *NewResource = TII->CreateTargetScheduleState(*ST);
      assert(NewResource->canReserveResources(*MI) && "Reserve error.");
      NewResource->reserveResources(*MI);
      Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
    }
  }

  // The number of DFAs needed to pack every instruction is the resource MII.
  int Resmii = Resources.size();
  LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
  return Resmii;
}
3695
// Compute the resource-constrained minimum initiation interval. With the
// DFA model, delegate; otherwise take the maximum, over all resource kinds,
// of consumption divided by available units (and micro-ops over issue width).
// NOTE(review): the signature line appears to have been elided from this
// excerpt.
  if (UseDFA)
    return calculateResMIIDFA();

  // Count each resource consumption and divide it by the number of units.
  // ResMII is the max value among them.

  int NumMops = 0;
  // NOTE(review): the declaration of 'ResourceCount' (per-resource-kind
  // consumption counters, sized by SM.getNumProcResourceKinds()) appears
  // to have been elided from this excerpt.
  for (SUnit &SU : DAG->SUnits) {
    // Zero-cost instructions consume nothing.
    if (TII->isZeroCost(SU.getInstr()->getOpcode()))
      continue;

    const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
    if (!SCDesc->isValid())
      continue;

    LLVM_DEBUG({
      if (SwpDebugResource) {
        DAG->dumpNode(SU);
        dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
               << " WriteProcRes: ";
      }
    });
    // Accumulate micro-ops and per-resource occupancy for this instruction.
    NumMops += SCDesc->NumMicroOps;
    for (const MCWriteProcResEntry &PRE :
         make_range(STI->getWriteProcResBegin(SCDesc),
                    STI->getWriteProcResEnd(SCDesc))) {
      LLVM_DEBUG({
        if (SwpDebugResource) {
          const MCProcResourceDesc *Desc =
              SM.getProcResource(PRE.ProcResourceIdx);
          dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
        }
      });
      ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
    }
    LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
  }

  // Lower bound from the issue width: ceil(#mops / issue width).
  int Result = (NumMops + IssueWidth - 1) / IssueWidth;
  LLVM_DEBUG({
    if (SwpDebugResource)
      dbgs() << "#Mops: " << NumMops << ", "
             << "IssueWidth: " << IssueWidth << ", "
             << "Cycles: " << Result << "\n";
  });

  LLVM_DEBUG({
    if (SwpDebugResource) {
      std::stringstream SS;
      SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
         << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
         << "\n";
      dbgs() << SS.str();
    }
  });
  // Take the max over all resource kinds of ceil(consumed / units).
  for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
    // NOTE(review): the declaration of 'Desc' appears to have been elided
    // from this excerpt — presumably SM.getProcResource(I).
    int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
    LLVM_DEBUG({
      if (SwpDebugResource) {
        std::stringstream SS;
        SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
           << Desc->NumUnits << std::setw(10) << ResourceCount[I]
           << std::setw(10) << Cycles << "\n";
        dbgs() << SS.str();
      }
    });
    if (Cycles > Result)
      Result = Cycles;
  }
  return Result;
}
3770
// Reset the resource manager for a new initiation interval: rebuild the
// per-cycle DFA states and clear the reservation table and micro-op counts.
// NOTE(review): the signature line appears to have been elided from this
// excerpt (presumably init(int II)).
  InitiationInterval = II;
  // One DFA packetizer state per cycle of the interval.
  DFAResources.clear();
  DFAResources.resize(II);
  for (auto &I : DFAResources)
    I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
  MRT.clear();
  // NOTE(review): the line resizing MRT to II slots appears to have been
  // elided from this excerpt.
  NumScheduledMops.clear();
  NumScheduledMops.resize(II);
}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static const LLT S1
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< bool > MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false), cl::desc("Use the MVE code generator for software pipelining"))
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit the minimum initiation interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes from the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated schedule."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Target-Independent Code Generator Pass Configuration Options pass.
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:345
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:381
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:610
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
Analysis pass which computes a MachineDominatorTree.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:566
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:930
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:756
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
const MachineDominatorTree * MDT
const MachineLoopInfo * MLI
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
Expand the kernel using modulo variable expansion algorithm (MVE).
static bool canApply(MachineLoop &L)
Check if ModuloScheduleExpanderMVE can be applied to L.
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
void runOnMachineFunction(const MachineFunction &MF)
runOnFunction - Prepare to answer questions about MF.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:498
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:504
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is scheduled or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool onlyHasLoopCarriedOutputOrOrderPreds(SUnit *SU, SwingSchedulerDAG *DAG) const
Return true if all scheduled predecessors are loop-carried output/order dependencies.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:272
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:378
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:382
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:449
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:303
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:358
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:292
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
TargetSchedModel SchedModel
TargetSchedModel provides an interface to the machine model.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:361
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which is the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction may be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool isMVEExpanderSupported()
Return true if the target can expand pipelined schedule with modulo variable expansion.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should be used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
Target-Independent Code Generator Pass Configuration Options.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
The main class in the implementation of the target independent window scheduler.
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition: COFF.h:811
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double e
Definition: MathExtras.h:47
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:480
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
@ WS_Force
Use window algorithm instead of SMS algorithm.
@ WS_On
Use window algorithm after SMS algorithm fails.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
cl::opt< WindowSchedulingFlag > WindowSchedulingOption("window-sched", cl::Hidden, cl::init(WindowSchedulingFlag::WS_On), cl::desc("Set how to use window scheduling algorithm."), cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off", "Turn off window algorithm."), clEnumValN(WindowSchedulingFlag::WS_On, "on", "Use window algorithm after SMS algorithm fails."), clEnumValN(WindowSchedulingFlag::WS_Force, "force", "Use window algorithm instead of SMS algorithm.")))
A command line argument to set the window scheduling option.
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
bool isValid() const
Definition: MCSchedule.h:136
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:360
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:349
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:334
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:353
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
const TargetPassConfig * PassConfig
const MachineDominatorTree * MDT
RegisterClassInfo * RegClassInfo
const MachineLoopInfo * MLI
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.