LLVM 19.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
73#include "llvm/Config/llvm-config.h"
74#include "llvm/IR/Attributes.h"
75#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
80#include "llvm/Pass.h"
83#include "llvm/Support/Debug.h"
86#include <algorithm>
87#include <cassert>
88#include <climits>
89#include <cstdint>
90#include <deque>
91#include <functional>
92#include <iomanip>
93#include <iterator>
94#include <map>
95#include <memory>
96#include <sstream>
97#include <tuple>
98#include <utility>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "pipeliner"
104
105STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
106STATISTIC(NumPipelined, "Number of loops software pipelined");
107STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
108STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
109STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
110STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
111STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
112STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
113STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
114STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
115STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
116
117/// A command line option to turn software pipelining on or off.
118static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
119 cl::desc("Enable Software Pipelining"));
120
121/// A command line option to enable SWP at -Os.
122static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
123 cl::desc("Enable SWP at Os."), cl::Hidden,
124 cl::init(false));
125
126/// A command line argument to limit minimum initial interval for pipelining.
127static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
128 cl::desc("Size limit for the MII."),
129 cl::Hidden, cl::init(27));
130
131/// A command line argument to force pipeliner to use specified initial
132/// interval.
133static cl::opt<int> SwpForceII("pipeliner-force-ii",
134 cl::desc("Force pipeliner to use specified II."),
135 cl::Hidden, cl::init(-1));
136
137/// A command line argument to limit the number of stages in the pipeline.
138static cl::opt<int>
139 SwpMaxStages("pipeliner-max-stages",
140 cl::desc("Maximum stages allowed in the generated scheduled."),
141 cl::Hidden, cl::init(3));
142
143/// A command line option to disable the pruning of chain dependences due to
144/// an unrelated Phi.
145static cl::opt<bool>
146 SwpPruneDeps("pipeliner-prune-deps",
147 cl::desc("Prune dependences between unrelated Phi nodes."),
148 cl::Hidden, cl::init(true));
149
150/// A command line option to disable the pruning of loop carried order
151/// dependences.
152static cl::opt<bool>
153 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
154 cl::desc("Prune loop carried order dependences."),
155 cl::Hidden, cl::init(true));
156
157#ifndef NDEBUG
158static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
159#endif
160
161static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
163 cl::desc("Ignore RecMII"));
164
165static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
166 cl::init(false));
167static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
168 cl::init(false));
169
171 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
172 cl::desc("Instead of emitting the pipelined code, annotate instructions "
173 "with the generated schedule for feeding into the "
174 "-modulo-schedule-test pass"));
175
177 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
178 cl::desc(
179 "Use the experimental peeling code generator for software pipelining"));
180
181static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
182 cl::desc("Range to search for II"),
183 cl::Hidden, cl::init(10));
184
185static cl::opt<bool>
186 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
187 cl::desc("Limit register pressure of scheduled loop"));
188
189static cl::opt<int>
190 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
191 cl::init(5),
192 cl::desc("Margin representing the unused percentage of "
193 "the register pressure limit"));
194
195namespace llvm {
196
197// A command line option to enable the CopyToPhi DAG mutation.
199 cl::init(true),
200 cl::desc("Enable CopyToPhi DAG Mutation"));
201
202/// A command line argument to force pipeliner to use specified issue
203/// width.
205 "pipeliner-force-issue-width",
206 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
207 cl::init(-1));
208
209} // end namespace llvm
210
211unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
212char MachinePipeliner::ID = 0;
213#ifndef NDEBUG
215#endif
217
219 "Modulo Software Pipelining", false, false)
225 "Modulo Software Pipelining", false, false)
226
227/// The "main" function for implementing Swing Modulo Scheduling.
228bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
229 if (skipFunction(mf.getFunction()))
230 return false;
231
232 if (!EnableSWP)
233 return false;
234
235 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
236 !EnableSWPOptSize.getPosition())
237 return false;
238
239 if (!mf.getSubtarget().enableMachinePipeliner())
240 return false;
241
242 // Cannot pipeline loops without instruction itineraries if we are using
243 // DFA for the pipeliner.
244 if (mf.getSubtarget().useDFAforSMS() &&
245 (!mf.getSubtarget().getInstrItineraryData() ||
246 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
247 return false;
248
249 MF = &mf;
250 MLI = &getAnalysis<MachineLoopInfo>();
251 MDT = &getAnalysis<MachineDominatorTree>();
252 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
253 TII = MF->getSubtarget().getInstrInfo();
254 RegClassInfo.runOnMachineFunction(*MF);
255
256 for (const auto &L : *MLI)
257 scheduleLoop(*L);
258
259 return false;
260}
261
262/// Attempt to perform the SMS algorithm on the specified loop. This function is
263/// the main entry point for the algorithm. The function identifies candidate
264/// loops, calculates the minimum initiation interval, and attempts to schedule
265/// the loop.
266bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
267 bool Changed = false;
268 for (const auto &InnerLoop : L)
269 Changed |= scheduleLoop(*InnerLoop);
270
271#ifndef NDEBUG
272 // Stop trying after reaching the limit (if any).
273 int Limit = SwpLoopLimit;
274 if (Limit >= 0) {
275 if (NumTries >= SwpLoopLimit)
276 return Changed;
277 NumTries++;
278 }
279#endif
280
281 setPragmaPipelineOptions(L);
282 if (!canPipelineLoop(L)) {
283 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
284 ORE->emit([&]() {
285 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
286 L.getStartLoc(), L.getHeader())
287 << "Failed to pipeline loop";
288 });
289
290 LI.LoopPipelinerInfo.reset();
291 return Changed;
292 }
293
294 ++NumTrytoPipeline;
295
296 Changed = swingModuloScheduler(L);
297
298 LI.LoopPipelinerInfo.reset();
299 return Changed;
300}
301
302void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
303 // Reset the pragma for the next loop in iteration.
304 disabledByPragma = false;
305 II_setByPragma = 0;
306
307 MachineBasicBlock *LBLK = L.getTopBlock();
308
309 if (LBLK == nullptr)
310 return;
311
312 const BasicBlock *BBLK = LBLK->getBasicBlock();
313 if (BBLK == nullptr)
314 return;
315
316 const Instruction *TI = BBLK->getTerminator();
317 if (TI == nullptr)
318 return;
319
320 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
321 if (LoopID == nullptr)
322 return;
323
324 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
325 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
326
327 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
328 MDNode *MD = dyn_cast<MDNode>(MDO);
329
330 if (MD == nullptr)
331 continue;
332
333 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
334
335 if (S == nullptr)
336 continue;
337
338 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
339 assert(MD->getNumOperands() == 2 &&
340 "Pipeline initiation interval hint metadata should have two operands.");
342 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
343 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
344 } else if (S->getString() == "llvm.loop.pipeline.disable") {
345 disabledByPragma = true;
346 }
347 }
348}
349
350/// Return true if the loop can be software pipelined. The algorithm is
351/// restricted to loops with a single basic block. Make sure that the
352/// branch in the loop can be analyzed.
353bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
354 if (L.getNumBlocks() != 1) {
355 ORE->emit([&]() {
356 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
357 L.getStartLoc(), L.getHeader())
358 << "Not a single basic block: "
359 << ore::NV("NumBlocks", L.getNumBlocks());
360 });
361 return false;
362 }
363
364 if (disabledByPragma) {
365 ORE->emit([&]() {
366 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
367 L.getStartLoc(), L.getHeader())
368 << "Disabled by Pragma.";
369 });
370 return false;
371 }
372
373 // Check if the branch can't be understood because we can't do pipelining
374 // if that's the case.
375 LI.TBB = nullptr;
376 LI.FBB = nullptr;
377 LI.BrCond.clear();
378 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
379 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
380 NumFailBranch++;
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "The branch can't be understood";
385 });
386 return false;
387 }
388
389 LI.LoopInductionVar = nullptr;
390 LI.LoopCompare = nullptr;
392 if (!LI.LoopPipelinerInfo) {
393 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
394 NumFailLoop++;
395 ORE->emit([&]() {
396 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
397 L.getStartLoc(), L.getHeader())
398 << "The loop structure is not supported";
399 });
400 return false;
401 }
402
403 if (!L.getLoopPreheader()) {
404 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
405 NumFailPreheader++;
406 ORE->emit([&]() {
407 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
408 L.getStartLoc(), L.getHeader())
409 << "No loop preheader found";
410 });
411 return false;
412 }
413
414 // Remove any subregisters from inputs to phi nodes.
415 preprocessPhiNodes(*L.getHeader());
416 return true;
417}
418
419void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
421 SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
422
423 for (MachineInstr &PI : B.phis()) {
424 MachineOperand &DefOp = PI.getOperand(0);
425 assert(DefOp.getSubReg() == 0);
426 auto *RC = MRI.getRegClass(DefOp.getReg());
427
428 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
429 MachineOperand &RegOp = PI.getOperand(i);
430 if (RegOp.getSubReg() == 0)
431 continue;
432
433 // If the operand uses a subregister, replace it with a new register
434 // without subregisters, and generate a copy to the new register.
435 Register NewReg = MRI.createVirtualRegister(RC);
436 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
438 const DebugLoc &DL = PredB.findDebugLoc(At);
439 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
440 .addReg(RegOp.getReg(), getRegState(RegOp),
441 RegOp.getSubReg());
442 Slots.insertMachineInstrInMaps(*Copy);
443 RegOp.setReg(NewReg);
444 RegOp.setSubReg(0);
445 }
446 }
447}
448
449/// The SMS algorithm consists of the following main steps:
450/// 1. Computation and analysis of the dependence graph.
451/// 2. Ordering of the nodes (instructions).
452/// 3. Attempt to Schedule the loop.
453bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
454 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
455
456 SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
458
459 MachineBasicBlock *MBB = L.getHeader();
460 // The kernel should not include any terminator instructions. These
461 // will be added back later.
462 SMS.startBlock(MBB);
463
464 // Compute the number of 'real' instructions in the basic block by
465 // ignoring terminators.
466 unsigned size = MBB->size();
468 E = MBB->instr_end();
469 I != E; ++I, --size)
470 ;
471
472 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
473 SMS.schedule();
474 SMS.exitRegion();
475
476 SMS.finishBlock();
477 return SMS.hasNewSchedule();
478}
479
488}
489
490void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
491 if (SwpForceII > 0)
492 MII = SwpForceII;
493 else if (II_setByPragma > 0)
494 MII = II_setByPragma;
495 else
496 MII = std::max(ResMII, RecMII);
497}
498
499void SwingSchedulerDAG::setMAX_II() {
500 if (SwpForceII > 0)
501 MAX_II = SwpForceII;
502 else if (II_setByPragma > 0)
503 MAX_II = II_setByPragma;
504 else
505 MAX_II = MII + SwpIISearchRange;
506}
507
508/// We override the schedule function in ScheduleDAGInstrs to implement the
509/// scheduling part of the Swing Modulo Scheduling algorithm.
511 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
512 buildSchedGraph(AA);
513 addLoopCarriedDependences(AA);
514 updatePhiDependences();
516 changeDependences();
517 postProcessDAG();
518 LLVM_DEBUG(dump());
519
520 NodeSetType NodeSets;
521 findCircuits(NodeSets);
522 NodeSetType Circuits = NodeSets;
523
524 // Calculate the MII.
525 unsigned ResMII = calculateResMII();
526 unsigned RecMII = calculateRecMII(NodeSets);
527
528 fuseRecs(NodeSets);
529
530 // This flag is used for testing and can cause correctness problems.
531 if (SwpIgnoreRecMII)
532 RecMII = 0;
533
534 setMII(ResMII, RecMII);
535 setMAX_II();
536
537 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
538 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
539
540 // Can't schedule a loop without a valid MII.
541 if (MII == 0) {
542 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
543 NumFailZeroMII++;
544 Pass.ORE->emit([&]() {
546 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
547 << "Invalid Minimal Initiation Interval: 0";
548 });
549 return;
550 }
551
552 // Don't pipeline large loops.
553 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
554 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
555 << ", we don't pipeline large loops\n");
556 NumFailLargeMaxMII++;
557 Pass.ORE->emit([&]() {
559 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
560 << "Minimal Initiation Interval too large: "
561 << ore::NV("MII", (int)MII) << " > "
562 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
563 << "Refer to -pipeliner-max-mii.";
564 });
565 return;
566 }
567
568 computeNodeFunctions(NodeSets);
569
570 registerPressureFilter(NodeSets);
571
572 colocateNodeSets(NodeSets);
573
574 checkNodeSets(NodeSets);
575
576 LLVM_DEBUG({
577 for (auto &I : NodeSets) {
578 dbgs() << " Rec NodeSet ";
579 I.dump();
580 }
581 });
582
583 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
584
585 groupRemainingNodes(NodeSets);
586
587 removeDuplicateNodes(NodeSets);
588
589 LLVM_DEBUG({
590 for (auto &I : NodeSets) {
591 dbgs() << " NodeSet ";
592 I.dump();
593 }
594 });
595
596 computeNodeOrder(NodeSets);
597
598 // check for node order issues
599 checkValidNodeOrder(Circuits);
600
601 SMSchedule Schedule(Pass.MF, this);
602 Scheduled = schedulePipeline(Schedule);
603
604 if (!Scheduled){
605 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
606 NumFailNoSchedule++;
607 Pass.ORE->emit([&]() {
609 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
610 << "Unable to find schedule";
611 });
612 return;
613 }
614
615 unsigned numStages = Schedule.getMaxStageCount();
616 // No need to generate pipeline if there are no overlapped iterations.
617 if (numStages == 0) {
618 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
619 NumFailZeroStage++;
620 Pass.ORE->emit([&]() {
622 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
623 << "No need to pipeline - no overlapped iterations in schedule.";
624 });
625 return;
626 }
627 // Check that the maximum stage count is less than user-defined limit.
628 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
629 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
630 << " : too many stages, abort\n");
631 NumFailLargeMaxStage++;
632 Pass.ORE->emit([&]() {
634 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
635 << "Too many stages in schedule: "
636 << ore::NV("numStages", (int)numStages) << " > "
637 << ore::NV("SwpMaxStages", SwpMaxStages)
638 << ". Refer to -pipeliner-max-stages.";
639 });
640 return;
641 }
642
643 Pass.ORE->emit([&]() {
645 Loop.getHeader())
646 << "Pipelined succesfully!";
647 });
648
649 // Generate the schedule as a ModuloSchedule.
650 DenseMap<MachineInstr *, int> Cycles, Stages;
651 std::vector<MachineInstr *> OrderedInsts;
652 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
653 ++Cycle) {
654 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
655 OrderedInsts.push_back(SU->getInstr());
656 Cycles[SU->getInstr()] = Cycle;
657 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
658 }
659 }
661 for (auto &KV : NewMIs) {
662 Cycles[KV.first] = Cycles[KV.second];
663 Stages[KV.first] = Stages[KV.second];
664 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
665 }
666
667 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
668 std::move(Stages));
670 assert(NewInstrChanges.empty() &&
671 "Cannot serialize a schedule with InstrChanges!");
673 MSTI.annotate();
674 return;
675 }
676 // The experimental code generator can't work if there are InstChanges.
677 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
678 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
679 MSE.expand();
680 } else {
681 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
682 MSE.expand();
683 MSE.cleanup();
684 }
685 ++NumPipelined;
686}
687
688/// Clean up after the software pipeliner runs.
690 for (auto &KV : NewMIs)
691 MF.deleteMachineInstr(KV.second);
692 NewMIs.clear();
693
694 // Call the superclass.
696}
697
698/// Return the register values for the operands of a Phi instruction.
699/// This function assume the instruction is a Phi.
701 unsigned &InitVal, unsigned &LoopVal) {
702 assert(Phi.isPHI() && "Expecting a Phi.");
703
704 InitVal = 0;
705 LoopVal = 0;
706 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
707 if (Phi.getOperand(i + 1).getMBB() != Loop)
708 InitVal = Phi.getOperand(i).getReg();
709 else
710 LoopVal = Phi.getOperand(i).getReg();
711
712 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
713}
714
715/// Return the Phi register value that comes the loop block.
716static unsigned getLoopPhiReg(const MachineInstr &Phi,
717 const MachineBasicBlock *LoopBB) {
718 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
719 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
720 return Phi.getOperand(i).getReg();
721 return 0;
722}
723
724/// Return true if SUb can be reached from SUa following the chain edges.
725static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
728 Worklist.push_back(SUa);
729 while (!Worklist.empty()) {
730 const SUnit *SU = Worklist.pop_back_val();
731 for (const auto &SI : SU->Succs) {
732 SUnit *SuccSU = SI.getSUnit();
733 if (SI.getKind() == SDep::Order) {
734 if (Visited.count(SuccSU))
735 continue;
736 if (SuccSU == SUb)
737 return true;
738 Worklist.push_back(SuccSU);
739 Visited.insert(SuccSU);
740 }
741 }
742 }
743 return false;
744}
745
746/// Return true if the instruction causes a chain between memory
747/// references before and after it.
749 return MI.isCall() || MI.mayRaiseFPException() ||
750 MI.hasUnmodeledSideEffects() ||
751 (MI.hasOrderedMemoryRef() &&
752 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
753}
754
755/// Return the underlying objects for the memory references of an instruction.
756/// This function calls the code in ValueTracking, but first checks that the
757/// instruction has a memory operand.
760 if (!MI->hasOneMemOperand())
761 return;
762 MachineMemOperand *MM = *MI->memoperands_begin();
763 if (!MM->getValue())
764 return;
765 getUnderlyingObjects(MM->getValue(), Objs);
766 for (const Value *V : Objs) {
767 if (!isIdentifiedObject(V)) {
768 Objs.clear();
769 return;
770 }
771 Objs.push_back(V);
772 }
773}
774
775/// Add a chain edge between a load and store if the store can be an
776/// alias of the load on a subsequent iteration, i.e., a loop carried
777/// dependence. This code is very similar to the code in ScheduleDAGInstrs
778/// but that code doesn't create loop carried dependences.
779void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
783 for (auto &SU : SUnits) {
784 MachineInstr &MI = *SU.getInstr();
786 PendingLoads.clear();
787 else if (MI.mayLoad()) {
790 if (Objs.empty())
792 for (const auto *V : Objs) {
793 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
794 SUs.push_back(&SU);
795 }
796 } else if (MI.mayStore()) {
799 if (Objs.empty())
801 for (const auto *V : Objs) {
803 PendingLoads.find(V);
804 if (I == PendingLoads.end())
805 continue;
806 for (auto *Load : I->second) {
807 if (isSuccOrder(Load, &SU))
808 continue;
809 MachineInstr &LdMI = *Load->getInstr();
810 // First, perform the cheaper check that compares the base register.
811 // If they are the same and the load offset is less than the store
812 // offset, then mark the dependence as loop carried potentially.
813 const MachineOperand *BaseOp1, *BaseOp2;
814 int64_t Offset1, Offset2;
815 bool Offset1IsScalable, Offset2IsScalable;
816 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
817 Offset1IsScalable, TRI) &&
818 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
819 Offset2IsScalable, TRI)) {
820 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
821 Offset1IsScalable == Offset2IsScalable &&
822 (int)Offset1 < (int)Offset2) {
824 "What happened to the chain edge?");
825 SDep Dep(Load, SDep::Barrier);
826 Dep.setLatency(1);
827 SU.addPred(Dep);
828 continue;
829 }
830 }
831 // Second, the more expensive check that uses alias analysis on the
832 // base registers. If they alias, and the load offset is less than
833 // the store offset, the mark the dependence as loop carried.
834 if (!AA) {
835 SDep Dep(Load, SDep::Barrier);
836 Dep.setLatency(1);
837 SU.addPred(Dep);
838 continue;
839 }
840 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
841 MachineMemOperand *MMO2 = *MI.memoperands_begin();
842 if (!MMO1->getValue() || !MMO2->getValue()) {
843 SDep Dep(Load, SDep::Barrier);
844 Dep.setLatency(1);
845 SU.addPred(Dep);
846 continue;
847 }
848 if (MMO1->getValue() == MMO2->getValue() &&
849 MMO1->getOffset() <= MMO2->getOffset()) {
850 SDep Dep(Load, SDep::Barrier);
851 Dep.setLatency(1);
852 SU.addPred(Dep);
853 continue;
854 }
855 if (!AA->isNoAlias(
858 MMO2->getAAInfo()))) {
859 SDep Dep(Load, SDep::Barrier);
860 Dep.setLatency(1);
861 SU.addPred(Dep);
862 }
863 }
864 }
865 }
866 }
867}
868
869/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
870/// processes dependences for PHIs. This function adds true dependences
871/// from a PHI to a use, and a loop carried dependence from the use to the
872/// PHI. The loop carried dependence is represented as an anti dependence
873/// edge. This function also removes chain dependences between unrelated
874/// PHIs.
875void SwingSchedulerDAG::updatePhiDependences() {
876 SmallVector<SDep, 4> RemoveDeps;
878
879 // Iterate over each DAG node.
880 for (SUnit &I : SUnits) {
881 RemoveDeps.clear();
882 // Set to true if the instruction has an operand defined by a Phi.
883 unsigned HasPhiUse = 0;
884 unsigned HasPhiDef = 0;
885 MachineInstr *MI = I.getInstr();
886 // Iterate over each operand, and we process the definitions.
887 for (const MachineOperand &MO : MI->operands()) {
888 if (!MO.isReg())
889 continue;
890 Register Reg = MO.getReg();
891 if (MO.isDef()) {
892 // If the register is used by a Phi, then create an anti dependence.
894 UI = MRI.use_instr_begin(Reg),
895 UE = MRI.use_instr_end();
896 UI != UE; ++UI) {
897 MachineInstr *UseMI = &*UI;
898 SUnit *SU = getSUnit(UseMI);
899 if (SU != nullptr && UseMI->isPHI()) {
900 if (!MI->isPHI()) {
901 SDep Dep(SU, SDep::Anti, Reg);
902 Dep.setLatency(1);
903 I.addPred(Dep);
904 } else {
905 HasPhiDef = Reg;
906 // Add a chain edge to a dependent Phi that isn't an existing
907 // predecessor.
908 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
909 I.addPred(SDep(SU, SDep::Barrier));
910 }
911 }
912 }
913 } else if (MO.isUse()) {
914 // If the register is defined by a Phi, then create a true dependence.
916 if (DefMI == nullptr)
917 continue;
918 SUnit *SU = getSUnit(DefMI);
919 if (SU != nullptr && DefMI->isPHI()) {
920 if (!MI->isPHI()) {
921 SDep Dep(SU, SDep::Data, Reg);
922 Dep.setLatency(0);
923 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
924 I.addPred(Dep);
925 } else {
926 HasPhiUse = Reg;
927 // Add a chain edge to a dependent Phi that isn't an existing
928 // predecessor.
929 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
930 I.addPred(SDep(SU, SDep::Barrier));
931 }
932 }
933 }
934 }
935 // Remove order dependences from an unrelated Phi.
936 if (!SwpPruneDeps)
937 continue;
938 for (auto &PI : I.Preds) {
939 MachineInstr *PMI = PI.getSUnit()->getInstr();
940 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
941 if (I.getInstr()->isPHI()) {
942 if (PMI->getOperand(0).getReg() == HasPhiUse)
943 continue;
944 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
945 continue;
946 }
947 RemoveDeps.push_back(PI);
948 }
949 }
950 for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
951 I.removePred(RemoveDeps[i]);
952 }
953}
954
955/// Iterate over each DAG node and see if we can change any dependences
956/// in order to reduce the recurrence MII.
957void SwingSchedulerDAG::changeDependences() {
958 // See if an instruction can use a value from the previous iteration.
959 // If so, we update the base and offset of the instruction and change
960 // the dependences.
961 for (SUnit &I : SUnits) {
962 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
963 int64_t NewOffset = 0;
964 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
965 NewOffset))
966 continue;
967
968 // Get the MI and SUnit for the instruction that defines the original base.
969 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
971 if (!DefMI)
972 continue;
973 SUnit *DefSU = getSUnit(DefMI);
974 if (!DefSU)
975 continue;
976 // Get the MI and SUnit for the instruction that defins the new base.
977 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
978 if (!LastMI)
979 continue;
980 SUnit *LastSU = getSUnit(LastMI);
981 if (!LastSU)
982 continue;
983
984 if (Topo.IsReachable(&I, LastSU))
985 continue;
986
987 // Remove the dependence. The value now depends on a prior iteration.
989 for (const SDep &P : I.Preds)
990 if (P.getSUnit() == DefSU)
991 Deps.push_back(P);
992 for (int i = 0, e = Deps.size(); i != e; i++) {
993 Topo.RemovePred(&I, Deps[i].getSUnit());
994 I.removePred(Deps[i]);
995 }
996 // Remove the chain dependence between the instructions.
997 Deps.clear();
998 for (auto &P : LastSU->Preds)
999 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1000 Deps.push_back(P);
1001 for (int i = 0, e = Deps.size(); i != e; i++) {
1002 Topo.RemovePred(LastSU, Deps[i].getSUnit());
1003 LastSU->removePred(Deps[i]);
1004 }
1005
1006 // Add a dependence between the new instruction and the instruction
1007 // that defines the new base.
1008 SDep Dep(&I, SDep::Anti, NewBase);
1009 Topo.AddPred(LastSU, &I);
1010 LastSU->addPred(Dep);
1011
1012 // Remember the base and offset information so that we can update the
1013 // instruction during code generation.
1014 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1015 }
1016}
1017
1018/// Create an instruction stream that represents a single iteration and stage of
1019/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1020/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1021/// function is an approximation of SMSchedule::finalizeSchedule with all
1022/// non-const operations removed.
1024 SMSchedule &Schedule,
1025 std::vector<MachineInstr *> &OrderedInsts,
1028
1029 // Move all instructions to the first stage from the later stages.
1030 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1031 ++Cycle) {
1032 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1033 Stage <= LastStage; ++Stage) {
1034 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1035 Cycle + Stage * Schedule.getInitiationInterval()))) {
1036 Instrs[Cycle].push_front(SU);
1037 }
1038 }
1039 }
1040
1041 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1042 ++Cycle) {
1043 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1044 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1045 for (SUnit *SU : CycleInstrs) {
1046 MachineInstr *MI = SU->getInstr();
1047 OrderedInsts.push_back(MI);
1048 Stages[MI] = Schedule.stageScheduled(SU);
1049 }
1050 }
1051}
1052
1053namespace {
1054
1055// FuncUnitSorter - Comparison operator used to sort instructions by
1056// the number of functional unit choices.
1057struct FuncUnitSorter {
1058 const InstrItineraryData *InstrItins;
1059 const MCSubtargetInfo *STI;
1061
1062 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1063 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1064
1065 // Compute the number of functional unit alternatives needed
1066 // at each stage, and take the minimum value. We prioritize the
1067 // instructions by the least number of choices first.
1068 unsigned minFuncUnits(const MachineInstr *Inst,
1069 InstrStage::FuncUnits &F) const {
1070 unsigned SchedClass = Inst->getDesc().getSchedClass();
1071 unsigned min = UINT_MAX;
1072 if (InstrItins && !InstrItins->isEmpty()) {
1073 for (const InstrStage &IS :
1074 make_range(InstrItins->beginStage(SchedClass),
1075 InstrItins->endStage(SchedClass))) {
1076 InstrStage::FuncUnits funcUnits = IS.getUnits();
1077 unsigned numAlternatives = llvm::popcount(funcUnits);
1078 if (numAlternatives < min) {
1079 min = numAlternatives;
1080 F = funcUnits;
1081 }
1082 }
1083 return min;
1084 }
1085 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1086 const MCSchedClassDesc *SCDesc =
1087 STI->getSchedModel().getSchedClassDesc(SchedClass);
1088 if (!SCDesc->isValid())
1089 // No valid Schedule Class Desc for schedClass, should be
1090 // Pseudo/PostRAPseudo
1091 return min;
1092
1093 for (const MCWriteProcResEntry &PRE :
1094 make_range(STI->getWriteProcResBegin(SCDesc),
1095 STI->getWriteProcResEnd(SCDesc))) {
1096 if (!PRE.ReleaseAtCycle)
1097 continue;
1098 const MCProcResourceDesc *ProcResource =
1099 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1100 unsigned NumUnits = ProcResource->NumUnits;
1101 if (NumUnits < min) {
1102 min = NumUnits;
1103 F = PRE.ProcResourceIdx;
1104 }
1105 }
1106 return min;
1107 }
1108 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1109 }
1110
1111 // Compute the critical resources needed by the instruction. This
1112 // function records the functional units needed by instructions that
1113 // must use only one functional unit. We use this as a tie breaker
1114 // for computing the resource MII. The instrutions that require
1115 // the same, highly used, functional unit have high priority.
1116 void calcCriticalResources(MachineInstr &MI) {
1117 unsigned SchedClass = MI.getDesc().getSchedClass();
1118 if (InstrItins && !InstrItins->isEmpty()) {
1119 for (const InstrStage &IS :
1120 make_range(InstrItins->beginStage(SchedClass),
1121 InstrItins->endStage(SchedClass))) {
1122 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1123 if (llvm::popcount(FuncUnits) == 1)
1124 Resources[FuncUnits]++;
1125 }
1126 return;
1127 }
1128 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1129 const MCSchedClassDesc *SCDesc =
1130 STI->getSchedModel().getSchedClassDesc(SchedClass);
1131 if (!SCDesc->isValid())
1132 // No valid Schedule Class Desc for schedClass, should be
1133 // Pseudo/PostRAPseudo
1134 return;
1135
1136 for (const MCWriteProcResEntry &PRE :
1137 make_range(STI->getWriteProcResBegin(SCDesc),
1138 STI->getWriteProcResEnd(SCDesc))) {
1139 if (!PRE.ReleaseAtCycle)
1140 continue;
1141 Resources[PRE.ProcResourceIdx]++;
1142 }
1143 return;
1144 }
1145 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1146 }
1147
1148 /// Return true if IS1 has less priority than IS2.
1149 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1150 InstrStage::FuncUnits F1 = 0, F2 = 0;
1151 unsigned MFUs1 = minFuncUnits(IS1, F1);
1152 unsigned MFUs2 = minFuncUnits(IS2, F2);
1153 if (MFUs1 == MFUs2)
1154 return Resources.lookup(F1) < Resources.lookup(F2);
1155 return MFUs1 > MFUs2;
1156 }
1157};
1158
1159/// Calculate the maximum register pressure of the scheduled instructions stream
1160class HighRegisterPressureDetector {
1161 MachineBasicBlock *OrigMBB;
1162 const MachineFunction &MF;
1163 const MachineRegisterInfo &MRI;
1164 const TargetRegisterInfo *TRI;
1165
1166 const unsigned PSetNum;
1167
1168 // Indexed by PSet ID
1169 // InitSetPressure takes into account the register pressure of live-in
1170 // registers. It's not depend on how the loop is scheduled, so it's enough to
1171 // calculate them once at the beginning.
1172 std::vector<unsigned> InitSetPressure;
1173
1174 // Indexed by PSet ID
1175 // Upper limit for each register pressure set
1176 std::vector<unsigned> PressureSetLimit;
1177
1179
1181
1182public:
1183 using OrderedInstsTy = std::vector<MachineInstr *>;
1184 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1185
1186private:
1187 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1188 if (Pressures.size() == 0) {
1189 dbgs() << "[]";
1190 } else {
1191 char Prefix = '[';
1192 for (unsigned P : Pressures) {
1193 dbgs() << Prefix << P;
1194 Prefix = ' ';
1195 }
1196 dbgs() << ']';
1197 }
1198 }
1199
1200 void dumpPSet(Register Reg) const {
1201 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1202 for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1203 ++PSetIter) {
1204 dbgs() << *PSetIter << ' ';
1205 }
1206 dbgs() << '\n';
1207 }
1208
1209 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1210 Register Reg) const {
1211 auto PSetIter = MRI.getPressureSets(Reg);
1212 unsigned Weight = PSetIter.getWeight();
1213 for (; PSetIter.isValid(); ++PSetIter)
1214 Pressure[*PSetIter] += Weight;
1215 }
1216
1217 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1218 Register Reg) const {
1219 auto PSetIter = MRI.getPressureSets(Reg);
1220 unsigned Weight = PSetIter.getWeight();
1221 for (; PSetIter.isValid(); ++PSetIter) {
1222 auto &P = Pressure[*PSetIter];
1223 assert(P >= Weight &&
1224 "register pressure must be greater than or equal weight");
1225 P -= Weight;
1226 }
1227 }
1228
1229 // Return true if Reg is fixed one, for example, stack pointer
1230 bool isFixedRegister(Register Reg) const {
1231 return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
1232 }
1233
1234 bool isDefinedInThisLoop(Register Reg) const {
1235 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1236 }
1237
1238 // Search for live-in variables. They are factored into the register pressure
1239 // from the begining. Live-in variables used by every iteration should be
1240 // considered as alive throughout the loop. For example, the variable `c` in
1241 // following code. \code
1242 // int c = ...;
1243 // for (int i = 0; i < n; i++)
1244 // a[i] += b[i] + c;
1245 // \endcode
1246 void computeLiveIn() {
1248 for (auto &MI : *OrigMBB) {
1249 if (MI.isDebugInstr())
1250 continue;
1251 for (auto Use : ROMap[&MI].Uses) {
1252 auto Reg = Use.RegUnit;
1253 // Ignore the variable that appears only on one side of phi instruction
1254 // because it's used only at the first iteration.
1255 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1256 continue;
1257 if (isFixedRegister(Reg))
1258 continue;
1259 if (isDefinedInThisLoop(Reg))
1260 continue;
1261 Used.insert(Reg);
1262 }
1263 }
1264
1265 for (auto LiveIn : Used)
1266 increaseRegisterPressure(InitSetPressure, LiveIn);
1267 }
1268
1269 // Calculate the upper limit of each pressure set
1270 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1271 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1272 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1273
1274 // We assume fixed registers, such as stack pointer, are already in use.
1275 // Therefore subtracting the weight of the fixed registers from the limit of
1276 // each pressure set in advance.
1278 for (const TargetRegisterClass *TRC : TRI->regclasses()) {
1279 for (const MCPhysReg Reg : *TRC)
1280 if (isFixedRegister(Reg))
1281 FixedRegs.insert(Reg);
1282 }
1283
1284 LLVM_DEBUG({
1285 for (auto Reg : FixedRegs) {
1286 dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
1287 const int *Sets = TRI->getRegUnitPressureSets(Reg);
1288 for (; *Sets != -1; Sets++) {
1289 dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
1290 }
1291 dbgs() << "]\n";
1292 }
1293 });
1294
1295 for (auto Reg : FixedRegs) {
1296 LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
1297 << "\n");
1298 auto PSetIter = MRI.getPressureSets(Reg);
1299 unsigned Weight = PSetIter.getWeight();
1300 for (; PSetIter.isValid(); ++PSetIter) {
1301 unsigned &Limit = PressureSetLimit[*PSetIter];
1302 assert(Limit >= Weight &&
1303 "register pressure limit must be greater than or equal weight");
1304 Limit -= Weight;
1305 LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
1306 << " (decreased by " << Weight << ")\n");
1307 }
1308 }
1309 }
1310
1311 // There are two patterns of last-use.
1312 // - by an instruction of the current iteration
1313 // - by a phi instruction of the next iteration (loop carried value)
1314 //
1315 // Furthermore, following two groups of instructions are executed
1316 // simultaneously
1317 // - next iteration's phi instructions in i-th stage
1318 // - current iteration's instructions in i+1-th stage
1319 //
1320 // This function calculates the last-use of each register while taking into
1321 // account the above two patterns.
1322 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1323 Instr2StageTy &Stages) const {
1324 // We treat virtual registers that are defined and used in this loop.
1325 // Following virtual register will be ignored
1326 // - live-in one
1327 // - defined but not used in the loop (potentially live-out)
1328 DenseSet<Register> TargetRegs;
1329 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1330 if (isDefinedInThisLoop(Reg))
1331 TargetRegs.insert(Reg);
1332 };
1333 for (MachineInstr *MI : OrderedInsts) {
1334 if (MI->isPHI()) {
1335 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1336 UpdateTargetRegs(Reg);
1337 } else {
1338 for (auto Use : ROMap.find(MI)->getSecond().Uses)
1339 UpdateTargetRegs(Use.RegUnit);
1340 }
1341 }
1342
1343 const auto InstrScore = [&Stages](MachineInstr *MI) {
1344 return Stages[MI] + MI->isPHI();
1345 };
1346
1348 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1349 for (auto Use : ROMap.find(MI)->getSecond().Uses) {
1350 auto Reg = Use.RegUnit;
1351 if (!TargetRegs.contains(Reg))
1352 continue;
1353 auto Ite = LastUseMI.find(Reg);
1354 if (Ite == LastUseMI.end()) {
1355 LastUseMI[Reg] = MI;
1356 } else {
1357 MachineInstr *Orig = Ite->second;
1358 MachineInstr *New = MI;
1359 if (InstrScore(Orig) < InstrScore(New))
1360 LastUseMI[Reg] = New;
1361 }
1362 }
1363 }
1364
1365 Instr2LastUsesTy LastUses;
1366 for (auto &Entry : LastUseMI)
1367 LastUses[Entry.second].insert(Entry.first);
1368 return LastUses;
1369 }
1370
1371 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1372 // iterations and check the register pressure at the point where all stages
1373 // overlapping.
1374 //
1375 // An example of unrolled loop where #Stage is 4..
1376 // Iter i+0 i+1 i+2 i+3
1377 // ------------------------
1378 // Stage 0
1379 // Stage 1 0
1380 // Stage 2 1 0
1381 // Stage 3 2 1 0 <- All stages overlap
1382 //
1383 std::vector<unsigned>
1384 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1385 Instr2StageTy &Stages,
1386 const unsigned StageCount) const {
1387 using RegSetTy = SmallDenseSet<Register, 16>;
1388
1389 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1390 // manage the liveness of the registers independently by iterations.
1391 SmallVector<RegSetTy> LiveRegSets(StageCount);
1392
1393 auto CurSetPressure = InitSetPressure;
1394 auto MaxSetPressure = InitSetPressure;
1395 auto LastUses = computeLastUses(OrderedInsts, Stages);
1396
1397 LLVM_DEBUG({
1398 dbgs() << "Ordered instructions:\n";
1399 for (MachineInstr *MI : OrderedInsts) {
1400 dbgs() << "Stage " << Stages[MI] << ": ";
1401 MI->dump();
1402 }
1403 });
1404
1405 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1406 Register Reg) {
1407 if (!Reg.isValid() || isFixedRegister(Reg))
1408 return;
1409
1410 bool Inserted = RegSet.insert(Reg).second;
1411 if (!Inserted)
1412 return;
1413
1414 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1415 increaseRegisterPressure(CurSetPressure, Reg);
1416 LLVM_DEBUG(dumpPSet(Reg));
1417 };
1418
1419 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1420 Register Reg) {
1421 if (!Reg.isValid() || isFixedRegister(Reg))
1422 return;
1423
1424 // live-in register
1425 if (!RegSet.contains(Reg))
1426 return;
1427
1428 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1429 RegSet.erase(Reg);
1430 decreaseRegisterPressure(CurSetPressure, Reg);
1431 LLVM_DEBUG(dumpPSet(Reg));
1432 };
1433
1434 for (unsigned I = 0; I < StageCount; I++) {
1435 for (MachineInstr *MI : OrderedInsts) {
1436 const auto Stage = Stages[MI];
1437 if (I < Stage)
1438 continue;
1439
1440 const unsigned Iter = I - Stage;
1441
1442 for (auto Def : ROMap.find(MI)->getSecond().Defs)
1443 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1444
1445 for (auto LastUse : LastUses[MI]) {
1446 if (MI->isPHI()) {
1447 if (Iter != 0)
1448 EraseReg(LiveRegSets[Iter - 1], LastUse);
1449 } else {
1450 EraseReg(LiveRegSets[Iter], LastUse);
1451 }
1452 }
1453
1454 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1455 MaxSetPressure[PSet] =
1456 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1457
1458 LLVM_DEBUG({
1459 dbgs() << "CurSetPressure=";
1460 dumpRegisterPressures(CurSetPressure);
1461 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1462 MI->dump();
1463 });
1464 }
1465 }
1466
1467 return MaxSetPressure;
1468 }
1469
1470public:
1471 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1472 const MachineFunction &MF)
1473 : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
1474 TRI(MF.getSubtarget().getRegisterInfo()),
1475 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1476 PressureSetLimit(PSetNum, 0) {}
1477
1478 // Used to calculate register pressure, which is independent of loop
1479 // scheduling.
1480 void init(const RegisterClassInfo &RCI) {
1481 for (MachineInstr &MI : *OrigMBB) {
1482 if (MI.isDebugInstr())
1483 continue;
1484 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1485 }
1486
1487 computeLiveIn();
1488 computePressureSetLimit(RCI);
1489 }
1490
1491 // Calculate the maximum register pressures of the loop and check if they
1492 // exceed the limit
1493 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1494 const unsigned MaxStage) const {
1496 "the percentage of the margin must be between 0 to 100");
1497
1498 OrderedInstsTy OrderedInsts;
1499 Instr2StageTy Stages;
1500 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1501 const auto MaxSetPressure =
1502 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1503
1504 LLVM_DEBUG({
1505 dbgs() << "Dump MaxSetPressure:\n";
1506 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1507 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1508 }
1509 dbgs() << '\n';
1510 });
1511
1512 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1513 unsigned Limit = PressureSetLimit[PSet];
1514 unsigned Margin = Limit * RegPressureMargin / 100;
1515 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1516 << " Margin=" << Margin << "\n");
1517 if (Limit < MaxSetPressure[PSet] + Margin) {
1518 LLVM_DEBUG(
1519 dbgs()
1520 << "Rejected the schedule because of too high register pressure\n");
1521 return true;
1522 }
1523 }
1524 return false;
1525 }
1526};
1527
1528} // end anonymous namespace
1529
1530/// Calculate the resource constrained minimum initiation interval for the
1531/// specified loop. We use the DFA to model the resources needed for
1532/// each instruction, and we ignore dependences. A different DFA is created
1533/// for each cycle that is required. When adding a new instruction, we attempt
1534/// to add it to each existing DFA, until a legal space is found. If the
1535/// instruction cannot be reserved in an existing DFA, we create a new one.
1536unsigned SwingSchedulerDAG::calculateResMII() {
1537 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1539 return RM.calculateResMII();
1540}
1541
1542/// Calculate the recurrence-constrainted minimum initiation interval.
1543/// Iterate over each circuit. Compute the delay(c) and distance(c)
1544/// for each circuit. The II needs to satisfy the inequality
1545/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1546/// II that satisfies the inequality, and the RecMII is the maximum
1547/// of those values.
1548unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1549 unsigned RecMII = 0;
1550
1551 for (NodeSet &Nodes : NodeSets) {
1552 if (Nodes.empty())
1553 continue;
1554
1555 unsigned Delay = Nodes.getLatency();
1556 unsigned Distance = 1;
1557
1558 // ii = ceil(delay / distance)
1559 unsigned CurMII = (Delay + Distance - 1) / Distance;
1560 Nodes.setRecMII(CurMII);
1561 if (CurMII > RecMII)
1562 RecMII = CurMII;
1563 }
1564
1565 return RecMII;
1566}
1567
1568/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1569/// but we do this to find the circuits, and then change them back.
1570static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1572 for (SUnit &SU : SUnits) {
1573 for (SDep &Pred : SU.Preds)
1574 if (Pred.getKind() == SDep::Anti)
1575 DepsAdded.push_back(std::make_pair(&SU, Pred));
1576 }
1577 for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1578 // Remove this anti dependency and add one in the reverse direction.
1579 SUnit *SU = P.first;
1580 SDep &D = P.second;
1581 SUnit *TargetSU = D.getSUnit();
1582 unsigned Reg = D.getReg();
1583 unsigned Lat = D.getLatency();
1584 SU->removePred(D);
1585 SDep Dep(SU, SDep::Anti, Reg);
1586 Dep.setLatency(Lat);
1587 TargetSU->addPred(Dep);
1588 }
1589}
1590
1591/// Create the adjacency structure of the nodes in the graph.
1592void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1593 SwingSchedulerDAG *DAG) {
1594 BitVector Added(SUnits.size());
1595 DenseMap<int, int> OutputDeps;
1596 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1597 Added.reset();
1598 // Add any successor to the adjacency matrix and exclude duplicates.
1599 for (auto &SI : SUnits[i].Succs) {
1600 // Only create a back-edge on the first and last nodes of a dependence
1601 // chain. This records any chains and adds them later.
1602 if (SI.getKind() == SDep::Output) {
1603 int N = SI.getSUnit()->NodeNum;
1604 int BackEdge = i;
1605 auto Dep = OutputDeps.find(BackEdge);
1606 if (Dep != OutputDeps.end()) {
1607 BackEdge = Dep->second;
1608 OutputDeps.erase(Dep);
1609 }
1610 OutputDeps[N] = BackEdge;
1611 }
1612 // Do not process a boundary node, an artificial node.
1613 // A back-edge is processed only if it goes to a Phi.
1614 if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1615 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1616 continue;
1617 int N = SI.getSUnit()->NodeNum;
1618 if (!Added.test(N)) {
1619 AdjK[i].push_back(N);
1620 Added.set(N);
1621 }
1622 }
1623 // A chain edge between a store and a load is treated as a back-edge in the
1624 // adjacency matrix.
1625 for (auto &PI : SUnits[i].Preds) {
1626 if (!SUnits[i].getInstr()->mayStore() ||
1627 !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1628 continue;
1629 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1630 int N = PI.getSUnit()->NodeNum;
1631 if (!Added.test(N)) {
1632 AdjK[i].push_back(N);
1633 Added.set(N);
1634 }
1635 }
1636 }
1637 }
1638 // Add back-edges in the adjacency matrix for the output dependences.
1639 for (auto &OD : OutputDeps)
1640 if (!Added.test(OD.second)) {
1641 AdjK[OD.first].push_back(OD.second);
1642 Added.set(OD.second);
1643 }
1644}
1645
1646/// Identify an elementary circuit in the dependence graph starting at the
1647/// specified node.
1648bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1649 bool HasBackedge) {
1650 SUnit *SV = &SUnits[V];
1651 bool F = false;
1652 Stack.insert(SV);
1653 Blocked.set(V);
1654
1655 for (auto W : AdjK[V]) {
1656 if (NumPaths > MaxPaths)
1657 break;
1658 if (W < S)
1659 continue;
1660 if (W == S) {
1661 if (!HasBackedge)
1662 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1663 F = true;
1664 ++NumPaths;
1665 break;
1666 } else if (!Blocked.test(W)) {
1667 if (circuit(W, S, NodeSets,
1668 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1669 F = true;
1670 }
1671 }
1672
1673 if (F)
1674 unblock(V);
1675 else {
1676 for (auto W : AdjK[V]) {
1677 if (W < S)
1678 continue;
1679 B[W].insert(SV);
1680 }
1681 }
1682 Stack.pop_back();
1683 return F;
1684}
1685
1686/// Unblock a node in the circuit finding algorithm.
1687void SwingSchedulerDAG::Circuits::unblock(int U) {
1688 Blocked.reset(U);
1690 while (!BU.empty()) {
1692 assert(SI != BU.end() && "Invalid B set.");
1693 SUnit *W = *SI;
1694 BU.erase(W);
1695 if (Blocked.test(W->NodeNum))
1696 unblock(W->NodeNum);
1697 }
1698}
1699
1700/// Identify all the elementary circuits in the dependence graph using
1701/// Johnson's circuit algorithm.
1702void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1703 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1704 // but we do this to find the circuits, and then change them back.
1706
1707 Circuits Cir(SUnits, Topo);
1708 // Create the adjacency structure.
1709 Cir.createAdjacencyStructure(this);
1710 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1711 Cir.reset();
1712 Cir.circuit(i, i, NodeSets);
1713 }
1714
1715 // Change the dependences back so that we've created a DAG again.
1717}
1718
1719// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1720// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1721// additional copies that are needed across iterations. An artificial dependence
1722// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1723
1724// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1725// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1726// PHI-------True-Dep------> USEOfPhi
1727
1728// The mutation creates
1729// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1730
1731// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1732// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1733// late to avoid additional copies across iterations. The possible scheduling
1734// order would be
1735// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1736
1737void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1738 for (SUnit &SU : DAG->SUnits) {
1739 // Find the COPY/REG_SEQUENCE instruction.
1740 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1741 continue;
1742
1743 // Record the loop carried PHIs.
1745 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1747
1748 for (auto &Dep : SU.Preds) {
1749 SUnit *TmpSU = Dep.getSUnit();
1750 MachineInstr *TmpMI = TmpSU->getInstr();
1751 SDep::Kind DepKind = Dep.getKind();
1752 // Save the loop carried PHI.
1753 if (DepKind == SDep::Anti && TmpMI->isPHI())
1754 PHISUs.push_back(TmpSU);
1755 // Save the source of COPY/REG_SEQUENCE.
1756 // If the source has no pre-decessors, we will end up creating cycles.
1757 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1758 SrcSUs.push_back(TmpSU);
1759 }
1760
1761 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1762 continue;
1763
1764 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1765 // SUnit to the container.
1767 // Do not use iterator based loop here as we are updating the container.
1768 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1769 for (auto &Dep : PHISUs[Index]->Succs) {
1770 if (Dep.getKind() != SDep::Data)
1771 continue;
1772
1773 SUnit *TmpSU = Dep.getSUnit();
1774 MachineInstr *TmpMI = TmpSU->getInstr();
1775 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1776 PHISUs.push_back(TmpSU);
1777 continue;
1778 }
1779 UseSUs.push_back(TmpSU);
1780 }
1781 }
1782
1783 if (UseSUs.size() == 0)
1784 continue;
1785
1786 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1787 // Add the artificial dependencies if it does not form a cycle.
1788 for (auto *I : UseSUs) {
1789 for (auto *Src : SrcSUs) {
1790 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1791 Src->addPred(SDep(I, SDep::Artificial));
1792 SDAG->Topo.AddPred(Src, I);
1793 }
1794 }
1795 }
1796 }
1797}
1798
1799/// Return true for DAG nodes that we ignore when computing the cost functions.
1800/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1801/// in the calculation of the ASAP, ALAP, etc functions.
1802static bool ignoreDependence(const SDep &D, bool isPred) {
1803 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1804 return true;
1805 return D.getKind() == SDep::Anti && isPred;
1806}
1807
1808/// Compute several functions need to order the nodes for scheduling.
1809/// ASAP - Earliest time to schedule a node.
1810/// ALAP - Latest time to schedule a node.
1811/// MOV - Mobility function, difference between ALAP and ASAP.
1812/// D - Depth of each node.
1813/// H - Height of each node.
1814void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1815 ScheduleInfo.resize(SUnits.size());
1816
1817 LLVM_DEBUG({
1818 for (int I : Topo) {
1819 const SUnit &SU = SUnits[I];
1820 dumpNode(SU);
1821 }
1822 });
1823
1824 int maxASAP = 0;
1825 // Compute ASAP and ZeroLatencyDepth.
1826 for (int I : Topo) {
1827 int asap = 0;
1828 int zeroLatencyDepth = 0;
1829 SUnit *SU = &SUnits[I];
1830 for (const SDep &P : SU->Preds) {
1831 SUnit *pred = P.getSUnit();
1832 if (P.getLatency() == 0)
1833 zeroLatencyDepth =
1834 std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1835 if (ignoreDependence(P, true))
1836 continue;
1837 asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1838 getDistance(pred, SU, P) * MII));
1839 }
1840 maxASAP = std::max(maxASAP, asap);
1841 ScheduleInfo[I].ASAP = asap;
1842 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1843 }
1844
1845 // Compute ALAP, ZeroLatencyHeight, and MOV.
1846 for (int I : llvm::reverse(Topo)) {
1847 int alap = maxASAP;
1848 int zeroLatencyHeight = 0;
1849 SUnit *SU = &SUnits[I];
1850 for (const SDep &S : SU->Succs) {
1851 SUnit *succ = S.getSUnit();
1852 if (succ->isBoundaryNode())
1853 continue;
1854 if (S.getLatency() == 0)
1855 zeroLatencyHeight =
1856 std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1857 if (ignoreDependence(S, true))
1858 continue;
1859 alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1860 getDistance(SU, succ, S) * MII));
1861 }
1862
1863 ScheduleInfo[I].ALAP = alap;
1864 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1865 }
1866
1867 // After computing the node functions, compute the summary for each node set.
1868 for (NodeSet &I : NodeSets)
1869 I.computeNodeSetInfo(this);
1870
1871 LLVM_DEBUG({
1872 for (unsigned i = 0; i < SUnits.size(); i++) {
1873 dbgs() << "\tNode " << i << ":\n";
1874 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1875 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1876 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1877 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1878 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1879 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1880 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1881 }
1882 });
1883}
1884
1885/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1886/// as the predecessors of the elements of NodeOrder that are not also in
1887/// NodeOrder.
1890 const NodeSet *S = nullptr) {
1891 Preds.clear();
1892 for (const SUnit *SU : NodeOrder) {
1893 for (const SDep &Pred : SU->Preds) {
1894 if (S && S->count(Pred.getSUnit()) == 0)
1895 continue;
1896 if (ignoreDependence(Pred, true))
1897 continue;
1898 if (NodeOrder.count(Pred.getSUnit()) == 0)
1899 Preds.insert(Pred.getSUnit());
1900 }
1901 // Back-edges are predecessors with an anti-dependence.
1902 for (const SDep &Succ : SU->Succs) {
1903 if (Succ.getKind() != SDep::Anti)
1904 continue;
1905 if (S && S->count(Succ.getSUnit()) == 0)
1906 continue;
1907 if (NodeOrder.count(Succ.getSUnit()) == 0)
1908 Preds.insert(Succ.getSUnit());
1909 }
1910 }
1911 return !Preds.empty();
1912}
1913
1914/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1915/// as the successors of the elements of NodeOrder that are not also in
1916/// NodeOrder.
1919 const NodeSet *S = nullptr) {
1920 Succs.clear();
1921 for (const SUnit *SU : NodeOrder) {
1922 for (const SDep &Succ : SU->Succs) {
1923 if (S && S->count(Succ.getSUnit()) == 0)
1924 continue;
1925 if (ignoreDependence(Succ, false))
1926 continue;
1927 if (NodeOrder.count(Succ.getSUnit()) == 0)
1928 Succs.insert(Succ.getSUnit());
1929 }
1930 for (const SDep &Pred : SU->Preds) {
1931 if (Pred.getKind() != SDep::Anti)
1932 continue;
1933 if (S && S->count(Pred.getSUnit()) == 0)
1934 continue;
1935 if (NodeOrder.count(Pred.getSUnit()) == 0)
1936 Succs.insert(Pred.getSUnit());
1937 }
1938 }
1939 return !Succs.empty();
1940}
1941
1942/// Return true if there is a path from the specified node to any of the nodes
1943/// in DestNodes. Keep track and return the nodes in any path.
1944static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1945 SetVector<SUnit *> &DestNodes,
1946 SetVector<SUnit *> &Exclude,
1947 SmallPtrSet<SUnit *, 8> &Visited) {
1948 if (Cur->isBoundaryNode())
1949 return false;
1950 if (Exclude.contains(Cur))
1951 return false;
1952 if (DestNodes.contains(Cur))
1953 return true;
1954 if (!Visited.insert(Cur).second)
1955 return Path.contains(Cur);
1956 bool FoundPath = false;
1957 for (auto &SI : Cur->Succs)
1958 if (!ignoreDependence(SI, false))
1959 FoundPath |=
1960 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1961 for (auto &PI : Cur->Preds)
1962 if (PI.getKind() == SDep::Anti)
1963 FoundPath |=
1964 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1965 if (FoundPath)
1966 Path.insert(Cur);
1967 return FoundPath;
1968}
1969
1970/// Compute the live-out registers for the instructions in a node-set.
1971/// The live-out registers are those that are defined in the node-set,
1972/// but not used. Except for use operands of Phis.
1974 NodeSet &NS) {
1979 for (SUnit *SU : NS) {
1980 const MachineInstr *MI = SU->getInstr();
1981 if (MI->isPHI())
1982 continue;
1983 for (const MachineOperand &MO : MI->all_uses()) {
1984 Register Reg = MO.getReg();
1985 if (Reg.isVirtual())
1986 Uses.insert(Reg);
1987 else if (MRI.isAllocatable(Reg))
1988 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1989 Uses.insert(Unit);
1990 }
1991 }
1992 for (SUnit *SU : NS)
1993 for (const MachineOperand &MO : SU->getInstr()->all_defs())
1994 if (!MO.isDead()) {
1995 Register Reg = MO.getReg();
1996 if (Reg.isVirtual()) {
1997 if (!Uses.count(Reg))
1998 LiveOutRegs.push_back(RegisterMaskPair(Reg,
2000 } else if (MRI.isAllocatable(Reg)) {
2001 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2002 if (!Uses.count(Unit))
2003 LiveOutRegs.push_back(
2005 }
2006 }
2007 RPTracker.addLiveRegs(LiveOutRegs);
2008}
2009
2010/// A heuristic to filter nodes in recurrent node-sets if the register
2011/// pressure of a set is too high.
2012void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2013 for (auto &NS : NodeSets) {
2014 // Skip small node-sets since they won't cause register pressure problems.
2015 if (NS.size() <= 2)
2016 continue;
2017 IntervalPressure RecRegPressure;
2018 RegPressureTracker RecRPTracker(RecRegPressure);
2019 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2020 computeLiveOuts(MF, RecRPTracker, NS);
2021 RecRPTracker.closeBottom();
2022
2023 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2024 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2025 return A->NodeNum > B->NodeNum;
2026 });
2027
2028 for (auto &SU : SUnits) {
2029 // Since we're computing the register pressure for a subset of the
2030 // instructions in a block, we need to set the tracker for each
2031 // instruction in the node-set. The tracker is set to the instruction
2032 // just after the one we're interested in.
2034 RecRPTracker.setPos(std::next(CurInstI));
2035
2036 RegPressureDelta RPDelta;
2037 ArrayRef<PressureChange> CriticalPSets;
2038 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2039 CriticalPSets,
2040 RecRegPressure.MaxSetPressure);
2041 if (RPDelta.Excess.isValid()) {
2042 LLVM_DEBUG(
2043 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2045 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2046 NS.setExceedPressure(SU);
2047 break;
2048 }
2049 RecRPTracker.recede();
2050 }
2051 }
2052}
2053
2054/// A heuristic to colocate node sets that have the same set of
2055/// successors.
2056void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2057 unsigned Colocate = 0;
2058 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2059 NodeSet &N1 = NodeSets[i];
2061 if (N1.empty() || !succ_L(N1, S1))
2062 continue;
2063 for (int j = i + 1; j < e; ++j) {
2064 NodeSet &N2 = NodeSets[j];
2065 if (N1.compareRecMII(N2) != 0)
2066 continue;
2068 if (N2.empty() || !succ_L(N2, S2))
2069 continue;
2070 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2071 N1.setColocate(++Colocate);
2072 N2.setColocate(Colocate);
2073 break;
2074 }
2075 }
2076 }
2077}
2078
2079/// Check if the existing node-sets are profitable. If not, then ignore the
2080/// recurrent node-sets, and attempt to schedule all nodes together. This is
2081/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2082/// then it's best to try to schedule all instructions together instead of
2083/// starting with the recurrent node-sets.
2084void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2085 // Look for loops with a large MII.
2086 if (MII < 17)
2087 return;
2088 // Check if the node-set contains only a simple add recurrence.
2089 for (auto &NS : NodeSets) {
2090 if (NS.getRecMII() > 2)
2091 return;
2092 if (NS.getMaxDepth() > MII)
2093 return;
2094 }
2095 NodeSets.clear();
2096 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2097}
2098
2099/// Add the nodes that do not belong to a recurrence set into groups
2100/// based upon connected components.
2101void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2102 SetVector<SUnit *> NodesAdded;
2104 // Add the nodes that are on a path between the previous node sets and
2105 // the current node set.
2106 for (NodeSet &I : NodeSets) {
2108 // Add the nodes from the current node set to the previous node set.
2109 if (succ_L(I, N)) {
2111 for (SUnit *NI : N) {
2112 Visited.clear();
2113 computePath(NI, Path, NodesAdded, I, Visited);
2114 }
2115 if (!Path.empty())
2116 I.insert(Path.begin(), Path.end());
2117 }
2118 // Add the nodes from the previous node set to the current node set.
2119 N.clear();
2120 if (succ_L(NodesAdded, N)) {
2122 for (SUnit *NI : N) {
2123 Visited.clear();
2124 computePath(NI, Path, I, NodesAdded, Visited);
2125 }
2126 if (!Path.empty())
2127 I.insert(Path.begin(), Path.end());
2128 }
2129 NodesAdded.insert(I.begin(), I.end());
2130 }
2131
2132 // Create a new node set with the connected nodes of any successor of a node
2133 // in a recurrent set.
2134 NodeSet NewSet;
2136 if (succ_L(NodesAdded, N))
2137 for (SUnit *I : N)
2138 addConnectedNodes(I, NewSet, NodesAdded);
2139 if (!NewSet.empty())
2140 NodeSets.push_back(NewSet);
2141
2142 // Create a new node set with the connected nodes of any predecessor of a node
2143 // in a recurrent set.
2144 NewSet.clear();
2145 if (pred_L(NodesAdded, N))
2146 for (SUnit *I : N)
2147 addConnectedNodes(I, NewSet, NodesAdded);
2148 if (!NewSet.empty())
2149 NodeSets.push_back(NewSet);
2150
2151 // Create new nodes sets with the connected nodes any remaining node that
2152 // has no predecessor.
2153 for (SUnit &SU : SUnits) {
2154 if (NodesAdded.count(&SU) == 0) {
2155 NewSet.clear();
2156 addConnectedNodes(&SU, NewSet, NodesAdded);
2157 if (!NewSet.empty())
2158 NodeSets.push_back(NewSet);
2159 }
2160 }
2161}
2162
2163/// Add the node to the set, and add all of its connected nodes to the set.
2164void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2165 SetVector<SUnit *> &NodesAdded) {
2166 NewSet.insert(SU);
2167 NodesAdded.insert(SU);
2168 for (auto &SI : SU->Succs) {
2169 SUnit *Successor = SI.getSUnit();
2170 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2171 NodesAdded.count(Successor) == 0)
2172 addConnectedNodes(Successor, NewSet, NodesAdded);
2173 }
2174 for (auto &PI : SU->Preds) {
2175 SUnit *Predecessor = PI.getSUnit();
2176 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2177 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2178 }
2179}
2180
2181/// Return true if Set1 contains elements in Set2. The elements in common
2182/// are returned in a different container.
2183static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2185 Result.clear();
2186 for (SUnit *SU : Set1) {
2187 if (Set2.count(SU) != 0)
2188 Result.insert(SU);
2189 }
2190 return !Result.empty();
2191}
2192
2193/// Merge the recurrence node sets that have the same initial node.
2194void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2195 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2196 ++I) {
2197 NodeSet &NI = *I;
2198 for (NodeSetType::iterator J = I + 1; J != E;) {
2199 NodeSet &NJ = *J;
2200 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2201 if (NJ.compareRecMII(NI) > 0)
2202 NI.setRecMII(NJ.getRecMII());
2203 for (SUnit *SU : *J)
2204 I->insert(SU);
2205 NodeSets.erase(J);
2206 E = NodeSets.end();
2207 } else {
2208 ++J;
2209 }
2210 }
2211 }
2212}
2213
2214/// Remove nodes that have been scheduled in previous NodeSets.
2215void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2216 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2217 ++I)
2218 for (NodeSetType::iterator J = I + 1; J != E;) {
2219 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2220
2221 if (J->empty()) {
2222 NodeSets.erase(J);
2223 E = NodeSets.end();
2224 } else {
2225 ++J;
2226 }
2227 }
2228}
2229
2230/// Compute an ordered list of the dependence graph nodes, which
2231/// indicates the order that the nodes will be scheduled. This is a
2232/// two-level algorithm. First, a partial order is created, which
2233/// consists of a list of sets ordered from highest to lowest priority.
2234void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2236 NodeOrder.clear();
2237
2238 for (auto &Nodes : NodeSets) {
2239 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2240 OrderKind Order;
2242 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2243 R.insert(N.begin(), N.end());
2244 Order = BottomUp;
2245 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2246 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2247 R.insert(N.begin(), N.end());
2248 Order = TopDown;
2249 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2250 } else if (isIntersect(N, Nodes, R)) {
2251 // If some of the successors are in the existing node-set, then use the
2252 // top-down ordering.
2253 Order = TopDown;
2254 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2255 } else if (NodeSets.size() == 1) {
2256 for (const auto &N : Nodes)
2257 if (N->Succs.size() == 0)
2258 R.insert(N);
2259 Order = BottomUp;
2260 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2261 } else {
2262 // Find the node with the highest ASAP.
2263 SUnit *maxASAP = nullptr;
2264 for (SUnit *SU : Nodes) {
2265 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2266 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2267 maxASAP = SU;
2268 }
2269 R.insert(maxASAP);
2270 Order = BottomUp;
2271 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2272 }
2273
2274 while (!R.empty()) {
2275 if (Order == TopDown) {
2276 // Choose the node with the maximum height. If more than one, choose
2277 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2278 // choose the node with the lowest MOV.
2279 while (!R.empty()) {
2280 SUnit *maxHeight = nullptr;
2281 for (SUnit *I : R) {
2282 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2283 maxHeight = I;
2284 else if (getHeight(I) == getHeight(maxHeight) &&
2286 maxHeight = I;
2287 else if (getHeight(I) == getHeight(maxHeight) &&
2289 getZeroLatencyHeight(maxHeight) &&
2290 getMOV(I) < getMOV(maxHeight))
2291 maxHeight = I;
2292 }
2293 NodeOrder.insert(maxHeight);
2294 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2295 R.remove(maxHeight);
2296 for (const auto &I : maxHeight->Succs) {
2297 if (Nodes.count(I.getSUnit()) == 0)
2298 continue;
2299 if (NodeOrder.contains(I.getSUnit()))
2300 continue;
2301 if (ignoreDependence(I, false))
2302 continue;
2303 R.insert(I.getSUnit());
2304 }
2305 // Back-edges are predecessors with an anti-dependence.
2306 for (const auto &I : maxHeight->Preds) {
2307 if (I.getKind() != SDep::Anti)
2308 continue;
2309 if (Nodes.count(I.getSUnit()) == 0)
2310 continue;
2311 if (NodeOrder.contains(I.getSUnit()))
2312 continue;
2313 R.insert(I.getSUnit());
2314 }
2315 }
2316 Order = BottomUp;
2317 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2319 if (pred_L(NodeOrder, N, &Nodes))
2320 R.insert(N.begin(), N.end());
2321 } else {
2322 // Choose the node with the maximum depth. If more than one, choose
2323 // the node with the maximum ZeroLatencyDepth. If still more than one,
2324 // choose the node with the lowest MOV.
2325 while (!R.empty()) {
2326 SUnit *maxDepth = nullptr;
2327 for (SUnit *I : R) {
2328 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2329 maxDepth = I;
2330 else if (getDepth(I) == getDepth(maxDepth) &&
2332 maxDepth = I;
2333 else if (getDepth(I) == getDepth(maxDepth) &&
2335 getMOV(I) < getMOV(maxDepth))
2336 maxDepth = I;
2337 }
2338 NodeOrder.insert(maxDepth);
2339 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2340 R.remove(maxDepth);
2341 if (Nodes.isExceedSU(maxDepth)) {
2342 Order = TopDown;
2343 R.clear();
2344 R.insert(Nodes.getNode(0));
2345 break;
2346 }
2347 for (const auto &I : maxDepth->Preds) {
2348 if (Nodes.count(I.getSUnit()) == 0)
2349 continue;
2350 if (NodeOrder.contains(I.getSUnit()))
2351 continue;
2352 R.insert(I.getSUnit());
2353 }
2354 // Back-edges are predecessors with an anti-dependence.
2355 for (const auto &I : maxDepth->Succs) {
2356 if (I.getKind() != SDep::Anti)
2357 continue;
2358 if (Nodes.count(I.getSUnit()) == 0)
2359 continue;
2360 if (NodeOrder.contains(I.getSUnit()))
2361 continue;
2362 R.insert(I.getSUnit());
2363 }
2364 }
2365 Order = TopDown;
2366 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2368 if (succ_L(NodeOrder, N, &Nodes))
2369 R.insert(N.begin(), N.end());
2370 }
2371 }
2372 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2373 }
2374
2375 LLVM_DEBUG({
2376 dbgs() << "Node order: ";
2377 for (SUnit *I : NodeOrder)
2378 dbgs() << " " << I->NodeNum << " ";
2379 dbgs() << "\n";
2380 });
2381}
2382
2383/// Process the nodes in the computed order and create the pipelined schedule
2384/// of the instructions, if possible. Return true if a schedule is found.
2385bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2386
2387 if (NodeOrder.empty()){
2388 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2389 return false;
2390 }
2391
2392 bool scheduleFound = false;
2393 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2394 if (LimitRegPressure) {
2395 HRPDetector =
2396 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2397 HRPDetector->init(RegClassInfo);
2398 }
2399 // Keep increasing II until a valid schedule is found.
2400 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2401 Schedule.reset();
2402 Schedule.setInitiationInterval(II);
2403 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2404
2405 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2406 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2407 do {
2408 SUnit *SU = *NI;
2409
2410 // Compute the schedule time for the instruction, which is based
2411 // upon the scheduled time for any predecessors/successors.
2412 int EarlyStart = INT_MIN;
2413 int LateStart = INT_MAX;
2414 // These values are set when the size of the schedule window is limited
2415 // due to chain dependences.
2416 int SchedEnd = INT_MAX;
2417 int SchedStart = INT_MIN;
2418 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2419 II, this);
2420 LLVM_DEBUG({
2421 dbgs() << "\n";
2422 dbgs() << "Inst (" << SU->NodeNum << ") ";
2423 SU->getInstr()->dump();
2424 dbgs() << "\n";
2425 });
2426 LLVM_DEBUG({
2427 dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2428 LateStart, SchedEnd, SchedStart);
2429 });
2430
2431 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2432 SchedStart > LateStart)
2433 scheduleFound = false;
2434 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2435 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2436 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2437 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2438 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2439 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2440 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2441 SchedEnd =
2442 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2443 // When scheduling a Phi it is better to start at the late cycle and go
2444 // backwards. The default order may insert the Phi too far away from
2445 // its first dependence.
2446 if (SU->getInstr()->isPHI())
2447 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2448 else
2449 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2450 } else {
2451 int FirstCycle = Schedule.getFirstCycle();
2452 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2453 FirstCycle + getASAP(SU) + II - 1, II);
2454 }
2455 // Even if we find a schedule, make sure the schedule doesn't exceed the
2456 // allowable number of stages. We keep trying if this happens.
2457 if (scheduleFound)
2458 if (SwpMaxStages > -1 &&
2459 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2460 scheduleFound = false;
2461
2462 LLVM_DEBUG({
2463 if (!scheduleFound)
2464 dbgs() << "\tCan't schedule\n";
2465 });
2466 } while (++NI != NE && scheduleFound);
2467
2468 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2469 if (scheduleFound)
2470 scheduleFound =
2471 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2472
2473 // If a schedule is found, check if it is a valid schedule too.
2474 if (scheduleFound)
2475 scheduleFound = Schedule.isValidSchedule(this);
2476
2477 // If a schedule was found and the option is enabled, check if the schedule
2478 // might generate additional register spills/fills.
2479 if (scheduleFound && LimitRegPressure)
2480 scheduleFound =
2481 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2482 }
2483
2484 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2485 << " (II=" << Schedule.getInitiationInterval()
2486 << ")\n");
2487
2488 if (scheduleFound) {
2489 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2490 if (!scheduleFound)
2491 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2492 }
2493
2494 if (scheduleFound) {
2495 Schedule.finalizeSchedule(this);
2496 Pass.ORE->emit([&]() {
2498 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2499 << "Schedule found with Initiation Interval: "
2500 << ore::NV("II", Schedule.getInitiationInterval())
2501 << ", MaxStageCount: "
2502 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2503 });
2504 } else
2505 Schedule.reset();
2506
2507 return scheduleFound && Schedule.getMaxStageCount() > 0;
2508}
2509
2510/// Return true if we can compute the amount the instruction changes
2511/// during each iteration. Set Delta to the amount of the change.
2512bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2514 const MachineOperand *BaseOp;
2515 int64_t Offset;
2516 bool OffsetIsScalable;
2517 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2518 return false;
2519
2520 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2521 if (OffsetIsScalable)
2522 return false;
2523
2524 if (!BaseOp->isReg())
2525 return false;
2526
2527 Register BaseReg = BaseOp->getReg();
2528
2530 // Check if there is a Phi. If so, get the definition in the loop.
2531 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2532 if (BaseDef && BaseDef->isPHI()) {
2533 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2534 BaseDef = MRI.getVRegDef(BaseReg);
2535 }
2536 if (!BaseDef)
2537 return false;
2538
2539 int D = 0;
2540 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2541 return false;
2542
2543 Delta = D;
2544 return true;
2545}
2546
2547/// Check if we can change the instruction to use an offset value from the
2548/// previous iteration. If so, return true and set the base and offset values
2549/// so that we can rewrite the load, if necessary.
2550/// v1 = Phi(v0, v3)
2551/// v2 = load v1, 0
2552/// v3 = post_store v1, 4, x
2553/// This function enables the load to be rewritten as v2 = load v3, 4.
2554bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2555 unsigned &BasePos,
2556 unsigned &OffsetPos,
2557 unsigned &NewBase,
2558 int64_t &Offset) {
2559 // Get the load instruction.
2560 if (TII->isPostIncrement(*MI))
2561 return false;
2562 unsigned BasePosLd, OffsetPosLd;
2563 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2564 return false;
2565 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2566
2567 // Look for the Phi instruction.
2568 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2569 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2570 if (!Phi || !Phi->isPHI())
2571 return false;
2572 // Get the register defined in the loop block.
2573 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2574 if (!PrevReg)
2575 return false;
2576
2577 // Check for the post-increment load/store instruction.
2578 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2579 if (!PrevDef || PrevDef == MI)
2580 return false;
2581
2582 if (!TII->isPostIncrement(*PrevDef))
2583 return false;
2584
2585 unsigned BasePos1 = 0, OffsetPos1 = 0;
2586 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2587 return false;
2588
2589 // Make sure that the instructions do not access the same memory location in
2590 // the next iteration.
2591 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2592 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2594 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2595 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2596 MF.deleteMachineInstr(NewMI);
2597 if (!Disjoint)
2598 return false;
2599
2600 // Set the return value once we determine that we return true.
2601 BasePos = BasePosLd;
2602 OffsetPos = OffsetPosLd;
2603 NewBase = PrevReg;
2604 Offset = StoreOffset;
2605 return true;
2606}
2607
2608/// Apply changes to the instruction if needed. The changes are need
2609/// to improve the scheduling and depend up on the final schedule.
2611 SMSchedule &Schedule) {
2612 SUnit *SU = getSUnit(MI);
2614 InstrChanges.find(SU);
2615 if (It != InstrChanges.end()) {
2616 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2617 unsigned BasePos, OffsetPos;
2618 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2619 return;
2620 Register BaseReg = MI->getOperand(BasePos).getReg();
2621 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2622 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2623 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2624 int BaseStageNum = Schedule.stageScheduled(SU);
2625 int BaseCycleNum = Schedule.cycleScheduled(SU);
2626 if (BaseStageNum < DefStageNum) {
2628 int OffsetDiff = DefStageNum - BaseStageNum;
2629 if (DefCycleNum < BaseCycleNum) {
2630 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2631 if (OffsetDiff > 0)
2632 --OffsetDiff;
2633 }
2634 int64_t NewOffset =
2635 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2636 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2637 SU->setInstr(NewMI);
2638 MISUnitMap[NewMI] = SU;
2639 NewMIs[MI] = NewMI;
2640 }
2641 }
2642}
2643
2644/// Return the instruction in the loop that defines the register.
2645/// If the definition is a Phi, then follow the Phi operand to
2646/// the instruction in the loop.
2647MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2649 MachineInstr *Def = MRI.getVRegDef(Reg);
2650 while (Def->isPHI()) {
2651 if (!Visited.insert(Def).second)
2652 break;
2653 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2654 if (Def->getOperand(i + 1).getMBB() == BB) {
2655 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2656 break;
2657 }
2658 }
2659 return Def;
2660}
2661
2662/// Return true for an order or output dependence that is loop carried
2663/// potentially. A dependence is loop carried if the destination defines a value
2664/// that may be used or defined by the source in a subsequent iteration.
2666 bool isSucc) {
2667 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2668 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2669 return false;
2670
2672 return true;
2673
2674 if (Dep.getKind() == SDep::Output)
2675 return true;
2676
2677 MachineInstr *SI = Source->getInstr();
2678 MachineInstr *DI = Dep.getSUnit()->getInstr();
2679 if (!isSucc)
2680 std::swap(SI, DI);
2681 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2682
2683 // Assume ordered loads and stores may have a loop carried dependence.
2684 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2685 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2686 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2687 return true;
2688
2689 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2690 return false;
2691
2692 // The conservative assumption is that a dependence between memory operations
2693 // may be loop carried. The following code checks when it can be proved that
2694 // there is no loop carried dependence.
2695 unsigned DeltaS, DeltaD;
2696 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2697 return true;
2698
2699 const MachineOperand *BaseOpS, *BaseOpD;
2700 int64_t OffsetS, OffsetD;
2701 bool OffsetSIsScalable, OffsetDIsScalable;
2703 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2704 TRI) ||
2705 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2706 TRI))
2707 return true;
2708
2709 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2710 "Expected offsets to be byte offsets");
2711
2712 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2713 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2714 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2715 return true;
2716
2717 unsigned InitValS = 0;
2718 unsigned LoopValS = 0;
2719 unsigned InitValD = 0;
2720 unsigned LoopValD = 0;
2721 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2722 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2723 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2724 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2725
2726 if (!InitDefS->isIdenticalTo(*InitDefD))
2727 return true;
2728
2729 // Check that the base register is incremented by a constant value for each
2730 // iteration.
2731 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2732 int D = 0;
2733 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2734 return true;
2735
2736 uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
2737 uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
2738
2739 // This is the main test, which checks the offset values and the loop
2740 // increment value to determine if the accesses may be loop carried.
2741 if (AccessSizeS == MemoryLocation::UnknownSize ||
2742 AccessSizeD == MemoryLocation::UnknownSize)
2743 return true;
2744
2745 if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
2746 return true;
2747
2748 return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
2749}
2750
2751void SwingSchedulerDAG::postProcessDAG() {
2752 for (auto &M : Mutations)
2753 M->apply(this);
2754}
2755
2756/// Try to schedule the node at the specified StartCycle and continue
2757/// until the node is schedule or the EndCycle is reached. This function
2758/// returns true if the node is scheduled. This routine may search either
2759/// forward or backward for a place to insert the instruction based upon
2760/// the relative values of StartCycle and EndCycle.
2761bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2762 bool forward = true;
2763 LLVM_DEBUG({
2764 dbgs() << "Trying to insert node between " << StartCycle << " and "
2765 << EndCycle << " II: " << II << "\n";
2766 });
2767 if (StartCycle > EndCycle)
2768 forward = false;
2769
2770 // The terminating condition depends on the direction.
2771 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2772 for (int curCycle = StartCycle; curCycle != termCycle;
2773 forward ? ++curCycle : --curCycle) {
2774
2775 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2776 ProcItinResources.canReserveResources(*SU, curCycle)) {
2777 LLVM_DEBUG({
2778 dbgs() << "\tinsert at cycle " << curCycle << " ";
2779 SU->getInstr()->dump();
2780 });
2781
2782 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2783 ProcItinResources.reserveResources(*SU, curCycle);
2784 ScheduledInstrs[curCycle].push_back(SU);
2785 InstrToCycle.insert(std::make_pair(SU, curCycle));
2786 if (curCycle > LastCycle)
2787 LastCycle = curCycle;
2788 if (curCycle < FirstCycle)
2789 FirstCycle = curCycle;
2790 return true;
2791 }
2792 LLVM_DEBUG({
2793 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2794 SU->getInstr()->dump();
2795 });
2796 }
2797 return false;
2798}
2799
2800// Return the cycle of the earliest scheduled instruction in the chain.
2803 SmallVector<SDep, 8> Worklist;
2804 Worklist.push_back(Dep);
2805 int EarlyCycle = INT_MAX;
2806 while (!Worklist.empty()) {
2807 const SDep &Cur = Worklist.pop_back_val();
2808 SUnit *PrevSU = Cur.getSUnit();
2809 if (Visited.count(PrevSU))
2810 continue;
2811 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2812 if (it == InstrToCycle.end())
2813 continue;
2814 EarlyCycle = std::min(EarlyCycle, it->second);
2815 for (const auto &PI : PrevSU->Preds)
2816 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2817 Worklist.push_back(PI);
2818 Visited.insert(PrevSU);
2819 }
2820 return EarlyCycle;
2821}
2822
2823// Return the cycle of the latest scheduled instruction in the chain.
2826 SmallVector<SDep, 8> Worklist;
2827 Worklist.push_back(Dep);
2828 int LateCycle = INT_MIN;
2829 while (!Worklist.empty()) {
2830 const SDep &Cur = Worklist.pop_back_val();
2831 SUnit *SuccSU = Cur.getSUnit();
2832 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2833 continue;
2834 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2835 if (it == InstrToCycle.end())
2836 continue;
2837 LateCycle = std::max(LateCycle, it->second);
2838 for (const auto &SI : SuccSU->Succs)
2839 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2840 Worklist.push_back(SI);
2841 Visited.insert(SuccSU);
2842 }
2843 return LateCycle;
2844}
2845
2846/// If an instruction has a use that spans multiple iterations, then
2847/// return true. These instructions are characterized by having a back-ege
2848/// to a Phi, which contains a reference to another Phi.
2850 for (auto &P : SU->Preds)
2851 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2852 for (auto &S : P.getSUnit()->Succs)
2853 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2854 return P.getSUnit();
2855 return nullptr;
2856}
2857
2858/// Compute the scheduling start slot for the instruction. The start slot
2859/// depends on any predecessor or successor nodes scheduled already.
2860void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2861 int *MinEnd, int *MaxStart, int II,
2862 SwingSchedulerDAG *DAG) {
2863 // Iterate over each instruction that has been scheduled already. The start
2864 // slot computation depends on whether the previously scheduled instruction
2865 // is a predecessor or successor of the specified instruction.
2866 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2867
2868 // Iterate over each instruction in the current cycle.
2869 for (SUnit *I : getInstructions(cycle)) {
2870 // Because we're processing a DAG for the dependences, we recognize
2871 // the back-edge in recurrences by anti dependences.
2872 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2873 const SDep &Dep = SU->Preds[i];
2874 if (Dep.getSUnit() == I) {
2875 if (!DAG->isBackedge(SU, Dep)) {
2876 int EarlyStart = cycle + Dep.getLatency() -
2877 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2878 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2879 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2880 int End = earliestCycleInChain(Dep) + (II - 1);
2881 *MinEnd = std::min(*MinEnd, End);
2882 }
2883 } else {
2884 int LateStart = cycle - Dep.getLatency() +
2885 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2886 *MinLateStart = std::min(*MinLateStart, LateStart);
2887 }
2888 }
2889 // For instruction that requires multiple iterations, make sure that
2890 // the dependent instruction is not scheduled past the definition.
2891 SUnit *BE = multipleIterations(I, DAG);
2892 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2893 !SU->isPred(I))
2894 *MinLateStart = std::min(*MinLateStart, cycle);
2895 }
2896 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2897 if (SU->Succs[i].getSUnit() == I) {
2898 const SDep &Dep = SU->Succs[i];
2899 if (!DAG->isBackedge(SU, Dep)) {
2900 int LateStart = cycle - Dep.getLatency() +
2901 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2902 *MinLateStart = std::min(*MinLateStart, LateStart);
2903 if (DAG->isLoopCarriedDep(SU, Dep)) {
2904 int Start = latestCycleInChain(Dep) + 1 - II;
2905 *MaxStart = std::max(*MaxStart, Start);
2906 }
2907 } else {
2908 int EarlyStart = cycle + Dep.getLatency() -
2909 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2910 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2911 }
2912 }
2913 }
2914 }
2915 }
2916}
2917
2918/// Order the instructions within a cycle so that the definitions occur
2919/// before the uses. Returns true if the instruction is added to the start
2920/// of the list, or false if added to the end.
2922 std::deque<SUnit *> &Insts) const {
2923 MachineInstr *MI = SU->getInstr();
2924 bool OrderBeforeUse = false;
2925 bool OrderAfterDef = false;
2926 bool OrderBeforeDef = false;
2927 unsigned MoveDef = 0;
2928 unsigned MoveUse = 0;
2929 int StageInst1 = stageScheduled(SU);
2930
2931 unsigned Pos = 0;
2932 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2933 ++I, ++Pos) {
2934 for (MachineOperand &MO : MI->operands()) {
2935 if (!MO.isReg() || !MO.getReg().isVirtual())
2936 continue;
2937
2938 Register Reg = MO.getReg();
2939 unsigned BasePos, OffsetPos;
2940 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2941 if (MI->getOperand(BasePos).getReg() == Reg)
2942 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2943 Reg = NewReg;
2944 bool Reads, Writes;
2945 std::tie(Reads, Writes) =
2946 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2947 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2948 OrderBeforeUse = true;
2949 if (MoveUse == 0)
2950 MoveUse = Pos;
2951 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2952 // Add the instruction after the scheduled instruction.
2953 OrderAfterDef = true;
2954 MoveDef = Pos;
2955 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2956 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2957 OrderBeforeUse = true;
2958 if (MoveUse == 0)
2959 MoveUse = Pos;
2960 } else {
2961 OrderAfterDef = true;
2962 MoveDef = Pos;
2963 }
2964 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2965 OrderBeforeUse = true;
2966 if (MoveUse == 0)
2967 MoveUse = Pos;
2968 if (MoveUse != 0) {
2969 OrderAfterDef = true;
2970 MoveDef = Pos - 1;
2971 }
2972 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2973 // Add the instruction before the scheduled instruction.
2974 OrderBeforeUse = true;
2975 if (MoveUse == 0)
2976 MoveUse = Pos;
2977 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2978 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2979 if (MoveUse == 0) {
2980 OrderBeforeDef = true;
2981 MoveUse = Pos;
2982 }
2983 }
2984 }
2985 // Check for order dependences between instructions. Make sure the source
2986 // is ordered before the destination.
2987 for (auto &S : SU->Succs) {
2988 if (S.getSUnit() != *I)
2989 continue;
2990 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2991 OrderBeforeUse = true;
2992 if (Pos < MoveUse)
2993 MoveUse = Pos;
2994 }
2995 // We did not handle HW dependences in previous for loop,
2996 // and we normally set Latency = 0 for Anti deps,
2997 // so may have nodes in same cycle with Anti denpendent on HW regs.
2998 else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2999 OrderBeforeUse = true;
3000 if ((MoveUse == 0) || (Pos < MoveUse))
3001 MoveUse = Pos;
3002 }
3003 }
3004 for (auto &P : SU->Preds) {
3005 if (P.getSUnit() != *I)
3006 continue;
3007 if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3008 OrderAfterDef = true;
3009 MoveDef = Pos;
3010 }
3011 }
3012 }
3013
3014 // A circular dependence.
3015 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3016 OrderBeforeUse = false;
3017
3018 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3019 // to a loop-carried dependence.
3020 if (OrderBeforeDef)
3021 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3022
3023 // The uncommon case when the instruction order needs to be updated because
3024 // there is both a use and def.
3025 if (OrderBeforeUse && OrderAfterDef) {
3026 SUnit *UseSU = Insts.at(MoveUse);
3027 SUnit *DefSU = Insts.at(MoveDef);
3028 if (MoveUse > MoveDef) {
3029 Insts.erase(Insts.begin() + MoveUse);
3030 Insts.erase(Insts.begin() + MoveDef);
3031 } else {
3032 Insts.erase(Insts.begin() + MoveDef);
3033 Insts.erase(Insts.begin() + MoveUse);
3034 }
3035 orderDependence(SSD, UseSU, Insts);
3036 orderDependence(SSD, SU, Insts);
3037 orderDependence(SSD, DefSU, Insts);
3038 return;
3039 }
3040 // Put the new instruction first if there is a use in the list. Otherwise,
3041 // put it at the end of the list.
3042 if (OrderBeforeUse)
3043 Insts.push_front(SU);
3044 else
3045 Insts.push_back(SU);
3046}
3047
3048/// Return true if the scheduled Phi has a loop carried operand.
3050 MachineInstr &Phi) const {
3051 if (!Phi.isPHI())
3052 return false;
3053 assert(Phi.isPHI() && "Expecting a Phi.");
3054 SUnit *DefSU = SSD->getSUnit(&Phi);
3055 unsigned DefCycle = cycleScheduled(DefSU);
3056 int DefStage = stageScheduled(DefSU);
3057
3058 unsigned InitVal = 0;
3059 unsigned LoopVal = 0;
3060 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3061 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3062 if (!UseSU)
3063 return true;
3064 if (UseSU->getInstr()->isPHI())
3065 return true;
3066 unsigned LoopCycle = cycleScheduled(UseSU);
3067 int LoopStage = stageScheduled(UseSU);
3068 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3069}
3070
3071/// Return true if the instruction is a definition that is loop carried
3072/// and defines the use on the next iteration.
3073/// v1 = phi(v2, v3)
3074/// (Def) v3 = op v1
3075/// (MO) = v1
3076/// If MO appears before Def, then v1 and v3 may get assigned to the same
3077/// register.
3079 MachineInstr *Def,
3080 MachineOperand &MO) const {
3081 if (!MO.isReg())
3082 return false;
3083 if (Def->isPHI())
3084 return false;
3085 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3086 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3087 return false;
3088 if (!isLoopCarried(SSD, *Phi))
3089 return false;
3090 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3091 for (MachineOperand &DMO : Def->all_defs()) {
3092 if (DMO.getReg() == LoopReg)
3093 return true;
3094 }
3095 return false;
3096}
3097
3098/// Determine transitive dependences of unpipelineable instructions
3101 SmallSet<SUnit *, 8> DoNotPipeline;
3102 SmallVector<SUnit *, 8> Worklist;
3103
3104 for (auto &SU : SSD->SUnits)
3105 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3106 Worklist.push_back(&SU);
3107
3108 while (!Worklist.empty()) {
3109 auto SU = Worklist.pop_back_val();
3110 if (DoNotPipeline.count(SU))
3111 continue;
3112 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3113 DoNotPipeline.insert(SU);
3114 for (auto &Dep : SU->Preds)
3115 Worklist.push_back(Dep.getSUnit());
3116 if (SU->getInstr()->isPHI())
3117 for (auto &Dep : SU->Succs)
3118 if (Dep.getKind() == SDep::Anti)
3119 Worklist.push_back(Dep.getSUnit());
3120 }
3121 return DoNotPipeline;
3122}
3123
3124// Determine all instructions upon which any unpipelineable instruction depends
3125// and ensure that they are in stage 0. If unable to do so, return false.
3129
3130 int NewLastCycle = INT_MIN;
3131 for (SUnit &SU : SSD->SUnits) {
3132 if (!SU.isInstr())
3133 continue;
3134 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3135 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3136 continue;
3137 }
3138
3139 // Put the non-pipelined instruction as early as possible in the schedule
3140 int NewCycle = getFirstCycle();
3141 for (auto &Dep : SU.Preds)
3142 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3143
3144 int OldCycle = InstrToCycle[&SU];
3145 if (OldCycle != NewCycle) {
3146 InstrToCycle[&SU] = NewCycle;
3147 auto &OldS = getInstructions(OldCycle);
3148 llvm::erase(OldS, &SU);
3149 getInstructions(NewCycle).emplace_back(&SU);
3150 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3151 << ") is not pipelined; moving from cycle " << OldCycle
3152 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3153 }
3154 NewLastCycle = std::max(NewLastCycle, NewCycle);
3155 }
3156 LastCycle = NewLastCycle;
3157 return true;
3158}
3159
3160// Check if the generated schedule is valid. This function checks if
3161// an instruction that uses a physical register is scheduled in a
3162// different stage than the definition. The pipeliner does not handle
3163// physical register values that may cross a basic block boundary.
3164// Furthermore, if a physical def/use pair is assigned to the same
3165// cycle, orderDependence does not guarantee def/use ordering, so that
3166// case should be considered invalid. (The test checks for both
3167// earlier and same-cycle use to be more robust.)
3169 for (SUnit &SU : SSD->SUnits) {
3170 if (!SU.hasPhysRegDefs)
3171 continue;
3172 int StageDef = stageScheduled(&SU);
3173 int CycleDef = InstrToCycle[&SU];
3174 assert(StageDef != -1 && "Instruction should have been scheduled.");
3175 for (auto &SI : SU.Succs)
3176 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3177 if (Register::isPhysicalRegister(SI.getReg())) {
3178 if (stageScheduled(SI.getSUnit()) != StageDef)
3179 return false;
3180 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3181 return false;
3182 }
3183 }
3184 return true;
3185}
3186
3187/// A property of the node order in swing-modulo-scheduling is
3188/// that for nodes outside circuits the following holds:
3189/// none of them is scheduled after both a successor and a
3190/// predecessor.
3191/// The method below checks whether the property is met.
3192/// If not, debug information is printed and statistics information updated.
3193/// Note that we do not use an assert statement.
3194/// The reason is that although an invalid node oder may prevent
3195/// the pipeliner from finding a pipelined schedule for arbitrary II,
3196/// it does not lead to the generation of incorrect code.
3197void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3198
3199 // a sorted vector that maps each SUnit to its index in the NodeOrder
3200 typedef std::pair<SUnit *, unsigned> UnitIndex;
3201 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3202
3203 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3204 Indices.push_back(std::make_pair(NodeOrder[i], i));
3205
3206 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3207 return std::get<0>(i1) < std::get<0>(i2);
3208 };
3209
3210 // sort, so that we can perform a binary search
3211 llvm::sort(Indices, CompareKey);
3212
3213 bool Valid = true;
3214 (void)Valid;
3215 // for each SUnit in the NodeOrder, check whether
3216 // it appears after both a successor and a predecessor
3217 // of the SUnit. If this is the case, and the SUnit
3218 // is not part of circuit, then the NodeOrder is not
3219 // valid.
3220 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3221 SUnit *SU = NodeOrder[i];
3222 unsigned Index = i;
3223
3224 bool PredBefore = false;
3225 bool SuccBefore = false;
3226
3227 SUnit *Succ;
3228 SUnit *Pred;
3229 (void)Succ;
3230 (void)Pred;
3231
3232 for (SDep &PredEdge : SU->Preds) {
3233 SUnit *PredSU = PredEdge.getSUnit();
3234 unsigned PredIndex = std::get<1>(
3235 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3236 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3237 PredBefore = true;
3238 Pred = PredSU;
3239 break;
3240 }
3241 }
3242
3243 for (SDep &SuccEdge : SU->Succs) {
3244 SUnit *SuccSU = SuccEdge.getSUnit();
3245 // Do not process a boundary node, it was not included in NodeOrder,
3246 // hence not in Indices either, call to std::lower_bound() below will
3247 // return Indices.end().
3248 if (SuccSU->isBoundaryNode())
3249 continue;
3250 unsigned SuccIndex = std::get<1>(
3251 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3252 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3253 SuccBefore = true;
3254 Succ = SuccSU;
3255 break;
3256 }
3257 }
3258
3259 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3260 // instructions in circuits are allowed to be scheduled
3261 // after both a successor and predecessor.
3262 bool InCircuit = llvm::any_of(
3263 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3264 if (InCircuit)
3265 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3266 else {
3267 Valid = false;
3268 NumNodeOrderIssues++;
3269 LLVM_DEBUG(dbgs() << "Predecessor ";);
3270 }
3271 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3272 << " are scheduled before node " << SU->NodeNum
3273 << "\n";);
3274 }
3275 }
3276
3277 LLVM_DEBUG({
3278 if (!Valid)
3279 dbgs() << "Invalid node order found!\n";
3280 });
3281}
3282
3283/// Attempt to fix the degenerate cases when the instruction serialization
3284/// causes the register lifetimes to overlap. For example,
3285/// p' = store_pi(p, b)
3286/// = load p, offset
3287/// In this case p and p' overlap, which means that two registers are needed.
3288/// Instead, this function changes the load to use p' and updates the offset.
3289void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3290 unsigned OverlapReg = 0;
3291 unsigned NewBaseReg = 0;
3292 for (SUnit *SU : Instrs) {
3293 MachineInstr *MI = SU->getInstr();
3294 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3295 const MachineOperand &MO = MI->getOperand(i);
3296 // Look for an instruction that uses p. The instruction occurs in the
3297 // same cycle but occurs later in the serialized order.
3298 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3299 // Check that the instruction appears in the InstrChanges structure,
3300 // which contains instructions that can have the offset updated.
3302 InstrChanges.find(SU);
3303 if (It != InstrChanges.end()) {
3304 unsigned BasePos, OffsetPos;
3305 // Update the base register and adjust the offset.
3306 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3308 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3309 int64_t NewOffset =
3310 MI->getOperand(OffsetPos).getImm() - It->second.second;
3311 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3312 SU->setInstr(NewMI);
3313 MISUnitMap[NewMI] = SU;
3314 NewMIs[MI] = NewMI;
3315 }
3316 }
3317 OverlapReg = 0;
3318 NewBaseReg = 0;
3319 break;
3320 }
3321 // Look for an instruction of the form p' = op(p), which uses and defines
3322 // two virtual registers that get allocated to the same physical register.
3323 unsigned TiedUseIdx = 0;
3324 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3325 // OverlapReg is p in the example above.
3326 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3327 // NewBaseReg is p' in the example above.
3328 NewBaseReg = MI->getOperand(i).getReg();
3329 break;
3330 }
3331 }
3332 }
3333}
3334
3335std::deque<SUnit *>
3337 const std::deque<SUnit *> &Instrs) const {
3338 std::deque<SUnit *> NewOrderPhi;
3339 for (SUnit *SU : Instrs) {
3340 if (SU->getInstr()->isPHI())
3341 NewOrderPhi.push_back(SU);
3342 }
3343 std::deque<SUnit *> NewOrderI;
3344 for (SUnit *SU : Instrs) {
3345 if (!SU->getInstr()->isPHI())
3346 orderDependence(SSD, SU, NewOrderI);
3347 }
3348 llvm::append_range(NewOrderPhi, NewOrderI);
3349 return NewOrderPhi;
3350}
3351
3352/// After the schedule has been formed, call this function to combine
3353/// the instructions from the different stages/cycles. That is, this
3354/// function creates a schedule that represents a single iteration.
3356 // Move all instructions to the first stage from later stages.
3357 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3358 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3359 ++stage) {
3360 std::deque<SUnit *> &cycleInstrs =
3361 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3362 for (SUnit *SU : llvm::reverse(cycleInstrs))
3363 ScheduledInstrs[cycle].push_front(SU);
3364 }
3365 }
3366
3367 // Erase all the elements in the later stages. Only one iteration should
3368 // remain in the scheduled list, and it contains all the instructions.
3369 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3370 ScheduledInstrs.erase(cycle);
3371
3372 // Change the registers in instruction as specified in the InstrChanges
3373 // map. We need to use the new registers to create the correct order.
3374 for (const SUnit &SU : SSD->SUnits)
3375 SSD->applyInstrChange(SU.getInstr(), *this);
3376
3377 // Reorder the instructions in each cycle to fix and improve the
3378 // generated code.
3379 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3380 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3381 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3382 SSD->fixupRegisterOverlaps(cycleInstrs);
3383 }
3384
3385 LLVM_DEBUG(dump(););
3386}
3387
3389 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3390 << " depth " << MaxDepth << " col " << Colocate << "\n";
3391 for (const auto &I : Nodes)
3392 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3393 os << "\n";
3394}
3395
3396#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3397/// Print the schedule information to the given output.
3399 // Iterate over each cycle.
3400 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3401 // Iterate over each instruction in the cycle.
3402 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3403 for (SUnit *CI : cycleInstrs->second) {
3404 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3405 os << "(" << CI->NodeNum << ") ";
3406 CI->getInstr()->print(os);
3407 os << "\n";
3408 }
3409 }
3410}
3411
3412/// Utility function used for debugging to print the schedule.
3415
3416void ResourceManager::dumpMRT() const {
3417 LLVM_DEBUG({
3418 if (UseDFA)
3419 return;
3420 std::stringstream SS;
3421 SS << "MRT:\n";
3422 SS << std::setw(4) << "Slot";
3423 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3424 SS << std::setw(3) << I;
3425 SS << std::setw(7) << "#Mops"
3426 << "\n";
3427 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3428 SS << std::setw(4) << Slot;
3429 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3430 SS << std::setw(3) << MRT[Slot][I];
3431 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3432 }
3433 dbgs() << SS.str();
3434 });
3435}
3436#endif
3437
3439 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3440 unsigned ProcResourceID = 0;
3441
3442 // We currently limit the resource kinds to 64 and below so that we can use
3443 // uint64_t for Masks
3444 assert(SM.getNumProcResourceKinds() < 64 &&
3445 "Too many kinds of resources, unsupported");
3446 // Create a unique bitmask for every processor resource unit.
3447 // Skip resource at index 0, since it always references 'InvalidUnit'.
3448 Masks.resize(SM.getNumProcResourceKinds());
3449 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3451 if (Desc.SubUnitsIdxBegin)
3452 continue;
3453 Masks[I] = 1ULL << ProcResourceID;
3454 ProcResourceID++;
3455 }
3456 // Create a unique bitmask for every processor resource group.
3457 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3459 if (!Desc.SubUnitsIdxBegin)
3460 continue;
3461 Masks[I] = 1ULL << ProcResourceID;
3462 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3463 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3464 ProcResourceID++;
3465 }
3466 LLVM_DEBUG({
3467 if (SwpShowResMask) {
3468 dbgs() << "ProcResourceDesc:\n";
3469 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3470 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3471 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3472 ProcResource->Name, I, Masks[I],
3473 ProcResource->NumUnits);
3474 }
3475 dbgs() << " -----------------\n";
3476 }
3477 });
3478}
3479
3481 LLVM_DEBUG({
3482 if (SwpDebugResource)
3483 dbgs() << "canReserveResources:\n";
3484 });
3485 if (UseDFA)
3486 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3487 ->canReserveResources(&SU.getInstr()->getDesc());
3488
3489 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3490 if (!SCDesc->isValid()) {
3491 LLVM_DEBUG({
3492 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3493 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3494 });
3495 return true;
3496 }
3497
3498 reserveResources(SCDesc, Cycle);
3499 bool Result = !isOverbooked();
3500 unreserveResources(SCDesc, Cycle);
3501
3502 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3503 return Result;
3504}
3505
3506void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3507 LLVM_DEBUG({
3508 if (SwpDebugResource)
3509 dbgs() << "reserveResources:\n";
3510 });
3511 if (UseDFA)
3512 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3513 ->reserveResources(&SU.getInstr()->getDesc());
3514
3515 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3516 if (!SCDesc->isValid()) {
3517 LLVM_DEBUG({
3518 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3519 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3520 });
3521 return;
3522 }
3523
3524 reserveResources(SCDesc, Cycle);
3525
3526 LLVM_DEBUG({
3527 if (SwpDebugResource) {
3528 dumpMRT();
3529 dbgs() << "reserveResources: done!\n\n";
3530 }
3531 });
3532}
3533
3534void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3535 int Cycle) {
3536 assert(!UseDFA);
3537 for (const MCWriteProcResEntry &PRE : make_range(
3538 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3539 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3540 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3541
3542 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3543 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3544}
3545
3546void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3547 int Cycle) {
3548 assert(!UseDFA);
3549 for (const MCWriteProcResEntry &PRE : make_range(
3550 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3551 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3552 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3553
3554 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3555 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3556}
3557
3558bool ResourceManager::isOverbooked() const {
3559 assert(!UseDFA);
3560 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3561 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3563 if (MRT[Slot][I] > Desc->NumUnits)
3564 return true;
3565 }
3566 if (NumScheduledMops[Slot] > IssueWidth)
3567 return true;
3568 }
3569 return false;
3570}
3571
3572int ResourceManager::calculateResMIIDFA() const {
3573 assert(UseDFA);
3574
3575 // Sort the instructions by the number of available choices for scheduling,
3576 // least to most. Use the number of critical resources as the tie breaker.
3577 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3578 for (SUnit &SU : DAG->SUnits)
3579 FUS.calcCriticalResources(*SU.getInstr());
3581 FuncUnitOrder(FUS);
3582
3583 for (SUnit &SU : DAG->SUnits)
3584 FuncUnitOrder.push(SU.getInstr());
3585
3587 Resources.push_back(
3588 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3589
3590 while (!FuncUnitOrder.empty()) {
3591 MachineInstr *MI = FuncUnitOrder.top();
3592 FuncUnitOrder.pop();
3593 if (TII->isZeroCost(MI->getOpcode()))
3594 continue;
3595
3596 // Attempt to reserve the instruction in an existing DFA. At least one
3597 // DFA is needed for each cycle.
3598 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3599 unsigned ReservedCycles = 0;
3600 auto *RI = Resources.begin();
3601 auto *RE = Resources.end();
3602 LLVM_DEBUG({
3603 dbgs() << "Trying to reserve resource for " << NumCycles
3604 << " cycles for \n";
3605 MI->dump();
3606 });
3607 for (unsigned C = 0; C < NumCycles; ++C)
3608 while (RI != RE) {
3609 if ((*RI)->canReserveResources(*MI)) {
3610 (*RI)->reserveResources(*MI);
3611 ++ReservedCycles;
3612 break;
3613 }
3614 RI++;
3615 }
3616 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3617 << ", NumCycles:" << NumCycles << "\n");
3618 // Add new DFAs, if needed, to reserve resources.
3619 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3621 << "NewResource created to reserve resources"
3622 << "\n");
3623 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3624 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3625 NewResource->reserveResources(*MI);
3626 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3627 }
3628 }
3629
3630 int Resmii = Resources.size();
3631 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3632 return Resmii;
3633}
3634
3636 if (UseDFA)
3637 return calculateResMIIDFA();
3638
3639 // Count each resource consumption and divide it by the number of units.
3640 // ResMII is the max value among them.
3641
3642 int NumMops = 0;
3644 for (SUnit &SU : DAG->SUnits) {
3645 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3646 continue;
3647
3648 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3649 if (!SCDesc->isValid())
3650 continue;
3651
3652 LLVM_DEBUG({
3653 if (SwpDebugResource) {
3654 DAG->dumpNode(SU);
3655 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3656 << " WriteProcRes: ";
3657 }
3658 });
3659 NumMops += SCDesc->NumMicroOps;
3660 for (const MCWriteProcResEntry &PRE :
3661 make_range(STI->getWriteProcResBegin(SCDesc),
3662 STI->getWriteProcResEnd(SCDesc))) {
3663 LLVM_DEBUG({
3664 if (SwpDebugResource) {
3665 const MCProcResourceDesc *Desc =
3666 SM.getProcResource(PRE.ProcResourceIdx);
3667 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3668 }
3669 });
3670 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3671 }
3672 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3673 }
3674
3675 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3676 LLVM_DEBUG({
3677 if (SwpDebugResource)
3678 dbgs() << "#Mops: " << NumMops << ", "
3679 << "IssueWidth: " << IssueWidth << ", "
3680 << "Cycles: " << Result << "\n";
3681 });
3682
3683 LLVM_DEBUG({
3684 if (SwpDebugResource) {
3685 std::stringstream SS;
3686 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3687 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3688 << "\n";
3689 dbgs() << SS.str();
3690 }
3691 });
3692 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3694 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3695 LLVM_DEBUG({
3696 if (SwpDebugResource) {
3697 std::stringstream SS;
3698 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3699 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3700 << std::setw(10) << Cycles << "\n";
3701 dbgs() << SS.str();
3702 }
3703 });
3704 if (Cycles > Result)
3705 Result = Cycles;
3706 }
3707 return Result;
3708}
3709
3711 InitiationInterval = II;
3712 DFAResources.clear();
3713 DFAResources.resize(II);
3714 for (auto &I : DFAResources)
3715 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3716 MRT.clear();
3718 NumScheduledMops.clear();
3719 NumScheduledMops.resize(II);
3720}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const LLT S1
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:342
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:357
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:607
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:543
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:326
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:540
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:774
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:895
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:553
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:730
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:266
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:366
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:431
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:273
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:344
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:280
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:560
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1724
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1975
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2053
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2039
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1656
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1950
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
bool isValid() const
Definition: MCSchedule.h:136
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:360
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:349
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:334
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:353
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.