LLVM 19.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
38#include "llvm/ADT/STLExtras.h"
40#include "llvm/ADT/SetVector.h"
42#include "llvm/ADT/SmallSet.h"
44#include "llvm/ADT/Statistic.h"
73#include "llvm/Config/llvm-config.h"
74#include "llvm/IR/Attributes.h"
75#include "llvm/IR/Function.h"
76#include "llvm/MC/LaneBitmask.h"
77#include "llvm/MC/MCInstrDesc.h"
80#include "llvm/Pass.h"
83#include "llvm/Support/Debug.h"
86#include <algorithm>
87#include <cassert>
88#include <climits>
89#include <cstdint>
90#include <deque>
91#include <functional>
92#include <iomanip>
93#include <iterator>
94#include <map>
95#include <memory>
96#include <sstream>
97#include <tuple>
98#include <utility>
99#include <vector>
100
101using namespace llvm;
102
103#define DEBUG_TYPE "pipeliner"
104
105STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
106STATISTIC(NumPipelined, "Number of loops software pipelined");
107STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
108STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
109STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
110STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
111STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
112STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
113STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
114STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
115STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
116
117/// A command line option to turn software pipelining on or off.
118static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
119 cl::desc("Enable Software Pipelining"));
120
121/// A command line option to enable SWP at -Os.
122static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
123 cl::desc("Enable SWP at Os."), cl::Hidden,
124 cl::init(false));
125
126/// A command line argument to limit minimum initial interval for pipelining.
127static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
128 cl::desc("Size limit for the MII."),
129 cl::Hidden, cl::init(27));
130
131/// A command line argument to force pipeliner to use specified initial
132/// interval.
133static cl::opt<int> SwpForceII("pipeliner-force-ii",
134 cl::desc("Force pipeliner to use specified II."),
135 cl::Hidden, cl::init(-1));
136
137/// A command line argument to limit the number of stages in the pipeline.
138static cl::opt<int>
139 SwpMaxStages("pipeliner-max-stages",
140 cl::desc("Maximum stages allowed in the generated scheduled."),
141 cl::Hidden, cl::init(3));
142
143/// A command line option to disable the pruning of chain dependences due to
144/// an unrelated Phi.
145static cl::opt<bool>
146 SwpPruneDeps("pipeliner-prune-deps",
147 cl::desc("Prune dependences between unrelated Phi nodes."),
148 cl::Hidden, cl::init(true));
149
150/// A command line option to disable the pruning of loop carried order
151/// dependences.
152static cl::opt<bool>
153 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
154 cl::desc("Prune loop carried order dependences."),
155 cl::Hidden, cl::init(true));
156
157#ifndef NDEBUG
158static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
159#endif
160
161static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
163 cl::desc("Ignore RecMII"));
164
165static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
166 cl::init(false));
167static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
168 cl::init(false));
169
171 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
172 cl::desc("Instead of emitting the pipelined code, annotate instructions "
173 "with the generated schedule for feeding into the "
174 "-modulo-schedule-test pass"));
175
177 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
178 cl::desc(
179 "Use the experimental peeling code generator for software pipelining"));
180
181static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
182 cl::desc("Range to search for II"),
183 cl::Hidden, cl::init(10));
184
185static cl::opt<bool>
186 LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false),
187 cl::desc("Limit register pressure of scheduled loop"));
188
189static cl::opt<int>
190 RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
191 cl::init(5),
192 cl::desc("Margin representing the unused percentage of "
193 "the register pressure limit"));
194
195namespace llvm {
196
197// A command line option to enable the CopyToPhi DAG mutation.
199 cl::init(true),
200 cl::desc("Enable CopyToPhi DAG Mutation"));
201
202/// A command line argument to force pipeliner to use specified issue
203/// width.
205 "pipeliner-force-issue-width",
206 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
207 cl::init(-1));
208
209} // end namespace llvm
210
211unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
212char MachinePipeliner::ID = 0;
213#ifndef NDEBUG
215#endif
217
219 "Modulo Software Pipelining", false, false)
225 "Modulo Software Pipelining", false, false)
226
227/// The "main" function for implementing Swing Modulo Scheduling.
228bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
229 if (skipFunction(mf.getFunction()))
230 return false;
231
232 if (!EnableSWP)
233 return false;
234
235 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
236 !EnableSWPOptSize.getPosition())
237 return false;
238
239 if (!mf.getSubtarget().enableMachinePipeliner())
240 return false;
241
242 // Cannot pipeline loops without instruction itineraries if we are using
243 // DFA for the pipeliner.
244 if (mf.getSubtarget().useDFAforSMS() &&
245 (!mf.getSubtarget().getInstrItineraryData() ||
246 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
247 return false;
248
249 MF = &mf;
250 MLI = &getAnalysis<MachineLoopInfo>();
251 MDT = &getAnalysis<MachineDominatorTree>();
252 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
253 TII = MF->getSubtarget().getInstrInfo();
254 RegClassInfo.runOnMachineFunction(*MF);
255
256 for (const auto &L : *MLI)
257 scheduleLoop(*L);
258
259 return false;
260}
261
262/// Attempt to perform the SMS algorithm on the specified loop. This function is
263/// the main entry point for the algorithm. The function identifies candidate
264/// loops, calculates the minimum initiation interval, and attempts to schedule
265/// the loop.
266bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
267 bool Changed = false;
268 for (const auto &InnerLoop : L)
269 Changed |= scheduleLoop(*InnerLoop);
270
271#ifndef NDEBUG
272 // Stop trying after reaching the limit (if any).
273 int Limit = SwpLoopLimit;
274 if (Limit >= 0) {
275 if (NumTries >= SwpLoopLimit)
276 return Changed;
277 NumTries++;
278 }
279#endif
280
281 setPragmaPipelineOptions(L);
282 if (!canPipelineLoop(L)) {
283 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
284 ORE->emit([&]() {
285 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
286 L.getStartLoc(), L.getHeader())
287 << "Failed to pipeline loop";
288 });
289
290 LI.LoopPipelinerInfo.reset();
291 return Changed;
292 }
293
294 ++NumTrytoPipeline;
295
296 Changed = swingModuloScheduler(L);
297
298 LI.LoopPipelinerInfo.reset();
299 return Changed;
300}
301
302void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
303 // Reset the pragma for the next loop in iteration.
304 disabledByPragma = false;
305 II_setByPragma = 0;
306
307 MachineBasicBlock *LBLK = L.getTopBlock();
308
309 if (LBLK == nullptr)
310 return;
311
312 const BasicBlock *BBLK = LBLK->getBasicBlock();
313 if (BBLK == nullptr)
314 return;
315
316 const Instruction *TI = BBLK->getTerminator();
317 if (TI == nullptr)
318 return;
319
320 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
321 if (LoopID == nullptr)
322 return;
323
324 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
325 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
326
327 for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
328 MDNode *MD = dyn_cast<MDNode>(MDO);
329
330 if (MD == nullptr)
331 continue;
332
333 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
334
335 if (S == nullptr)
336 continue;
337
338 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
339 assert(MD->getNumOperands() == 2 &&
340 "Pipeline initiation interval hint metadata should have two operands.");
342 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
343 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
344 } else if (S->getString() == "llvm.loop.pipeline.disable") {
345 disabledByPragma = true;
346 }
347 }
348}
349
350/// Return true if the loop can be software pipelined. The algorithm is
351/// restricted to loops with a single basic block. Make sure that the
352/// branch in the loop can be analyzed.
353bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
354 if (L.getNumBlocks() != 1) {
355 ORE->emit([&]() {
356 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
357 L.getStartLoc(), L.getHeader())
358 << "Not a single basic block: "
359 << ore::NV("NumBlocks", L.getNumBlocks());
360 });
361 return false;
362 }
363
364 if (disabledByPragma) {
365 ORE->emit([&]() {
366 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
367 L.getStartLoc(), L.getHeader())
368 << "Disabled by Pragma.";
369 });
370 return false;
371 }
372
373 // Check if the branch can't be understood because we can't do pipelining
374 // if that's the case.
375 LI.TBB = nullptr;
376 LI.FBB = nullptr;
377 LI.BrCond.clear();
378 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
379 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
380 NumFailBranch++;
381 ORE->emit([&]() {
382 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
383 L.getStartLoc(), L.getHeader())
384 << "The branch can't be understood";
385 });
386 return false;
387 }
388
389 LI.LoopInductionVar = nullptr;
390 LI.LoopCompare = nullptr;
392 if (!LI.LoopPipelinerInfo) {
393 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
394 NumFailLoop++;
395 ORE->emit([&]() {
396 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
397 L.getStartLoc(), L.getHeader())
398 << "The loop structure is not supported";
399 });
400 return false;
401 }
402
403 if (!L.getLoopPreheader()) {
404 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
405 NumFailPreheader++;
406 ORE->emit([&]() {
407 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
408 L.getStartLoc(), L.getHeader())
409 << "No loop preheader found";
410 });
411 return false;
412 }
413
414 // Remove any subregisters from inputs to phi nodes.
415 preprocessPhiNodes(*L.getHeader());
416 return true;
417}
418
419void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
421 SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
422
423 for (MachineInstr &PI : B.phis()) {
424 MachineOperand &DefOp = PI.getOperand(0);
425 assert(DefOp.getSubReg() == 0);
426 auto *RC = MRI.getRegClass(DefOp.getReg());
427
428 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
429 MachineOperand &RegOp = PI.getOperand(i);
430 if (RegOp.getSubReg() == 0)
431 continue;
432
433 // If the operand uses a subregister, replace it with a new register
434 // without subregisters, and generate a copy to the new register.
435 Register NewReg = MRI.createVirtualRegister(RC);
436 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
438 const DebugLoc &DL = PredB.findDebugLoc(At);
439 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
440 .addReg(RegOp.getReg(), getRegState(RegOp),
441 RegOp.getSubReg());
442 Slots.insertMachineInstrInMaps(*Copy);
443 RegOp.setReg(NewReg);
444 RegOp.setSubReg(0);
445 }
446 }
447}
448
449/// The SMS algorithm consists of the following main steps:
450/// 1. Computation and analysis of the dependence graph.
451/// 2. Ordering of the nodes (instructions).
452/// 3. Attempt to Schedule the loop.
453bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
454 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
455
456 SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
458
459 MachineBasicBlock *MBB = L.getHeader();
460 // The kernel should not include any terminator instructions. These
461 // will be added back later.
462 SMS.startBlock(MBB);
463
464 // Compute the number of 'real' instructions in the basic block by
465 // ignoring terminators.
466 unsigned size = MBB->size();
468 E = MBB->instr_end();
469 I != E; ++I, --size)
470 ;
471
472 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
473 SMS.schedule();
474 SMS.exitRegion();
475
476 SMS.finishBlock();
477 return SMS.hasNewSchedule();
478}
479
488}
489
490void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
491 if (SwpForceII > 0)
492 MII = SwpForceII;
493 else if (II_setByPragma > 0)
494 MII = II_setByPragma;
495 else
496 MII = std::max(ResMII, RecMII);
497}
498
499void SwingSchedulerDAG::setMAX_II() {
500 if (SwpForceII > 0)
501 MAX_II = SwpForceII;
502 else if (II_setByPragma > 0)
503 MAX_II = II_setByPragma;
504 else
505 MAX_II = MII + SwpIISearchRange;
506}
507
508/// We override the schedule function in ScheduleDAGInstrs to implement the
509/// scheduling part of the Swing Modulo Scheduling algorithm.
511 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
512 buildSchedGraph(AA);
513 addLoopCarriedDependences(AA);
514 updatePhiDependences();
516 changeDependences();
517 postProcessDAG();
518 LLVM_DEBUG(dump());
519
520 NodeSetType NodeSets;
521 findCircuits(NodeSets);
522 NodeSetType Circuits = NodeSets;
523
524 // Calculate the MII.
525 unsigned ResMII = calculateResMII();
526 unsigned RecMII = calculateRecMII(NodeSets);
527
528 fuseRecs(NodeSets);
529
530 // This flag is used for testing and can cause correctness problems.
531 if (SwpIgnoreRecMII)
532 RecMII = 0;
533
534 setMII(ResMII, RecMII);
535 setMAX_II();
536
537 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
538 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
539
540 // Can't schedule a loop without a valid MII.
541 if (MII == 0) {
542 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
543 NumFailZeroMII++;
544 Pass.ORE->emit([&]() {
546 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
547 << "Invalid Minimal Initiation Interval: 0";
548 });
549 return;
550 }
551
552 // Don't pipeline large loops.
553 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
554 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
555 << ", we don't pipeline large loops\n");
556 NumFailLargeMaxMII++;
557 Pass.ORE->emit([&]() {
559 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
560 << "Minimal Initiation Interval too large: "
561 << ore::NV("MII", (int)MII) << " > "
562 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
563 << "Refer to -pipeliner-max-mii.";
564 });
565 return;
566 }
567
568 computeNodeFunctions(NodeSets);
569
570 registerPressureFilter(NodeSets);
571
572 colocateNodeSets(NodeSets);
573
574 checkNodeSets(NodeSets);
575
576 LLVM_DEBUG({
577 for (auto &I : NodeSets) {
578 dbgs() << " Rec NodeSet ";
579 I.dump();
580 }
581 });
582
583 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
584
585 groupRemainingNodes(NodeSets);
586
587 removeDuplicateNodes(NodeSets);
588
589 LLVM_DEBUG({
590 for (auto &I : NodeSets) {
591 dbgs() << " NodeSet ";
592 I.dump();
593 }
594 });
595
596 computeNodeOrder(NodeSets);
597
598 // check for node order issues
599 checkValidNodeOrder(Circuits);
600
601 SMSchedule Schedule(Pass.MF, this);
602 Scheduled = schedulePipeline(Schedule);
603
604 if (!Scheduled){
605 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
606 NumFailNoSchedule++;
607 Pass.ORE->emit([&]() {
609 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
610 << "Unable to find schedule";
611 });
612 return;
613 }
614
615 unsigned numStages = Schedule.getMaxStageCount();
616 // No need to generate pipeline if there are no overlapped iterations.
617 if (numStages == 0) {
618 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
619 NumFailZeroStage++;
620 Pass.ORE->emit([&]() {
622 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
623 << "No need to pipeline - no overlapped iterations in schedule.";
624 });
625 return;
626 }
627 // Check that the maximum stage count is less than user-defined limit.
628 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
629 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
630 << " : too many stages, abort\n");
631 NumFailLargeMaxStage++;
632 Pass.ORE->emit([&]() {
634 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
635 << "Too many stages in schedule: "
636 << ore::NV("numStages", (int)numStages) << " > "
637 << ore::NV("SwpMaxStages", SwpMaxStages)
638 << ". Refer to -pipeliner-max-stages.";
639 });
640 return;
641 }
642
643 Pass.ORE->emit([&]() {
645 Loop.getHeader())
646 << "Pipelined succesfully!";
647 });
648
649 // Generate the schedule as a ModuloSchedule.
650 DenseMap<MachineInstr *, int> Cycles, Stages;
651 std::vector<MachineInstr *> OrderedInsts;
652 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
653 ++Cycle) {
654 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
655 OrderedInsts.push_back(SU->getInstr());
656 Cycles[SU->getInstr()] = Cycle;
657 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
658 }
659 }
661 for (auto &KV : NewMIs) {
662 Cycles[KV.first] = Cycles[KV.second];
663 Stages[KV.first] = Stages[KV.second];
664 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
665 }
666
667 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
668 std::move(Stages));
670 assert(NewInstrChanges.empty() &&
671 "Cannot serialize a schedule with InstrChanges!");
673 MSTI.annotate();
674 return;
675 }
676 // The experimental code generator can't work if there are InstChanges.
677 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
678 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
679 MSE.expand();
680 } else {
681 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
682 MSE.expand();
683 MSE.cleanup();
684 }
685 ++NumPipelined;
686}
687
688/// Clean up after the software pipeliner runs.
690 for (auto &KV : NewMIs)
691 MF.deleteMachineInstr(KV.second);
692 NewMIs.clear();
693
694 // Call the superclass.
696}
697
698/// Return the register values for the operands of a Phi instruction.
699/// This function assume the instruction is a Phi.
701 unsigned &InitVal, unsigned &LoopVal) {
702 assert(Phi.isPHI() && "Expecting a Phi.");
703
704 InitVal = 0;
705 LoopVal = 0;
706 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
707 if (Phi.getOperand(i + 1).getMBB() != Loop)
708 InitVal = Phi.getOperand(i).getReg();
709 else
710 LoopVal = Phi.getOperand(i).getReg();
711
712 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
713}
714
715/// Return the Phi register value that comes the loop block.
716static unsigned getLoopPhiReg(const MachineInstr &Phi,
717 const MachineBasicBlock *LoopBB) {
718 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
719 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
720 return Phi.getOperand(i).getReg();
721 return 0;
722}
723
724/// Return true if SUb can be reached from SUa following the chain edges.
725static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
728 Worklist.push_back(SUa);
729 while (!Worklist.empty()) {
730 const SUnit *SU = Worklist.pop_back_val();
731 for (const auto &SI : SU->Succs) {
732 SUnit *SuccSU = SI.getSUnit();
733 if (SI.getKind() == SDep::Order) {
734 if (Visited.count(SuccSU))
735 continue;
736 if (SuccSU == SUb)
737 return true;
738 Worklist.push_back(SuccSU);
739 Visited.insert(SuccSU);
740 }
741 }
742 }
743 return false;
744}
745
746/// Return true if the instruction causes a chain between memory
747/// references before and after it.
749 return MI.isCall() || MI.mayRaiseFPException() ||
750 MI.hasUnmodeledSideEffects() ||
751 (MI.hasOrderedMemoryRef() &&
752 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
753}
754
755/// Return the underlying objects for the memory references of an instruction.
756/// This function calls the code in ValueTracking, but first checks that the
757/// instruction has a memory operand.
760 if (!MI->hasOneMemOperand())
761 return;
762 MachineMemOperand *MM = *MI->memoperands_begin();
763 if (!MM->getValue())
764 return;
765 getUnderlyingObjects(MM->getValue(), Objs);
766 for (const Value *V : Objs) {
767 if (!isIdentifiedObject(V)) {
768 Objs.clear();
769 return;
770 }
771 }
772}
773
774/// Add a chain edge between a load and store if the store can be an
775/// alias of the load on a subsequent iteration, i.e., a loop carried
776/// dependence. This code is very similar to the code in ScheduleDAGInstrs
777/// but that code doesn't create loop carried dependences.
778void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
782 for (auto &SU : SUnits) {
783 MachineInstr &MI = *SU.getInstr();
785 PendingLoads.clear();
786 else if (MI.mayLoad()) {
789 if (Objs.empty())
791 for (const auto *V : Objs) {
792 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
793 SUs.push_back(&SU);
794 }
795 } else if (MI.mayStore()) {
798 if (Objs.empty())
800 for (const auto *V : Objs) {
802 PendingLoads.find(V);
803 if (I == PendingLoads.end())
804 continue;
805 for (auto *Load : I->second) {
806 if (isSuccOrder(Load, &SU))
807 continue;
808 MachineInstr &LdMI = *Load->getInstr();
809 // First, perform the cheaper check that compares the base register.
810 // If they are the same and the load offset is less than the store
811 // offset, then mark the dependence as loop carried potentially.
812 const MachineOperand *BaseOp1, *BaseOp2;
813 int64_t Offset1, Offset2;
814 bool Offset1IsScalable, Offset2IsScalable;
815 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
816 Offset1IsScalable, TRI) &&
817 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
818 Offset2IsScalable, TRI)) {
819 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
820 Offset1IsScalable == Offset2IsScalable &&
821 (int)Offset1 < (int)Offset2) {
823 "What happened to the chain edge?");
824 SDep Dep(Load, SDep::Barrier);
825 Dep.setLatency(1);
826 SU.addPred(Dep);
827 continue;
828 }
829 }
830 // Second, the more expensive check that uses alias analysis on the
831 // base registers. If they alias, and the load offset is less than
832 // the store offset, the mark the dependence as loop carried.
833 if (!AA) {
834 SDep Dep(Load, SDep::Barrier);
835 Dep.setLatency(1);
836 SU.addPred(Dep);
837 continue;
838 }
839 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
840 MachineMemOperand *MMO2 = *MI.memoperands_begin();
841 if (!MMO1->getValue() || !MMO2->getValue()) {
842 SDep Dep(Load, SDep::Barrier);
843 Dep.setLatency(1);
844 SU.addPred(Dep);
845 continue;
846 }
847 if (MMO1->getValue() == MMO2->getValue() &&
848 MMO1->getOffset() <= MMO2->getOffset()) {
849 SDep Dep(Load, SDep::Barrier);
850 Dep.setLatency(1);
851 SU.addPred(Dep);
852 continue;
853 }
854 if (!AA->isNoAlias(
857 MMO2->getAAInfo()))) {
858 SDep Dep(Load, SDep::Barrier);
859 Dep.setLatency(1);
860 SU.addPred(Dep);
861 }
862 }
863 }
864 }
865 }
866}
867
868/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
869/// processes dependences for PHIs. This function adds true dependences
870/// from a PHI to a use, and a loop carried dependence from the use to the
871/// PHI. The loop carried dependence is represented as an anti dependence
872/// edge. This function also removes chain dependences between unrelated
873/// PHIs.
874void SwingSchedulerDAG::updatePhiDependences() {
875 SmallVector<SDep, 4> RemoveDeps;
877
878 // Iterate over each DAG node.
879 for (SUnit &I : SUnits) {
880 RemoveDeps.clear();
881 // Set to true if the instruction has an operand defined by a Phi.
882 unsigned HasPhiUse = 0;
883 unsigned HasPhiDef = 0;
884 MachineInstr *MI = I.getInstr();
885 // Iterate over each operand, and we process the definitions.
886 for (const MachineOperand &MO : MI->operands()) {
887 if (!MO.isReg())
888 continue;
889 Register Reg = MO.getReg();
890 if (MO.isDef()) {
891 // If the register is used by a Phi, then create an anti dependence.
893 UI = MRI.use_instr_begin(Reg),
894 UE = MRI.use_instr_end();
895 UI != UE; ++UI) {
896 MachineInstr *UseMI = &*UI;
897 SUnit *SU = getSUnit(UseMI);
898 if (SU != nullptr && UseMI->isPHI()) {
899 if (!MI->isPHI()) {
900 SDep Dep(SU, SDep::Anti, Reg);
901 Dep.setLatency(1);
902 I.addPred(Dep);
903 } else {
904 HasPhiDef = Reg;
905 // Add a chain edge to a dependent Phi that isn't an existing
906 // predecessor.
907 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
908 I.addPred(SDep(SU, SDep::Barrier));
909 }
910 }
911 }
912 } else if (MO.isUse()) {
913 // If the register is defined by a Phi, then create a true dependence.
915 if (DefMI == nullptr)
916 continue;
917 SUnit *SU = getSUnit(DefMI);
918 if (SU != nullptr && DefMI->isPHI()) {
919 if (!MI->isPHI()) {
920 SDep Dep(SU, SDep::Data, Reg);
921 Dep.setLatency(0);
922 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
923 I.addPred(Dep);
924 } else {
925 HasPhiUse = Reg;
926 // Add a chain edge to a dependent Phi that isn't an existing
927 // predecessor.
928 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
929 I.addPred(SDep(SU, SDep::Barrier));
930 }
931 }
932 }
933 }
934 // Remove order dependences from an unrelated Phi.
935 if (!SwpPruneDeps)
936 continue;
937 for (auto &PI : I.Preds) {
938 MachineInstr *PMI = PI.getSUnit()->getInstr();
939 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
940 if (I.getInstr()->isPHI()) {
941 if (PMI->getOperand(0).getReg() == HasPhiUse)
942 continue;
943 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
944 continue;
945 }
946 RemoveDeps.push_back(PI);
947 }
948 }
949 for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
950 I.removePred(RemoveDeps[i]);
951 }
952}
953
954/// Iterate over each DAG node and see if we can change any dependences
955/// in order to reduce the recurrence MII.
956void SwingSchedulerDAG::changeDependences() {
957 // See if an instruction can use a value from the previous iteration.
958 // If so, we update the base and offset of the instruction and change
959 // the dependences.
960 for (SUnit &I : SUnits) {
961 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
962 int64_t NewOffset = 0;
963 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
964 NewOffset))
965 continue;
966
967 // Get the MI and SUnit for the instruction that defines the original base.
968 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
970 if (!DefMI)
971 continue;
972 SUnit *DefSU = getSUnit(DefMI);
973 if (!DefSU)
974 continue;
975 // Get the MI and SUnit for the instruction that defins the new base.
976 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
977 if (!LastMI)
978 continue;
979 SUnit *LastSU = getSUnit(LastMI);
980 if (!LastSU)
981 continue;
982
983 if (Topo.IsReachable(&I, LastSU))
984 continue;
985
986 // Remove the dependence. The value now depends on a prior iteration.
988 for (const SDep &P : I.Preds)
989 if (P.getSUnit() == DefSU)
990 Deps.push_back(P);
991 for (int i = 0, e = Deps.size(); i != e; i++) {
992 Topo.RemovePred(&I, Deps[i].getSUnit());
993 I.removePred(Deps[i]);
994 }
995 // Remove the chain dependence between the instructions.
996 Deps.clear();
997 for (auto &P : LastSU->Preds)
998 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
999 Deps.push_back(P);
1000 for (int i = 0, e = Deps.size(); i != e; i++) {
1001 Topo.RemovePred(LastSU, Deps[i].getSUnit());
1002 LastSU->removePred(Deps[i]);
1003 }
1004
1005 // Add a dependence between the new instruction and the instruction
1006 // that defines the new base.
1007 SDep Dep(&I, SDep::Anti, NewBase);
1008 Topo.AddPred(LastSU, &I);
1009 LastSU->addPred(Dep);
1010
1011 // Remember the base and offset information so that we can update the
1012 // instruction during code generation.
1013 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1014 }
1015}
1016
1017/// Create an instruction stream that represents a single iteration and stage of
1018/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1019/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1020/// function is an approximation of SMSchedule::finalizeSchedule with all
1021/// non-const operations removed.
1023 SMSchedule &Schedule,
1024 std::vector<MachineInstr *> &OrderedInsts,
1027
1028 // Move all instructions to the first stage from the later stages.
1029 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1030 ++Cycle) {
1031 for (int Stage = 0, LastStage = Schedule.getMaxStageCount();
1032 Stage <= LastStage; ++Stage) {
1033 for (SUnit *SU : llvm::reverse(Schedule.getInstructions(
1034 Cycle + Stage * Schedule.getInitiationInterval()))) {
1035 Instrs[Cycle].push_front(SU);
1036 }
1037 }
1038 }
1039
1040 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1041 ++Cycle) {
1042 std::deque<SUnit *> &CycleInstrs = Instrs[Cycle];
1043 CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs);
1044 for (SUnit *SU : CycleInstrs) {
1045 MachineInstr *MI = SU->getInstr();
1046 OrderedInsts.push_back(MI);
1047 Stages[MI] = Schedule.stageScheduled(SU);
1048 }
1049 }
1050}
1051
1052namespace {
1053
1054// FuncUnitSorter - Comparison operator used to sort instructions by
1055// the number of functional unit choices.
1056struct FuncUnitSorter {
1057 const InstrItineraryData *InstrItins;
1058 const MCSubtargetInfo *STI;
1060
1061 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1062 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1063
1064 // Compute the number of functional unit alternatives needed
1065 // at each stage, and take the minimum value. We prioritize the
1066 // instructions by the least number of choices first.
1067 unsigned minFuncUnits(const MachineInstr *Inst,
1068 InstrStage::FuncUnits &F) const {
1069 unsigned SchedClass = Inst->getDesc().getSchedClass();
1070 unsigned min = UINT_MAX;
1071 if (InstrItins && !InstrItins->isEmpty()) {
1072 for (const InstrStage &IS :
1073 make_range(InstrItins->beginStage(SchedClass),
1074 InstrItins->endStage(SchedClass))) {
1075 InstrStage::FuncUnits funcUnits = IS.getUnits();
1076 unsigned numAlternatives = llvm::popcount(funcUnits);
1077 if (numAlternatives < min) {
1078 min = numAlternatives;
1079 F = funcUnits;
1080 }
1081 }
1082 return min;
1083 }
1084 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1085 const MCSchedClassDesc *SCDesc =
1086 STI->getSchedModel().getSchedClassDesc(SchedClass);
1087 if (!SCDesc->isValid())
1088 // No valid Schedule Class Desc for schedClass, should be
1089 // Pseudo/PostRAPseudo
1090 return min;
1091
1092 for (const MCWriteProcResEntry &PRE :
1093 make_range(STI->getWriteProcResBegin(SCDesc),
1094 STI->getWriteProcResEnd(SCDesc))) {
1095 if (!PRE.ReleaseAtCycle)
1096 continue;
1097 const MCProcResourceDesc *ProcResource =
1098 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1099 unsigned NumUnits = ProcResource->NumUnits;
1100 if (NumUnits < min) {
1101 min = NumUnits;
1102 F = PRE.ProcResourceIdx;
1103 }
1104 }
1105 return min;
1106 }
1107 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1108 }
1109
1110 // Compute the critical resources needed by the instruction. This
1111 // function records the functional units needed by instructions that
1112 // must use only one functional unit. We use this as a tie breaker
1113 // for computing the resource MII. The instrutions that require
1114 // the same, highly used, functional unit have high priority.
1115 void calcCriticalResources(MachineInstr &MI) {
1116 unsigned SchedClass = MI.getDesc().getSchedClass();
1117 if (InstrItins && !InstrItins->isEmpty()) {
1118 for (const InstrStage &IS :
1119 make_range(InstrItins->beginStage(SchedClass),
1120 InstrItins->endStage(SchedClass))) {
1121 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1122 if (llvm::popcount(FuncUnits) == 1)
1123 Resources[FuncUnits]++;
1124 }
1125 return;
1126 }
1127 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1128 const MCSchedClassDesc *SCDesc =
1129 STI->getSchedModel().getSchedClassDesc(SchedClass);
1130 if (!SCDesc->isValid())
1131 // No valid Schedule Class Desc for schedClass, should be
1132 // Pseudo/PostRAPseudo
1133 return;
1134
1135 for (const MCWriteProcResEntry &PRE :
1136 make_range(STI->getWriteProcResBegin(SCDesc),
1137 STI->getWriteProcResEnd(SCDesc))) {
1138 if (!PRE.ReleaseAtCycle)
1139 continue;
1140 Resources[PRE.ProcResourceIdx]++;
1141 }
1142 return;
1143 }
1144 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1145 }
1146
1147 /// Return true if IS1 has less priority than IS2.
1148 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1149 InstrStage::FuncUnits F1 = 0, F2 = 0;
1150 unsigned MFUs1 = minFuncUnits(IS1, F1);
1151 unsigned MFUs2 = minFuncUnits(IS2, F2);
1152 if (MFUs1 == MFUs2)
1153 return Resources.lookup(F1) < Resources.lookup(F2);
1154 return MFUs1 > MFUs2;
1155 }
1156};
1157
1158/// Calculate the maximum register pressure of the scheduled instructions stream
1159class HighRegisterPressureDetector {
1160 MachineBasicBlock *OrigMBB;
1161 const MachineFunction &MF;
1162 const MachineRegisterInfo &MRI;
1163 const TargetRegisterInfo *TRI;
1164
1165 const unsigned PSetNum;
1166
1167 // Indexed by PSet ID
1168 // InitSetPressure takes into account the register pressure of live-in
1169 // registers. It's not depend on how the loop is scheduled, so it's enough to
1170 // calculate them once at the beginning.
1171 std::vector<unsigned> InitSetPressure;
1172
1173 // Indexed by PSet ID
1174 // Upper limit for each register pressure set
1175 std::vector<unsigned> PressureSetLimit;
1176
1178
1180
1181public:
1182 using OrderedInstsTy = std::vector<MachineInstr *>;
1183 using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
1184
1185private:
1186 static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1187 if (Pressures.size() == 0) {
1188 dbgs() << "[]";
1189 } else {
1190 char Prefix = '[';
1191 for (unsigned P : Pressures) {
1192 dbgs() << Prefix << P;
1193 Prefix = ' ';
1194 }
1195 dbgs() << ']';
1196 }
1197 }
1198
1199 void dumpPSet(Register Reg) const {
1200 dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
1201 for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
1202 ++PSetIter) {
1203 dbgs() << *PSetIter << ' ';
1204 }
1205 dbgs() << '\n';
1206 }
1207
1208 void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1209 Register Reg) const {
1210 auto PSetIter = MRI.getPressureSets(Reg);
1211 unsigned Weight = PSetIter.getWeight();
1212 for (; PSetIter.isValid(); ++PSetIter)
1213 Pressure[*PSetIter] += Weight;
1214 }
1215
1216 void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1217 Register Reg) const {
1218 auto PSetIter = MRI.getPressureSets(Reg);
1219 unsigned Weight = PSetIter.getWeight();
1220 for (; PSetIter.isValid(); ++PSetIter) {
1221 auto &P = Pressure[*PSetIter];
1222 assert(P >= Weight &&
1223 "register pressure must be greater than or equal weight");
1224 P -= Weight;
1225 }
1226 }
1227
1228 // Return true if Reg is fixed one, for example, stack pointer
1229 bool isFixedRegister(Register Reg) const {
1230 return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
1231 }
1232
1233 bool isDefinedInThisLoop(Register Reg) const {
1234 return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1235 }
1236
1237 // Search for live-in variables. They are factored into the register pressure
1238 // from the begining. Live-in variables used by every iteration should be
1239 // considered as alive throughout the loop. For example, the variable `c` in
1240 // following code. \code
1241 // int c = ...;
1242 // for (int i = 0; i < n; i++)
1243 // a[i] += b[i] + c;
1244 // \endcode
1245 void computeLiveIn() {
1247 for (auto &MI : *OrigMBB) {
1248 if (MI.isDebugInstr())
1249 continue;
1250 for (auto Use : ROMap[&MI].Uses) {
1251 auto Reg = Use.RegUnit;
1252 // Ignore the variable that appears only on one side of phi instruction
1253 // because it's used only at the first iteration.
1254 if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
1255 continue;
1256 if (isFixedRegister(Reg))
1257 continue;
1258 if (isDefinedInThisLoop(Reg))
1259 continue;
1260 Used.insert(Reg);
1261 }
1262 }
1263
1264 for (auto LiveIn : Used)
1265 increaseRegisterPressure(InitSetPressure, LiveIn);
1266 }
1267
1268 // Calculate the upper limit of each pressure set
1269 void computePressureSetLimit(const RegisterClassInfo &RCI) {
1270 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1271 PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
1272
1273 // We assume fixed registers, such as stack pointer, are already in use.
1274 // Therefore subtracting the weight of the fixed registers from the limit of
1275 // each pressure set in advance.
1277 for (const TargetRegisterClass *TRC : TRI->regclasses()) {
1278 for (const MCPhysReg Reg : *TRC)
1279 if (isFixedRegister(Reg))
1280 FixedRegs.insert(Reg);
1281 }
1282
1283 LLVM_DEBUG({
1284 for (auto Reg : FixedRegs) {
1285 dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
1286 const int *Sets = TRI->getRegUnitPressureSets(Reg);
1287 for (; *Sets != -1; Sets++) {
1288 dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
1289 }
1290 dbgs() << "]\n";
1291 }
1292 });
1293
1294 for (auto Reg : FixedRegs) {
1295 LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
1296 << "\n");
1297 auto PSetIter = MRI.getPressureSets(Reg);
1298 unsigned Weight = PSetIter.getWeight();
1299 for (; PSetIter.isValid(); ++PSetIter) {
1300 unsigned &Limit = PressureSetLimit[*PSetIter];
1301 assert(Limit >= Weight &&
1302 "register pressure limit must be greater than or equal weight");
1303 Limit -= Weight;
1304 LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
1305 << " (decreased by " << Weight << ")\n");
1306 }
1307 }
1308 }
1309
1310 // There are two patterns of last-use.
1311 // - by an instruction of the current iteration
1312 // - by a phi instruction of the next iteration (loop carried value)
1313 //
1314 // Furthermore, following two groups of instructions are executed
1315 // simultaneously
1316 // - next iteration's phi instructions in i-th stage
1317 // - current iteration's instructions in i+1-th stage
1318 //
1319 // This function calculates the last-use of each register while taking into
1320 // account the above two patterns.
1321 Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1322 Instr2StageTy &Stages) const {
1323 // We treat virtual registers that are defined and used in this loop.
1324 // Following virtual register will be ignored
1325 // - live-in one
1326 // - defined but not used in the loop (potentially live-out)
1327 DenseSet<Register> TargetRegs;
1328 const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1329 if (isDefinedInThisLoop(Reg))
1330 TargetRegs.insert(Reg);
1331 };
1332 for (MachineInstr *MI : OrderedInsts) {
1333 if (MI->isPHI()) {
1334 Register Reg = getLoopPhiReg(*MI, OrigMBB);
1335 UpdateTargetRegs(Reg);
1336 } else {
1337 for (auto &Use : ROMap.find(MI)->getSecond().Uses)
1338 UpdateTargetRegs(Use.RegUnit);
1339 }
1340 }
1341
1342 const auto InstrScore = [&Stages](MachineInstr *MI) {
1343 return Stages[MI] + MI->isPHI();
1344 };
1345
1347 for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
1348 for (auto Use : ROMap.find(MI)->getSecond().Uses) {
1349 auto Reg = Use.RegUnit;
1350 if (!TargetRegs.contains(Reg))
1351 continue;
1352 auto Ite = LastUseMI.find(Reg);
1353 if (Ite == LastUseMI.end()) {
1354 LastUseMI[Reg] = MI;
1355 } else {
1356 MachineInstr *Orig = Ite->second;
1357 MachineInstr *New = MI;
1358 if (InstrScore(Orig) < InstrScore(New))
1359 LastUseMI[Reg] = New;
1360 }
1361 }
1362 }
1363
1364 Instr2LastUsesTy LastUses;
1365 for (auto &Entry : LastUseMI)
1366 LastUses[Entry.second].insert(Entry.first);
1367 return LastUses;
1368 }
1369
1370 // Compute the maximum register pressure of the kernel. We'll simulate #Stage
1371 // iterations and check the register pressure at the point where all stages
1372 // overlapping.
1373 //
1374 // An example of unrolled loop where #Stage is 4..
1375 // Iter i+0 i+1 i+2 i+3
1376 // ------------------------
1377 // Stage 0
1378 // Stage 1 0
1379 // Stage 2 1 0
1380 // Stage 3 2 1 0 <- All stages overlap
1381 //
1382 std::vector<unsigned>
1383 computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1384 Instr2StageTy &Stages,
1385 const unsigned StageCount) const {
1386 using RegSetTy = SmallDenseSet<Register, 16>;
1387
1388 // Indexed by #Iter. To treat "local" variables of each stage separately, we
1389 // manage the liveness of the registers independently by iterations.
1390 SmallVector<RegSetTy> LiveRegSets(StageCount);
1391
1392 auto CurSetPressure = InitSetPressure;
1393 auto MaxSetPressure = InitSetPressure;
1394 auto LastUses = computeLastUses(OrderedInsts, Stages);
1395
1396 LLVM_DEBUG({
1397 dbgs() << "Ordered instructions:\n";
1398 for (MachineInstr *MI : OrderedInsts) {
1399 dbgs() << "Stage " << Stages[MI] << ": ";
1400 MI->dump();
1401 }
1402 });
1403
1404 const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1405 Register Reg) {
1406 if (!Reg.isValid() || isFixedRegister(Reg))
1407 return;
1408
1409 bool Inserted = RegSet.insert(Reg).second;
1410 if (!Inserted)
1411 return;
1412
1413 LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
1414 increaseRegisterPressure(CurSetPressure, Reg);
1415 LLVM_DEBUG(dumpPSet(Reg));
1416 };
1417
1418 const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1419 Register Reg) {
1420 if (!Reg.isValid() || isFixedRegister(Reg))
1421 return;
1422
1423 // live-in register
1424 if (!RegSet.contains(Reg))
1425 return;
1426
1427 LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
1428 RegSet.erase(Reg);
1429 decreaseRegisterPressure(CurSetPressure, Reg);
1430 LLVM_DEBUG(dumpPSet(Reg));
1431 };
1432
1433 for (unsigned I = 0; I < StageCount; I++) {
1434 for (MachineInstr *MI : OrderedInsts) {
1435 const auto Stage = Stages[MI];
1436 if (I < Stage)
1437 continue;
1438
1439 const unsigned Iter = I - Stage;
1440
1441 for (auto &Def : ROMap.find(MI)->getSecond().Defs)
1442 InsertReg(LiveRegSets[Iter], Def.RegUnit);
1443
1444 for (auto LastUse : LastUses[MI]) {
1445 if (MI->isPHI()) {
1446 if (Iter != 0)
1447 EraseReg(LiveRegSets[Iter - 1], LastUse);
1448 } else {
1449 EraseReg(LiveRegSets[Iter], LastUse);
1450 }
1451 }
1452
1453 for (unsigned PSet = 0; PSet < PSetNum; PSet++)
1454 MaxSetPressure[PSet] =
1455 std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
1456
1457 LLVM_DEBUG({
1458 dbgs() << "CurSetPressure=";
1459 dumpRegisterPressures(CurSetPressure);
1460 dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1461 MI->dump();
1462 });
1463 }
1464 }
1465
1466 return MaxSetPressure;
1467 }
1468
1469public:
1470 HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1471 const MachineFunction &MF)
1472 : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
1473 TRI(MF.getSubtarget().getRegisterInfo()),
1474 PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
1475 PressureSetLimit(PSetNum, 0) {}
1476
1477 // Used to calculate register pressure, which is independent of loop
1478 // scheduling.
1479 void init(const RegisterClassInfo &RCI) {
1480 for (MachineInstr &MI : *OrigMBB) {
1481 if (MI.isDebugInstr())
1482 continue;
1483 ROMap[&MI].collect(MI, *TRI, MRI, false, true);
1484 }
1485
1486 computeLiveIn();
1487 computePressureSetLimit(RCI);
1488 }
1489
1490 // Calculate the maximum register pressures of the loop and check if they
1491 // exceed the limit
1492 bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1493 const unsigned MaxStage) const {
1495 "the percentage of the margin must be between 0 to 100");
1496
1497 OrderedInstsTy OrderedInsts;
1498 Instr2StageTy Stages;
1499 computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1500 const auto MaxSetPressure =
1501 computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1);
1502
1503 LLVM_DEBUG({
1504 dbgs() << "Dump MaxSetPressure:\n";
1505 for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
1506 dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1507 }
1508 dbgs() << '\n';
1509 });
1510
1511 for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
1512 unsigned Limit = PressureSetLimit[PSet];
1513 unsigned Margin = Limit * RegPressureMargin / 100;
1514 LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1515 << " Margin=" << Margin << "\n");
1516 if (Limit < MaxSetPressure[PSet] + Margin) {
1517 LLVM_DEBUG(
1518 dbgs()
1519 << "Rejected the schedule because of too high register pressure\n");
1520 return true;
1521 }
1522 }
1523 return false;
1524 }
1525};
1526
1527} // end anonymous namespace
1528
1529/// Calculate the resource constrained minimum initiation interval for the
1530/// specified loop. We use the DFA to model the resources needed for
1531/// each instruction, and we ignore dependences. A different DFA is created
1532/// for each cycle that is required. When adding a new instruction, we attempt
1533/// to add it to each existing DFA, until a legal space is found. If the
1534/// instruction cannot be reserved in an existing DFA, we create a new one.
1535unsigned SwingSchedulerDAG::calculateResMII() {
1536 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1538 return RM.calculateResMII();
1539}
1540
1541/// Calculate the recurrence-constrainted minimum initiation interval.
1542/// Iterate over each circuit. Compute the delay(c) and distance(c)
1543/// for each circuit. The II needs to satisfy the inequality
1544/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1545/// II that satisfies the inequality, and the RecMII is the maximum
1546/// of those values.
1547unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1548 unsigned RecMII = 0;
1549
1550 for (NodeSet &Nodes : NodeSets) {
1551 if (Nodes.empty())
1552 continue;
1553
1554 unsigned Delay = Nodes.getLatency();
1555 unsigned Distance = 1;
1556
1557 // ii = ceil(delay / distance)
1558 unsigned CurMII = (Delay + Distance - 1) / Distance;
1559 Nodes.setRecMII(CurMII);
1560 if (CurMII > RecMII)
1561 RecMII = CurMII;
1562 }
1563
1564 return RecMII;
1565}
1566
1567/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1568/// but we do this to find the circuits, and then change them back.
1569static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1571 for (SUnit &SU : SUnits) {
1572 for (SDep &Pred : SU.Preds)
1573 if (Pred.getKind() == SDep::Anti)
1574 DepsAdded.push_back(std::make_pair(&SU, Pred));
1575 }
1576 for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1577 // Remove this anti dependency and add one in the reverse direction.
1578 SUnit *SU = P.first;
1579 SDep &D = P.second;
1580 SUnit *TargetSU = D.getSUnit();
1581 unsigned Reg = D.getReg();
1582 unsigned Lat = D.getLatency();
1583 SU->removePred(D);
1584 SDep Dep(SU, SDep::Anti, Reg);
1585 Dep.setLatency(Lat);
1586 TargetSU->addPred(Dep);
1587 }
1588}
1589
1590/// Create the adjacency structure of the nodes in the graph.
1591void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1592 SwingSchedulerDAG *DAG) {
1593 BitVector Added(SUnits.size());
1594 DenseMap<int, int> OutputDeps;
1595 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1596 Added.reset();
1597 // Add any successor to the adjacency matrix and exclude duplicates.
1598 for (auto &SI : SUnits[i].Succs) {
1599 // Only create a back-edge on the first and last nodes of a dependence
1600 // chain. This records any chains and adds them later.
1601 if (SI.getKind() == SDep::Output) {
1602 int N = SI.getSUnit()->NodeNum;
1603 int BackEdge = i;
1604 auto Dep = OutputDeps.find(BackEdge);
1605 if (Dep != OutputDeps.end()) {
1606 BackEdge = Dep->second;
1607 OutputDeps.erase(Dep);
1608 }
1609 OutputDeps[N] = BackEdge;
1610 }
1611 // Do not process a boundary node, an artificial node.
1612 // A back-edge is processed only if it goes to a Phi.
1613 if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1614 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1615 continue;
1616 int N = SI.getSUnit()->NodeNum;
1617 if (!Added.test(N)) {
1618 AdjK[i].push_back(N);
1619 Added.set(N);
1620 }
1621 }
1622 // A chain edge between a store and a load is treated as a back-edge in the
1623 // adjacency matrix.
1624 for (auto &PI : SUnits[i].Preds) {
1625 if (!SUnits[i].getInstr()->mayStore() ||
1626 !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1627 continue;
1628 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1629 int N = PI.getSUnit()->NodeNum;
1630 if (!Added.test(N)) {
1631 AdjK[i].push_back(N);
1632 Added.set(N);
1633 }
1634 }
1635 }
1636 }
1637 // Add back-edges in the adjacency matrix for the output dependences.
1638 for (auto &OD : OutputDeps)
1639 if (!Added.test(OD.second)) {
1640 AdjK[OD.first].push_back(OD.second);
1641 Added.set(OD.second);
1642 }
1643}
1644
1645/// Identify an elementary circuit in the dependence graph starting at the
1646/// specified node.
1647bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1648 bool HasBackedge) {
1649 SUnit *SV = &SUnits[V];
1650 bool F = false;
1651 Stack.insert(SV);
1652 Blocked.set(V);
1653
1654 for (auto W : AdjK[V]) {
1655 if (NumPaths > MaxPaths)
1656 break;
1657 if (W < S)
1658 continue;
1659 if (W == S) {
1660 if (!HasBackedge)
1661 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1662 F = true;
1663 ++NumPaths;
1664 break;
1665 } else if (!Blocked.test(W)) {
1666 if (circuit(W, S, NodeSets,
1667 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1668 F = true;
1669 }
1670 }
1671
1672 if (F)
1673 unblock(V);
1674 else {
1675 for (auto W : AdjK[V]) {
1676 if (W < S)
1677 continue;
1678 B[W].insert(SV);
1679 }
1680 }
1681 Stack.pop_back();
1682 return F;
1683}
1684
1685/// Unblock a node in the circuit finding algorithm.
1686void SwingSchedulerDAG::Circuits::unblock(int U) {
1687 Blocked.reset(U);
1689 while (!BU.empty()) {
1691 assert(SI != BU.end() && "Invalid B set.");
1692 SUnit *W = *SI;
1693 BU.erase(W);
1694 if (Blocked.test(W->NodeNum))
1695 unblock(W->NodeNum);
1696 }
1697}
1698
1699/// Identify all the elementary circuits in the dependence graph using
1700/// Johnson's circuit algorithm.
1701void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1702 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1703 // but we do this to find the circuits, and then change them back.
1705
1706 Circuits Cir(SUnits, Topo);
1707 // Create the adjacency structure.
1708 Cir.createAdjacencyStructure(this);
1709 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1710 Cir.reset();
1711 Cir.circuit(i, i, NodeSets);
1712 }
1713
1714 // Change the dependences back so that we've created a DAG again.
1716}
1717
1718// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1719// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1720// additional copies that are needed across iterations. An artificial dependence
1721// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1722
1723// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1724// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1725// PHI-------True-Dep------> USEOfPhi
1726
1727// The mutation creates
1728// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1729
1730// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1731// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1732// late to avoid additional copies across iterations. The possible scheduling
1733// order would be
1734// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1735
1736void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1737 for (SUnit &SU : DAG->SUnits) {
1738 // Find the COPY/REG_SEQUENCE instruction.
1739 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1740 continue;
1741
1742 // Record the loop carried PHIs.
1744 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1746
1747 for (auto &Dep : SU.Preds) {
1748 SUnit *TmpSU = Dep.getSUnit();
1749 MachineInstr *TmpMI = TmpSU->getInstr();
1750 SDep::Kind DepKind = Dep.getKind();
1751 // Save the loop carried PHI.
1752 if (DepKind == SDep::Anti && TmpMI->isPHI())
1753 PHISUs.push_back(TmpSU);
1754 // Save the source of COPY/REG_SEQUENCE.
1755 // If the source has no pre-decessors, we will end up creating cycles.
1756 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1757 SrcSUs.push_back(TmpSU);
1758 }
1759
1760 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1761 continue;
1762
1763 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1764 // SUnit to the container.
1766 // Do not use iterator based loop here as we are updating the container.
1767 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1768 for (auto &Dep : PHISUs[Index]->Succs) {
1769 if (Dep.getKind() != SDep::Data)
1770 continue;
1771
1772 SUnit *TmpSU = Dep.getSUnit();
1773 MachineInstr *TmpMI = TmpSU->getInstr();
1774 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1775 PHISUs.push_back(TmpSU);
1776 continue;
1777 }
1778 UseSUs.push_back(TmpSU);
1779 }
1780 }
1781
1782 if (UseSUs.size() == 0)
1783 continue;
1784
1785 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1786 // Add the artificial dependencies if it does not form a cycle.
1787 for (auto *I : UseSUs) {
1788 for (auto *Src : SrcSUs) {
1789 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1790 Src->addPred(SDep(I, SDep::Artificial));
1791 SDAG->Topo.AddPred(Src, I);
1792 }
1793 }
1794 }
1795 }
1796}
1797
1798/// Return true for DAG nodes that we ignore when computing the cost functions.
1799/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1800/// in the calculation of the ASAP, ALAP, etc functions.
1801static bool ignoreDependence(const SDep &D, bool isPred) {
1802 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1803 return true;
1804 return D.getKind() == SDep::Anti && isPred;
1805}
1806
1807/// Compute several functions need to order the nodes for scheduling.
1808/// ASAP - Earliest time to schedule a node.
1809/// ALAP - Latest time to schedule a node.
1810/// MOV - Mobility function, difference between ALAP and ASAP.
1811/// D - Depth of each node.
1812/// H - Height of each node.
1813void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1814 ScheduleInfo.resize(SUnits.size());
1815
1816 LLVM_DEBUG({
1817 for (int I : Topo) {
1818 const SUnit &SU = SUnits[I];
1819 dumpNode(SU);
1820 }
1821 });
1822
1823 int maxASAP = 0;
1824 // Compute ASAP and ZeroLatencyDepth.
1825 for (int I : Topo) {
1826 int asap = 0;
1827 int zeroLatencyDepth = 0;
1828 SUnit *SU = &SUnits[I];
1829 for (const SDep &P : SU->Preds) {
1830 SUnit *pred = P.getSUnit();
1831 if (P.getLatency() == 0)
1832 zeroLatencyDepth =
1833 std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1834 if (ignoreDependence(P, true))
1835 continue;
1836 asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1837 getDistance(pred, SU, P) * MII));
1838 }
1839 maxASAP = std::max(maxASAP, asap);
1840 ScheduleInfo[I].ASAP = asap;
1841 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1842 }
1843
1844 // Compute ALAP, ZeroLatencyHeight, and MOV.
1845 for (int I : llvm::reverse(Topo)) {
1846 int alap = maxASAP;
1847 int zeroLatencyHeight = 0;
1848 SUnit *SU = &SUnits[I];
1849 for (const SDep &S : SU->Succs) {
1850 SUnit *succ = S.getSUnit();
1851 if (succ->isBoundaryNode())
1852 continue;
1853 if (S.getLatency() == 0)
1854 zeroLatencyHeight =
1855 std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1856 if (ignoreDependence(S, true))
1857 continue;
1858 alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1859 getDistance(SU, succ, S) * MII));
1860 }
1861
1862 ScheduleInfo[I].ALAP = alap;
1863 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1864 }
1865
1866 // After computing the node functions, compute the summary for each node set.
1867 for (NodeSet &I : NodeSets)
1868 I.computeNodeSetInfo(this);
1869
1870 LLVM_DEBUG({
1871 for (unsigned i = 0; i < SUnits.size(); i++) {
1872 dbgs() << "\tNode " << i << ":\n";
1873 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1874 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1875 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1876 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1877 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1878 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1879 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1880 }
1881 });
1882}
1883
1884/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1885/// as the predecessors of the elements of NodeOrder that are not also in
1886/// NodeOrder.
1889 const NodeSet *S = nullptr) {
1890 Preds.clear();
1891 for (const SUnit *SU : NodeOrder) {
1892 for (const SDep &Pred : SU->Preds) {
1893 if (S && S->count(Pred.getSUnit()) == 0)
1894 continue;
1895 if (ignoreDependence(Pred, true))
1896 continue;
1897 if (NodeOrder.count(Pred.getSUnit()) == 0)
1898 Preds.insert(Pred.getSUnit());
1899 }
1900 // Back-edges are predecessors with an anti-dependence.
1901 for (const SDep &Succ : SU->Succs) {
1902 if (Succ.getKind() != SDep::Anti)
1903 continue;
1904 if (S && S->count(Succ.getSUnit()) == 0)
1905 continue;
1906 if (NodeOrder.count(Succ.getSUnit()) == 0)
1907 Preds.insert(Succ.getSUnit());
1908 }
1909 }
1910 return !Preds.empty();
1911}
1912
1913/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1914/// as the successors of the elements of NodeOrder that are not also in
1915/// NodeOrder.
1918 const NodeSet *S = nullptr) {
1919 Succs.clear();
1920 for (const SUnit *SU : NodeOrder) {
1921 for (const SDep &Succ : SU->Succs) {
1922 if (S && S->count(Succ.getSUnit()) == 0)
1923 continue;
1924 if (ignoreDependence(Succ, false))
1925 continue;
1926 if (NodeOrder.count(Succ.getSUnit()) == 0)
1927 Succs.insert(Succ.getSUnit());
1928 }
1929 for (const SDep &Pred : SU->Preds) {
1930 if (Pred.getKind() != SDep::Anti)
1931 continue;
1932 if (S && S->count(Pred.getSUnit()) == 0)
1933 continue;
1934 if (NodeOrder.count(Pred.getSUnit()) == 0)
1935 Succs.insert(Pred.getSUnit());
1936 }
1937 }
1938 return !Succs.empty();
1939}
1940
1941/// Return true if there is a path from the specified node to any of the nodes
1942/// in DestNodes. Keep track and return the nodes in any path.
1943static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1944 SetVector<SUnit *> &DestNodes,
1945 SetVector<SUnit *> &Exclude,
1946 SmallPtrSet<SUnit *, 8> &Visited) {
1947 if (Cur->isBoundaryNode())
1948 return false;
1949 if (Exclude.contains(Cur))
1950 return false;
1951 if (DestNodes.contains(Cur))
1952 return true;
1953 if (!Visited.insert(Cur).second)
1954 return Path.contains(Cur);
1955 bool FoundPath = false;
1956 for (auto &SI : Cur->Succs)
1957 if (!ignoreDependence(SI, false))
1958 FoundPath |=
1959 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1960 for (auto &PI : Cur->Preds)
1961 if (PI.getKind() == SDep::Anti)
1962 FoundPath |=
1963 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1964 if (FoundPath)
1965 Path.insert(Cur);
1966 return FoundPath;
1967}
1968
1969/// Compute the live-out registers for the instructions in a node-set.
1970/// The live-out registers are those that are defined in the node-set,
1971/// but not used. Except for use operands of Phis.
1973 NodeSet &NS) {
1978 for (SUnit *SU : NS) {
1979 const MachineInstr *MI = SU->getInstr();
1980 if (MI->isPHI())
1981 continue;
1982 for (const MachineOperand &MO : MI->all_uses()) {
1983 Register Reg = MO.getReg();
1984 if (Reg.isVirtual())
1985 Uses.insert(Reg);
1986 else if (MRI.isAllocatable(Reg))
1987 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
1988 Uses.insert(Unit);
1989 }
1990 }
1991 for (SUnit *SU : NS)
1992 for (const MachineOperand &MO : SU->getInstr()->all_defs())
1993 if (!MO.isDead()) {
1994 Register Reg = MO.getReg();
1995 if (Reg.isVirtual()) {
1996 if (!Uses.count(Reg))
1997 LiveOutRegs.push_back(RegisterMaskPair(Reg,
1999 } else if (MRI.isAllocatable(Reg)) {
2000 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg()))
2001 if (!Uses.count(Unit))
2002 LiveOutRegs.push_back(
2004 }
2005 }
2006 RPTracker.addLiveRegs(LiveOutRegs);
2007}
2008
2009/// A heuristic to filter nodes in recurrent node-sets if the register
2010/// pressure of a set is too high.
2011void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2012 for (auto &NS : NodeSets) {
2013 // Skip small node-sets since they won't cause register pressure problems.
2014 if (NS.size() <= 2)
2015 continue;
2016 IntervalPressure RecRegPressure;
2017 RegPressureTracker RecRPTracker(RecRegPressure);
2018 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
2019 computeLiveOuts(MF, RecRPTracker, NS);
2020 RecRPTracker.closeBottom();
2021
2022 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2023 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
2024 return A->NodeNum > B->NodeNum;
2025 });
2026
2027 for (auto &SU : SUnits) {
2028 // Since we're computing the register pressure for a subset of the
2029 // instructions in a block, we need to set the tracker for each
2030 // instruction in the node-set. The tracker is set to the instruction
2031 // just after the one we're interested in.
2033 RecRPTracker.setPos(std::next(CurInstI));
2034
2035 RegPressureDelta RPDelta;
2036 ArrayRef<PressureChange> CriticalPSets;
2037 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
2038 CriticalPSets,
2039 RecRegPressure.MaxSetPressure);
2040 if (RPDelta.Excess.isValid()) {
2041 LLVM_DEBUG(
2042 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2044 << ":" << RPDelta.Excess.getUnitInc() << "\n");
2045 NS.setExceedPressure(SU);
2046 break;
2047 }
2048 RecRPTracker.recede();
2049 }
2050 }
2051}
2052
2053/// A heuristic to colocate node sets that have the same set of
2054/// successors.
2055void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2056 unsigned Colocate = 0;
2057 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
2058 NodeSet &N1 = NodeSets[i];
2060 if (N1.empty() || !succ_L(N1, S1))
2061 continue;
2062 for (int j = i + 1; j < e; ++j) {
2063 NodeSet &N2 = NodeSets[j];
2064 if (N1.compareRecMII(N2) != 0)
2065 continue;
2067 if (N2.empty() || !succ_L(N2, S2))
2068 continue;
2069 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2070 N1.setColocate(++Colocate);
2071 N2.setColocate(Colocate);
2072 break;
2073 }
2074 }
2075 }
2076}
2077
2078/// Check if the existing node-sets are profitable. If not, then ignore the
2079/// recurrent node-sets, and attempt to schedule all nodes together. This is
2080/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2081/// then it's best to try to schedule all instructions together instead of
2082/// starting with the recurrent node-sets.
2083void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2084 // Look for loops with a large MII.
2085 if (MII < 17)
2086 return;
2087 // Check if the node-set contains only a simple add recurrence.
2088 for (auto &NS : NodeSets) {
2089 if (NS.getRecMII() > 2)
2090 return;
2091 if (NS.getMaxDepth() > MII)
2092 return;
2093 }
2094 NodeSets.clear();
2095 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2096}
2097
2098/// Add the nodes that do not belong to a recurrence set into groups
2099/// based upon connected components.
2100void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2101 SetVector<SUnit *> NodesAdded;
2103 // Add the nodes that are on a path between the previous node sets and
2104 // the current node set.
2105 for (NodeSet &I : NodeSets) {
2107 // Add the nodes from the current node set to the previous node set.
2108 if (succ_L(I, N)) {
2110 for (SUnit *NI : N) {
2111 Visited.clear();
2112 computePath(NI, Path, NodesAdded, I, Visited);
2113 }
2114 if (!Path.empty())
2115 I.insert(Path.begin(), Path.end());
2116 }
2117 // Add the nodes from the previous node set to the current node set.
2118 N.clear();
2119 if (succ_L(NodesAdded, N)) {
2121 for (SUnit *NI : N) {
2122 Visited.clear();
2123 computePath(NI, Path, I, NodesAdded, Visited);
2124 }
2125 if (!Path.empty())
2126 I.insert(Path.begin(), Path.end());
2127 }
2128 NodesAdded.insert(I.begin(), I.end());
2129 }
2130
2131 // Create a new node set with the connected nodes of any successor of a node
2132 // in a recurrent set.
2133 NodeSet NewSet;
2135 if (succ_L(NodesAdded, N))
2136 for (SUnit *I : N)
2137 addConnectedNodes(I, NewSet, NodesAdded);
2138 if (!NewSet.empty())
2139 NodeSets.push_back(NewSet);
2140
2141 // Create a new node set with the connected nodes of any predecessor of a node
2142 // in a recurrent set.
2143 NewSet.clear();
2144 if (pred_L(NodesAdded, N))
2145 for (SUnit *I : N)
2146 addConnectedNodes(I, NewSet, NodesAdded);
2147 if (!NewSet.empty())
2148 NodeSets.push_back(NewSet);
2149
2150 // Create new nodes sets with the connected nodes any remaining node that
2151 // has no predecessor.
2152 for (SUnit &SU : SUnits) {
2153 if (NodesAdded.count(&SU) == 0) {
2154 NewSet.clear();
2155 addConnectedNodes(&SU, NewSet, NodesAdded);
2156 if (!NewSet.empty())
2157 NodeSets.push_back(NewSet);
2158 }
2159 }
2160}
2161
2162/// Add the node to the set, and add all of its connected nodes to the set.
2163void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2164 SetVector<SUnit *> &NodesAdded) {
2165 NewSet.insert(SU);
2166 NodesAdded.insert(SU);
2167 for (auto &SI : SU->Succs) {
2168 SUnit *Successor = SI.getSUnit();
2169 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
2170 NodesAdded.count(Successor) == 0)
2171 addConnectedNodes(Successor, NewSet, NodesAdded);
2172 }
2173 for (auto &PI : SU->Preds) {
2174 SUnit *Predecessor = PI.getSUnit();
2175 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
2176 addConnectedNodes(Predecessor, NewSet, NodesAdded);
2177 }
2178}
2179
2180/// Return true if Set1 contains elements in Set2. The elements in common
2181/// are returned in a different container.
2182static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
2184 Result.clear();
2185 for (SUnit *SU : Set1) {
2186 if (Set2.count(SU) != 0)
2187 Result.insert(SU);
2188 }
2189 return !Result.empty();
2190}
2191
2192/// Merge the recurrence node sets that have the same initial node.
2193void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2194 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2195 ++I) {
2196 NodeSet &NI = *I;
2197 for (NodeSetType::iterator J = I + 1; J != E;) {
2198 NodeSet &NJ = *J;
2199 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
2200 if (NJ.compareRecMII(NI) > 0)
2201 NI.setRecMII(NJ.getRecMII());
2202 for (SUnit *SU : *J)
2203 I->insert(SU);
2204 NodeSets.erase(J);
2205 E = NodeSets.end();
2206 } else {
2207 ++J;
2208 }
2209 }
2210 }
2211}
2212
2213/// Remove nodes that have been scheduled in previous NodeSets.
2214void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2215 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2216 ++I)
2217 for (NodeSetType::iterator J = I + 1; J != E;) {
2218 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
2219
2220 if (J->empty()) {
2221 NodeSets.erase(J);
2222 E = NodeSets.end();
2223 } else {
2224 ++J;
2225 }
2226 }
2227}
2228
2229/// Compute an ordered list of the dependence graph nodes, which
2230/// indicates the order that the nodes will be scheduled. This is a
2231/// two-level algorithm. First, a partial order is created, which
2232/// consists of a list of sets ordered from highest to lowest priority.
2233void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2235 NodeOrder.clear();
2236
2237 for (auto &Nodes : NodeSets) {
2238 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2239 OrderKind Order;
2241 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2242 R.insert(N.begin(), N.end());
2243 Order = BottomUp;
2244 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2245 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
2246 R.insert(N.begin(), N.end());
2247 Order = TopDown;
2248 LLVM_DEBUG(dbgs() << " Top down (succs) ");
2249 } else if (isIntersect(N, Nodes, R)) {
2250 // If some of the successors are in the existing node-set, then use the
2251 // top-down ordering.
2252 Order = TopDown;
2253 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2254 } else if (NodeSets.size() == 1) {
2255 for (const auto &N : Nodes)
2256 if (N->Succs.size() == 0)
2257 R.insert(N);
2258 Order = BottomUp;
2259 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2260 } else {
2261 // Find the node with the highest ASAP.
2262 SUnit *maxASAP = nullptr;
2263 for (SUnit *SU : Nodes) {
2264 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
2265 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2266 maxASAP = SU;
2267 }
2268 R.insert(maxASAP);
2269 Order = BottomUp;
2270 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2271 }
2272
2273 while (!R.empty()) {
2274 if (Order == TopDown) {
2275 // Choose the node with the maximum height. If more than one, choose
2276 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2277 // choose the node with the lowest MOV.
2278 while (!R.empty()) {
2279 SUnit *maxHeight = nullptr;
2280 for (SUnit *I : R) {
2281 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
2282 maxHeight = I;
2283 else if (getHeight(I) == getHeight(maxHeight) &&
2285 maxHeight = I;
2286 else if (getHeight(I) == getHeight(maxHeight) &&
2288 getZeroLatencyHeight(maxHeight) &&
2289 getMOV(I) < getMOV(maxHeight))
2290 maxHeight = I;
2291 }
2292 NodeOrder.insert(maxHeight);
2293 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2294 R.remove(maxHeight);
2295 for (const auto &I : maxHeight->Succs) {
2296 if (Nodes.count(I.getSUnit()) == 0)
2297 continue;
2298 if (NodeOrder.contains(I.getSUnit()))
2299 continue;
2300 if (ignoreDependence(I, false))
2301 continue;
2302 R.insert(I.getSUnit());
2303 }
2304 // Back-edges are predecessors with an anti-dependence.
2305 for (const auto &I : maxHeight->Preds) {
2306 if (I.getKind() != SDep::Anti)
2307 continue;
2308 if (Nodes.count(I.getSUnit()) == 0)
2309 continue;
2310 if (NodeOrder.contains(I.getSUnit()))
2311 continue;
2312 R.insert(I.getSUnit());
2313 }
2314 }
2315 Order = BottomUp;
2316 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2318 if (pred_L(NodeOrder, N, &Nodes))
2319 R.insert(N.begin(), N.end());
2320 } else {
2321 // Choose the node with the maximum depth. If more than one, choose
2322 // the node with the maximum ZeroLatencyDepth. If still more than one,
2323 // choose the node with the lowest MOV.
2324 while (!R.empty()) {
2325 SUnit *maxDepth = nullptr;
2326 for (SUnit *I : R) {
2327 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
2328 maxDepth = I;
2329 else if (getDepth(I) == getDepth(maxDepth) &&
2331 maxDepth = I;
2332 else if (getDepth(I) == getDepth(maxDepth) &&
2334 getMOV(I) < getMOV(maxDepth))
2335 maxDepth = I;
2336 }
2337 NodeOrder.insert(maxDepth);
2338 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2339 R.remove(maxDepth);
2340 if (Nodes.isExceedSU(maxDepth)) {
2341 Order = TopDown;
2342 R.clear();
2343 R.insert(Nodes.getNode(0));
2344 break;
2345 }
2346 for (const auto &I : maxDepth->Preds) {
2347 if (Nodes.count(I.getSUnit()) == 0)
2348 continue;
2349 if (NodeOrder.contains(I.getSUnit()))
2350 continue;
2351 R.insert(I.getSUnit());
2352 }
2353 // Back-edges are predecessors with an anti-dependence.
2354 for (const auto &I : maxDepth->Succs) {
2355 if (I.getKind() != SDep::Anti)
2356 continue;
2357 if (Nodes.count(I.getSUnit()) == 0)
2358 continue;
2359 if (NodeOrder.contains(I.getSUnit()))
2360 continue;
2361 R.insert(I.getSUnit());
2362 }
2363 }
2364 Order = TopDown;
2365 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2367 if (succ_L(NodeOrder, N, &Nodes))
2368 R.insert(N.begin(), N.end());
2369 }
2370 }
2371 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2372 }
2373
2374 LLVM_DEBUG({
2375 dbgs() << "Node order: ";
2376 for (SUnit *I : NodeOrder)
2377 dbgs() << " " << I->NodeNum << " ";
2378 dbgs() << "\n";
2379 });
2380}
2381
2382/// Process the nodes in the computed order and create the pipelined schedule
2383/// of the instructions, if possible. Return true if a schedule is found.
2384bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2385
2386 if (NodeOrder.empty()){
2387 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2388 return false;
2389 }
2390
2391 bool scheduleFound = false;
2392 std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2393 if (LimitRegPressure) {
2394 HRPDetector =
2395 std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
2396 HRPDetector->init(RegClassInfo);
2397 }
2398 // Keep increasing II until a valid schedule is found.
2399 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2400 Schedule.reset();
2401 Schedule.setInitiationInterval(II);
2402 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2403
2404 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2405 SetVector<SUnit *>::iterator NE = NodeOrder.end();
2406 do {
2407 SUnit *SU = *NI;
2408
2409 // Compute the schedule time for the instruction, which is based
2410 // upon the scheduled time for any predecessors/successors.
2411 int EarlyStart = INT_MIN;
2412 int LateStart = INT_MAX;
2413 // These values are set when the size of the schedule window is limited
2414 // due to chain dependences.
2415 int SchedEnd = INT_MAX;
2416 int SchedStart = INT_MIN;
2417 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2418 II, this);
2419 LLVM_DEBUG({
2420 dbgs() << "\n";
2421 dbgs() << "Inst (" << SU->NodeNum << ") ";
2422 SU->getInstr()->dump();
2423 dbgs() << "\n";
2424 });
2425 LLVM_DEBUG({
2426 dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2427 LateStart, SchedEnd, SchedStart);
2428 });
2429
2430 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2431 SchedStart > LateStart)
2432 scheduleFound = false;
2433 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2434 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2435 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2436 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2437 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2438 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2439 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2440 SchedEnd =
2441 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2442 // When scheduling a Phi it is better to start at the late cycle and go
2443 // backwards. The default order may insert the Phi too far away from
2444 // its first dependence.
2445 if (SU->getInstr()->isPHI())
2446 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2447 else
2448 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2449 } else {
2450 int FirstCycle = Schedule.getFirstCycle();
2451 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2452 FirstCycle + getASAP(SU) + II - 1, II);
2453 }
2454 // Even if we find a schedule, make sure the schedule doesn't exceed the
2455 // allowable number of stages. We keep trying if this happens.
2456 if (scheduleFound)
2457 if (SwpMaxStages > -1 &&
2458 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2459 scheduleFound = false;
2460
2461 LLVM_DEBUG({
2462 if (!scheduleFound)
2463 dbgs() << "\tCan't schedule\n";
2464 });
2465 } while (++NI != NE && scheduleFound);
2466
2467 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2468 if (scheduleFound)
2469 scheduleFound =
2470 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2471
2472 // If a schedule is found, check if it is a valid schedule too.
2473 if (scheduleFound)
2474 scheduleFound = Schedule.isValidSchedule(this);
2475
2476 // If a schedule was found and the option is enabled, check if the schedule
2477 // might generate additional register spills/fills.
2478 if (scheduleFound && LimitRegPressure)
2479 scheduleFound =
2480 !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
2481 }
2482
2483 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2484 << " (II=" << Schedule.getInitiationInterval()
2485 << ")\n");
2486
2487 if (scheduleFound) {
2488 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2489 if (!scheduleFound)
2490 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2491 }
2492
2493 if (scheduleFound) {
2494 Schedule.finalizeSchedule(this);
2495 Pass.ORE->emit([&]() {
2497 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2498 << "Schedule found with Initiation Interval: "
2499 << ore::NV("II", Schedule.getInitiationInterval())
2500 << ", MaxStageCount: "
2501 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2502 });
2503 } else
2504 Schedule.reset();
2505
2506 return scheduleFound && Schedule.getMaxStageCount() > 0;
2507}
2508
2509/// Return true if we can compute the amount the instruction changes
2510/// during each iteration. Set Delta to the amount of the change.
2511bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2513 const MachineOperand *BaseOp;
2514 int64_t Offset;
2515 bool OffsetIsScalable;
2516 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2517 return false;
2518
2519 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2520 if (OffsetIsScalable)
2521 return false;
2522
2523 if (!BaseOp->isReg())
2524 return false;
2525
2526 Register BaseReg = BaseOp->getReg();
2527
2529 // Check if there is a Phi. If so, get the definition in the loop.
2530 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2531 if (BaseDef && BaseDef->isPHI()) {
2532 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2533 BaseDef = MRI.getVRegDef(BaseReg);
2534 }
2535 if (!BaseDef)
2536 return false;
2537
2538 int D = 0;
2539 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2540 return false;
2541
2542 Delta = D;
2543 return true;
2544}
2545
2546/// Check if we can change the instruction to use an offset value from the
2547/// previous iteration. If so, return true and set the base and offset values
2548/// so that we can rewrite the load, if necessary.
2549/// v1 = Phi(v0, v3)
2550/// v2 = load v1, 0
2551/// v3 = post_store v1, 4, x
2552/// This function enables the load to be rewritten as v2 = load v3, 4.
2553bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2554 unsigned &BasePos,
2555 unsigned &OffsetPos,
2556 unsigned &NewBase,
2557 int64_t &Offset) {
2558 // Get the load instruction.
2559 if (TII->isPostIncrement(*MI))
2560 return false;
2561 unsigned BasePosLd, OffsetPosLd;
2562 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2563 return false;
2564 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2565
2566 // Look for the Phi instruction.
2567 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2568 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2569 if (!Phi || !Phi->isPHI())
2570 return false;
2571 // Get the register defined in the loop block.
2572 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2573 if (!PrevReg)
2574 return false;
2575
2576 // Check for the post-increment load/store instruction.
2577 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2578 if (!PrevDef || PrevDef == MI)
2579 return false;
2580
2581 if (!TII->isPostIncrement(*PrevDef))
2582 return false;
2583
2584 unsigned BasePos1 = 0, OffsetPos1 = 0;
2585 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2586 return false;
2587
2588 // Make sure that the instructions do not access the same memory location in
2589 // the next iteration.
2590 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2591 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2593 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2594 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2595 MF.deleteMachineInstr(NewMI);
2596 if (!Disjoint)
2597 return false;
2598
2599 // Set the return value once we determine that we return true.
2600 BasePos = BasePosLd;
2601 OffsetPos = OffsetPosLd;
2602 NewBase = PrevReg;
2603 Offset = StoreOffset;
2604 return true;
2605}
2606
2607/// Apply changes to the instruction if needed. The changes are need
2608/// to improve the scheduling and depend up on the final schedule.
2610 SMSchedule &Schedule) {
2611 SUnit *SU = getSUnit(MI);
2613 InstrChanges.find(SU);
2614 if (It != InstrChanges.end()) {
2615 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2616 unsigned BasePos, OffsetPos;
2617 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2618 return;
2619 Register BaseReg = MI->getOperand(BasePos).getReg();
2620 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2621 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2622 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2623 int BaseStageNum = Schedule.stageScheduled(SU);
2624 int BaseCycleNum = Schedule.cycleScheduled(SU);
2625 if (BaseStageNum < DefStageNum) {
2627 int OffsetDiff = DefStageNum - BaseStageNum;
2628 if (DefCycleNum < BaseCycleNum) {
2629 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2630 if (OffsetDiff > 0)
2631 --OffsetDiff;
2632 }
2633 int64_t NewOffset =
2634 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2635 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2636 SU->setInstr(NewMI);
2637 MISUnitMap[NewMI] = SU;
2638 NewMIs[MI] = NewMI;
2639 }
2640 }
2641}
2642
2643/// Return the instruction in the loop that defines the register.
2644/// If the definition is a Phi, then follow the Phi operand to
2645/// the instruction in the loop.
2646MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2648 MachineInstr *Def = MRI.getVRegDef(Reg);
2649 while (Def->isPHI()) {
2650 if (!Visited.insert(Def).second)
2651 break;
2652 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2653 if (Def->getOperand(i + 1).getMBB() == BB) {
2654 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2655 break;
2656 }
2657 }
2658 return Def;
2659}
2660
2661/// Return true for an order or output dependence that is loop carried
2662/// potentially. A dependence is loop carried if the destination defines a value
2663/// that may be used or defined by the source in a subsequent iteration.
2665 bool isSucc) {
2666 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2667 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2668 return false;
2669
2671 return true;
2672
2673 if (Dep.getKind() == SDep::Output)
2674 return true;
2675
2676 MachineInstr *SI = Source->getInstr();
2677 MachineInstr *DI = Dep.getSUnit()->getInstr();
2678 if (!isSucc)
2679 std::swap(SI, DI);
2680 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2681
2682 // Assume ordered loads and stores may have a loop carried dependence.
2683 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2684 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2685 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2686 return true;
2687
2688 if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
2689 return false;
2690
2691 // The conservative assumption is that a dependence between memory operations
2692 // may be loop carried. The following code checks when it can be proved that
2693 // there is no loop carried dependence.
2694 unsigned DeltaS, DeltaD;
2695 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2696 return true;
2697
2698 const MachineOperand *BaseOpS, *BaseOpD;
2699 int64_t OffsetS, OffsetD;
2700 bool OffsetSIsScalable, OffsetDIsScalable;
2702 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2703 TRI) ||
2704 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2705 TRI))
2706 return true;
2707
2708 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2709 "Expected offsets to be byte offsets");
2710
2711 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2712 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2713 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2714 return true;
2715
2716 unsigned InitValS = 0;
2717 unsigned LoopValS = 0;
2718 unsigned InitValD = 0;
2719 unsigned LoopValD = 0;
2720 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2721 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2722 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2723 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2724
2725 if (!InitDefS->isIdenticalTo(*InitDefD))
2726 return true;
2727
2728 // Check that the base register is incremented by a constant value for each
2729 // iteration.
2730 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2731 int D = 0;
2732 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2733 return true;
2734
2735 LocationSize AccessSizeS = (*SI->memoperands_begin())->getSize();
2736 LocationSize AccessSizeD = (*DI->memoperands_begin())->getSize();
2737
2738 // This is the main test, which checks the offset values and the loop
2739 // increment value to determine if the accesses may be loop carried.
2740 if (!AccessSizeS.hasValue() || !AccessSizeD.hasValue())
2741 return true;
2742
2743 if (DeltaS != DeltaD || DeltaS < AccessSizeS.getValue() ||
2744 DeltaD < AccessSizeD.getValue())
2745 return true;
2746
2747 return (OffsetS + (int64_t)AccessSizeS.getValue() <
2748 OffsetD + (int64_t)AccessSizeD.getValue());
2749}
2750
2751void SwingSchedulerDAG::postProcessDAG() {
2752 for (auto &M : Mutations)
2753 M->apply(this);
2754}
2755
2756/// Try to schedule the node at the specified StartCycle and continue
2757/// until the node is schedule or the EndCycle is reached. This function
2758/// returns true if the node is scheduled. This routine may search either
2759/// forward or backward for a place to insert the instruction based upon
2760/// the relative values of StartCycle and EndCycle.
2761bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2762 bool forward = true;
2763 LLVM_DEBUG({
2764 dbgs() << "Trying to insert node between " << StartCycle << " and "
2765 << EndCycle << " II: " << II << "\n";
2766 });
2767 if (StartCycle > EndCycle)
2768 forward = false;
2769
2770 // The terminating condition depends on the direction.
2771 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2772 for (int curCycle = StartCycle; curCycle != termCycle;
2773 forward ? ++curCycle : --curCycle) {
2774
2775 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2776 ProcItinResources.canReserveResources(*SU, curCycle)) {
2777 LLVM_DEBUG({
2778 dbgs() << "\tinsert at cycle " << curCycle << " ";
2779 SU->getInstr()->dump();
2780 });
2781
2782 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2783 ProcItinResources.reserveResources(*SU, curCycle);
2784 ScheduledInstrs[curCycle].push_back(SU);
2785 InstrToCycle.insert(std::make_pair(SU, curCycle));
2786 if (curCycle > LastCycle)
2787 LastCycle = curCycle;
2788 if (curCycle < FirstCycle)
2789 FirstCycle = curCycle;
2790 return true;
2791 }
2792 LLVM_DEBUG({
2793 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2794 SU->getInstr()->dump();
2795 });
2796 }
2797 return false;
2798}
2799
2800// Return the cycle of the earliest scheduled instruction in the chain.
2803 SmallVector<SDep, 8> Worklist;
2804 Worklist.push_back(Dep);
2805 int EarlyCycle = INT_MAX;
2806 while (!Worklist.empty()) {
2807 const SDep &Cur = Worklist.pop_back_val();
2808 SUnit *PrevSU = Cur.getSUnit();
2809 if (Visited.count(PrevSU))
2810 continue;
2811 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2812 if (it == InstrToCycle.end())
2813 continue;
2814 EarlyCycle = std::min(EarlyCycle, it->second);
2815 for (const auto &PI : PrevSU->Preds)
2816 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2817 Worklist.push_back(PI);
2818 Visited.insert(PrevSU);
2819 }
2820 return EarlyCycle;
2821}
2822
2823// Return the cycle of the latest scheduled instruction in the chain.
2826 SmallVector<SDep, 8> Worklist;
2827 Worklist.push_back(Dep);
2828 int LateCycle = INT_MIN;
2829 while (!Worklist.empty()) {
2830 const SDep &Cur = Worklist.pop_back_val();
2831 SUnit *SuccSU = Cur.getSUnit();
2832 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2833 continue;
2834 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2835 if (it == InstrToCycle.end())
2836 continue;
2837 LateCycle = std::max(LateCycle, it->second);
2838 for (const auto &SI : SuccSU->Succs)
2839 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2840 Worklist.push_back(SI);
2841 Visited.insert(SuccSU);
2842 }
2843 return LateCycle;
2844}
2845
2846/// If an instruction has a use that spans multiple iterations, then
2847/// return true. These instructions are characterized by having a back-ege
2848/// to a Phi, which contains a reference to another Phi.
2850 for (auto &P : SU->Preds)
2851 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2852 for (auto &S : P.getSUnit()->Succs)
2853 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2854 return P.getSUnit();
2855 return nullptr;
2856}
2857
2858/// Compute the scheduling start slot for the instruction. The start slot
2859/// depends on any predecessor or successor nodes scheduled already.
2860void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2861 int *MinEnd, int *MaxStart, int II,
2862 SwingSchedulerDAG *DAG) {
2863 // Iterate over each instruction that has been scheduled already. The start
2864 // slot computation depends on whether the previously scheduled instruction
2865 // is a predecessor or successor of the specified instruction.
2866 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2867
2868 // Iterate over each instruction in the current cycle.
2869 for (SUnit *I : getInstructions(cycle)) {
2870 // Because we're processing a DAG for the dependences, we recognize
2871 // the back-edge in recurrences by anti dependences.
2872 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2873 const SDep &Dep = SU->Preds[i];
2874 if (Dep.getSUnit() == I) {
2875 if (!DAG->isBackedge(SU, Dep)) {
2876 int EarlyStart = cycle + Dep.getLatency() -
2877 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2878 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2879 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2880 int End = earliestCycleInChain(Dep) + (II - 1);
2881 *MinEnd = std::min(*MinEnd, End);
2882 }
2883 } else {
2884 int LateStart = cycle - Dep.getLatency() +
2885 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2886 *MinLateStart = std::min(*MinLateStart, LateStart);
2887 }
2888 }
2889 // For instruction that requires multiple iterations, make sure that
2890 // the dependent instruction is not scheduled past the definition.
2891 SUnit *BE = multipleIterations(I, DAG);
2892 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2893 !SU->isPred(I))
2894 *MinLateStart = std::min(*MinLateStart, cycle);
2895 }
2896 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2897 if (SU->Succs[i].getSUnit() == I) {
2898 const SDep &Dep = SU->Succs[i];
2899 if (!DAG->isBackedge(SU, Dep)) {
2900 int LateStart = cycle - Dep.getLatency() +
2901 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2902 *MinLateStart = std::min(*MinLateStart, LateStart);
2903 if (DAG->isLoopCarriedDep(SU, Dep)) {
2904 int Start = latestCycleInChain(Dep) + 1 - II;
2905 *MaxStart = std::max(*MaxStart, Start);
2906 }
2907 } else {
2908 int EarlyStart = cycle + Dep.getLatency() -
2909 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2910 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2911 }
2912 }
2913 }
2914 }
2915 }
2916}
2917
2918/// Order the instructions within a cycle so that the definitions occur
2919/// before the uses. Returns true if the instruction is added to the start
2920/// of the list, or false if added to the end.
2922 std::deque<SUnit *> &Insts) const {
2923 MachineInstr *MI = SU->getInstr();
2924 bool OrderBeforeUse = false;
2925 bool OrderAfterDef = false;
2926 bool OrderBeforeDef = false;
2927 unsigned MoveDef = 0;
2928 unsigned MoveUse = 0;
2929 int StageInst1 = stageScheduled(SU);
2930
2931 unsigned Pos = 0;
2932 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2933 ++I, ++Pos) {
2934 for (MachineOperand &MO : MI->operands()) {
2935 if (!MO.isReg() || !MO.getReg().isVirtual())
2936 continue;
2937
2938 Register Reg = MO.getReg();
2939 unsigned BasePos, OffsetPos;
2940 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2941 if (MI->getOperand(BasePos).getReg() == Reg)
2942 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2943 Reg = NewReg;
2944 bool Reads, Writes;
2945 std::tie(Reads, Writes) =
2946 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2947 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2948 OrderBeforeUse = true;
2949 if (MoveUse == 0)
2950 MoveUse = Pos;
2951 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2952 // Add the instruction after the scheduled instruction.
2953 OrderAfterDef = true;
2954 MoveDef = Pos;
2955 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2956 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2957 OrderBeforeUse = true;
2958 if (MoveUse == 0)
2959 MoveUse = Pos;
2960 } else {
2961 OrderAfterDef = true;
2962 MoveDef = Pos;
2963 }
2964 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2965 OrderBeforeUse = true;
2966 if (MoveUse == 0)
2967 MoveUse = Pos;
2968 if (MoveUse != 0) {
2969 OrderAfterDef = true;
2970 MoveDef = Pos - 1;
2971 }
2972 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2973 // Add the instruction before the scheduled instruction.
2974 OrderBeforeUse = true;
2975 if (MoveUse == 0)
2976 MoveUse = Pos;
2977 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2978 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2979 if (MoveUse == 0) {
2980 OrderBeforeDef = true;
2981 MoveUse = Pos;
2982 }
2983 }
2984 }
2985 // Check for order dependences between instructions. Make sure the source
2986 // is ordered before the destination.
2987 for (auto &S : SU->Succs) {
2988 if (S.getSUnit() != *I)
2989 continue;
2990 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2991 OrderBeforeUse = true;
2992 if (Pos < MoveUse)
2993 MoveUse = Pos;
2994 }
2995 // We did not handle HW dependences in previous for loop,
2996 // and we normally set Latency = 0 for Anti deps,
2997 // so may have nodes in same cycle with Anti denpendent on HW regs.
2998 else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2999 OrderBeforeUse = true;
3000 if ((MoveUse == 0) || (Pos < MoveUse))
3001 MoveUse = Pos;
3002 }
3003 }
3004 for (auto &P : SU->Preds) {
3005 if (P.getSUnit() != *I)
3006 continue;
3007 if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
3008 OrderAfterDef = true;
3009 MoveDef = Pos;
3010 }
3011 }
3012 }
3013
3014 // A circular dependence.
3015 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3016 OrderBeforeUse = false;
3017
3018 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3019 // to a loop-carried dependence.
3020 if (OrderBeforeDef)
3021 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
3022
3023 // The uncommon case when the instruction order needs to be updated because
3024 // there is both a use and def.
3025 if (OrderBeforeUse && OrderAfterDef) {
3026 SUnit *UseSU = Insts.at(MoveUse);
3027 SUnit *DefSU = Insts.at(MoveDef);
3028 if (MoveUse > MoveDef) {
3029 Insts.erase(Insts.begin() + MoveUse);
3030 Insts.erase(Insts.begin() + MoveDef);
3031 } else {
3032 Insts.erase(Insts.begin() + MoveDef);
3033 Insts.erase(Insts.begin() + MoveUse);
3034 }
3035 orderDependence(SSD, UseSU, Insts);
3036 orderDependence(SSD, SU, Insts);
3037 orderDependence(SSD, DefSU, Insts);
3038 return;
3039 }
3040 // Put the new instruction first if there is a use in the list. Otherwise,
3041 // put it at the end of the list.
3042 if (OrderBeforeUse)
3043 Insts.push_front(SU);
3044 else
3045 Insts.push_back(SU);
3046}
3047
3048/// Return true if the scheduled Phi has a loop carried operand.
3050 MachineInstr &Phi) const {
3051 if (!Phi.isPHI())
3052 return false;
3053 assert(Phi.isPHI() && "Expecting a Phi.");
3054 SUnit *DefSU = SSD->getSUnit(&Phi);
3055 unsigned DefCycle = cycleScheduled(DefSU);
3056 int DefStage = stageScheduled(DefSU);
3057
3058 unsigned InitVal = 0;
3059 unsigned LoopVal = 0;
3060 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
3061 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
3062 if (!UseSU)
3063 return true;
3064 if (UseSU->getInstr()->isPHI())
3065 return true;
3066 unsigned LoopCycle = cycleScheduled(UseSU);
3067 int LoopStage = stageScheduled(UseSU);
3068 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
3069}
3070
3071/// Return true if the instruction is a definition that is loop carried
3072/// and defines the use on the next iteration.
3073/// v1 = phi(v2, v3)
3074/// (Def) v3 = op v1
3075/// (MO) = v1
3076/// If MO appears before Def, then v1 and v3 may get assigned to the same
3077/// register.
3079 MachineInstr *Def,
3080 MachineOperand &MO) const {
3081 if (!MO.isReg())
3082 return false;
3083 if (Def->isPHI())
3084 return false;
3085 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
3086 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
3087 return false;
3088 if (!isLoopCarried(SSD, *Phi))
3089 return false;
3090 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
3091 for (MachineOperand &DMO : Def->all_defs()) {
3092 if (DMO.getReg() == LoopReg)
3093 return true;
3094 }
3095 return false;
3096}
3097
3098/// Determine transitive dependences of unpipelineable instructions
3101 SmallSet<SUnit *, 8> DoNotPipeline;
3102 SmallVector<SUnit *, 8> Worklist;
3103
3104 for (auto &SU : SSD->SUnits)
3105 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
3106 Worklist.push_back(&SU);
3107
3108 while (!Worklist.empty()) {
3109 auto SU = Worklist.pop_back_val();
3110 if (DoNotPipeline.count(SU))
3111 continue;
3112 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3113 DoNotPipeline.insert(SU);
3114 for (auto &Dep : SU->Preds)
3115 Worklist.push_back(Dep.getSUnit());
3116 if (SU->getInstr()->isPHI())
3117 for (auto &Dep : SU->Succs)
3118 if (Dep.getKind() == SDep::Anti)
3119 Worklist.push_back(Dep.getSUnit());
3120 }
3121 return DoNotPipeline;
3122}
3123
3124// Determine all instructions upon which any unpipelineable instruction depends
3125// and ensure that they are in stage 0. If unable to do so, return false.
3129
3130 int NewLastCycle = INT_MIN;
3131 for (SUnit &SU : SSD->SUnits) {
3132 if (!SU.isInstr())
3133 continue;
3134 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
3135 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
3136 continue;
3137 }
3138
3139 // Put the non-pipelined instruction as early as possible in the schedule
3140 int NewCycle = getFirstCycle();
3141 for (auto &Dep : SU.Preds)
3142 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
3143
3144 int OldCycle = InstrToCycle[&SU];
3145 if (OldCycle != NewCycle) {
3146 InstrToCycle[&SU] = NewCycle;
3147 auto &OldS = getInstructions(OldCycle);
3148 llvm::erase(OldS, &SU);
3149 getInstructions(NewCycle).emplace_back(&SU);
3150 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3151 << ") is not pipelined; moving from cycle " << OldCycle
3152 << " to " << NewCycle << " Instr:" << *SU.getInstr());
3153 }
3154 NewLastCycle = std::max(NewLastCycle, NewCycle);
3155 }
3156 LastCycle = NewLastCycle;
3157 return true;
3158}
3159
3160// Check if the generated schedule is valid. This function checks if
3161// an instruction that uses a physical register is scheduled in a
3162// different stage than the definition. The pipeliner does not handle
3163// physical register values that may cross a basic block boundary.
3164// Furthermore, if a physical def/use pair is assigned to the same
3165// cycle, orderDependence does not guarantee def/use ordering, so that
3166// case should be considered invalid. (The test checks for both
3167// earlier and same-cycle use to be more robust.)
3169 for (SUnit &SU : SSD->SUnits) {
3170 if (!SU.hasPhysRegDefs)
3171 continue;
3172 int StageDef = stageScheduled(&SU);
3173 int CycleDef = InstrToCycle[&SU];
3174 assert(StageDef != -1 && "Instruction should have been scheduled.");
3175 for (auto &SI : SU.Succs)
3176 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
3177 if (Register::isPhysicalRegister(SI.getReg())) {
3178 if (stageScheduled(SI.getSUnit()) != StageDef)
3179 return false;
3180 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
3181 return false;
3182 }
3183 }
3184 return true;
3185}
3186
3187/// A property of the node order in swing-modulo-scheduling is
3188/// that for nodes outside circuits the following holds:
3189/// none of them is scheduled after both a successor and a
3190/// predecessor.
3191/// The method below checks whether the property is met.
3192/// If not, debug information is printed and statistics information updated.
3193/// Note that we do not use an assert statement.
3194/// The reason is that although an invalid node oder may prevent
3195/// the pipeliner from finding a pipelined schedule for arbitrary II,
3196/// it does not lead to the generation of incorrect code.
3197void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3198
3199 // a sorted vector that maps each SUnit to its index in the NodeOrder
3200 typedef std::pair<SUnit *, unsigned> UnitIndex;
3201 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
3202
3203 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
3204 Indices.push_back(std::make_pair(NodeOrder[i], i));
3205
3206 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3207 return std::get<0>(i1) < std::get<0>(i2);
3208 };
3209
3210 // sort, so that we can perform a binary search
3211 llvm::sort(Indices, CompareKey);
3212
3213 bool Valid = true;
3214 (void)Valid;
3215 // for each SUnit in the NodeOrder, check whether
3216 // it appears after both a successor and a predecessor
3217 // of the SUnit. If this is the case, and the SUnit
3218 // is not part of circuit, then the NodeOrder is not
3219 // valid.
3220 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
3221 SUnit *SU = NodeOrder[i];
3222 unsigned Index = i;
3223
3224 bool PredBefore = false;
3225 bool SuccBefore = false;
3226
3227 SUnit *Succ;
3228 SUnit *Pred;
3229 (void)Succ;
3230 (void)Pred;
3231
3232 for (SDep &PredEdge : SU->Preds) {
3233 SUnit *PredSU = PredEdge.getSUnit();
3234 unsigned PredIndex = std::get<1>(
3235 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
3236 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3237 PredBefore = true;
3238 Pred = PredSU;
3239 break;
3240 }
3241 }
3242
3243 for (SDep &SuccEdge : SU->Succs) {
3244 SUnit *SuccSU = SuccEdge.getSUnit();
3245 // Do not process a boundary node, it was not included in NodeOrder,
3246 // hence not in Indices either, call to std::lower_bound() below will
3247 // return Indices.end().
3248 if (SuccSU->isBoundaryNode())
3249 continue;
3250 unsigned SuccIndex = std::get<1>(
3251 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
3252 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3253 SuccBefore = true;
3254 Succ = SuccSU;
3255 break;
3256 }
3257 }
3258
3259 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3260 // instructions in circuits are allowed to be scheduled
3261 // after both a successor and predecessor.
3262 bool InCircuit = llvm::any_of(
3263 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3264 if (InCircuit)
3265 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
3266 else {
3267 Valid = false;
3268 NumNodeOrderIssues++;
3269 LLVM_DEBUG(dbgs() << "Predecessor ";);
3270 }
3271 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3272 << " are scheduled before node " << SU->NodeNum
3273 << "\n";);
3274 }
3275 }
3276
3277 LLVM_DEBUG({
3278 if (!Valid)
3279 dbgs() << "Invalid node order found!\n";
3280 });
3281}
3282
3283/// Attempt to fix the degenerate cases when the instruction serialization
3284/// causes the register lifetimes to overlap. For example,
3285/// p' = store_pi(p, b)
3286/// = load p, offset
3287/// In this case p and p' overlap, which means that two registers are needed.
3288/// Instead, this function changes the load to use p' and updates the offset.
3289void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3290 unsigned OverlapReg = 0;
3291 unsigned NewBaseReg = 0;
3292 for (SUnit *SU : Instrs) {
3293 MachineInstr *MI = SU->getInstr();
3294 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
3295 const MachineOperand &MO = MI->getOperand(i);
3296 // Look for an instruction that uses p. The instruction occurs in the
3297 // same cycle but occurs later in the serialized order.
3298 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3299 // Check that the instruction appears in the InstrChanges structure,
3300 // which contains instructions that can have the offset updated.
3302 InstrChanges.find(SU);
3303 if (It != InstrChanges.end()) {
3304 unsigned BasePos, OffsetPos;
3305 // Update the base register and adjust the offset.
3306 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
3308 NewMI->getOperand(BasePos).setReg(NewBaseReg);
3309 int64_t NewOffset =
3310 MI->getOperand(OffsetPos).getImm() - It->second.second;
3311 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3312 SU->setInstr(NewMI);
3313 MISUnitMap[NewMI] = SU;
3314 NewMIs[MI] = NewMI;
3315 }
3316 }
3317 OverlapReg = 0;
3318 NewBaseReg = 0;
3319 break;
3320 }
3321 // Look for an instruction of the form p' = op(p), which uses and defines
3322 // two virtual registers that get allocated to the same physical register.
3323 unsigned TiedUseIdx = 0;
3324 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
3325 // OverlapReg is p in the example above.
3326 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
3327 // NewBaseReg is p' in the example above.
3328 NewBaseReg = MI->getOperand(i).getReg();
3329 break;
3330 }
3331 }
3332 }
3333}
3334
3335std::deque<SUnit *>
3337 const std::deque<SUnit *> &Instrs) const {
3338 std::deque<SUnit *> NewOrderPhi;
3339 for (SUnit *SU : Instrs) {
3340 if (SU->getInstr()->isPHI())
3341 NewOrderPhi.push_back(SU);
3342 }
3343 std::deque<SUnit *> NewOrderI;
3344 for (SUnit *SU : Instrs) {
3345 if (!SU->getInstr()->isPHI())
3346 orderDependence(SSD, SU, NewOrderI);
3347 }
3348 llvm::append_range(NewOrderPhi, NewOrderI);
3349 return NewOrderPhi;
3350}
3351
3352/// After the schedule has been formed, call this function to combine
3353/// the instructions from the different stages/cycles. That is, this
3354/// function creates a schedule that represents a single iteration.
3356 // Move all instructions to the first stage from later stages.
3357 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3358 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
3359 ++stage) {
3360 std::deque<SUnit *> &cycleInstrs =
3361 ScheduledInstrs[cycle + (stage * InitiationInterval)];
3362 for (SUnit *SU : llvm::reverse(cycleInstrs))
3363 ScheduledInstrs[cycle].push_front(SU);
3364 }
3365 }
3366
3367 // Erase all the elements in the later stages. Only one iteration should
3368 // remain in the scheduled list, and it contains all the instructions.
3369 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
3370 ScheduledInstrs.erase(cycle);
3371
3372 // Change the registers in instruction as specified in the InstrChanges
3373 // map. We need to use the new registers to create the correct order.
3374 for (const SUnit &SU : SSD->SUnits)
3375 SSD->applyInstrChange(SU.getInstr(), *this);
3376
3377 // Reorder the instructions in each cycle to fix and improve the
3378 // generated code.
3379 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3380 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
3381 cycleInstrs = reorderInstructions(SSD, cycleInstrs);
3382 SSD->fixupRegisterOverlaps(cycleInstrs);
3383 }
3384
3385 LLVM_DEBUG(dump(););
3386}
3387
3389 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3390 << " depth " << MaxDepth << " col " << Colocate << "\n";
3391 for (const auto &I : Nodes)
3392 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3393 os << "\n";
3394}
3395
3396#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3397/// Print the schedule information to the given output.
3399 // Iterate over each cycle.
3400 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3401 // Iterate over each instruction in the cycle.
3402 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3403 for (SUnit *CI : cycleInstrs->second) {
3404 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3405 os << "(" << CI->NodeNum << ") ";
3406 CI->getInstr()->print(os);
3407 os << "\n";
3408 }
3409 }
3410}
3411
3412/// Utility function used for debugging to print the schedule.
3415
3416void ResourceManager::dumpMRT() const {
3417 LLVM_DEBUG({
3418 if (UseDFA)
3419 return;
3420 std::stringstream SS;
3421 SS << "MRT:\n";
3422 SS << std::setw(4) << "Slot";
3423 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3424 SS << std::setw(3) << I;
3425 SS << std::setw(7) << "#Mops"
3426 << "\n";
3427 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3428 SS << std::setw(4) << Slot;
3429 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
3430 SS << std::setw(3) << MRT[Slot][I];
3431 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
3432 }
3433 dbgs() << SS.str();
3434 });
3435}
3436#endif
3437
3439 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3440 unsigned ProcResourceID = 0;
3441
3442 // We currently limit the resource kinds to 64 and below so that we can use
3443 // uint64_t for Masks
3444 assert(SM.getNumProcResourceKinds() < 64 &&
3445 "Too many kinds of resources, unsupported");
3446 // Create a unique bitmask for every processor resource unit.
3447 // Skip resource at index 0, since it always references 'InvalidUnit'.
3448 Masks.resize(SM.getNumProcResourceKinds());
3449 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3451 if (Desc.SubUnitsIdxBegin)
3452 continue;
3453 Masks[I] = 1ULL << ProcResourceID;
3454 ProcResourceID++;
3455 }
3456 // Create a unique bitmask for every processor resource group.
3457 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3459 if (!Desc.SubUnitsIdxBegin)
3460 continue;
3461 Masks[I] = 1ULL << ProcResourceID;
3462 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3463 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3464 ProcResourceID++;
3465 }
3466 LLVM_DEBUG({
3467 if (SwpShowResMask) {
3468 dbgs() << "ProcResourceDesc:\n";
3469 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3470 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3471 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3472 ProcResource->Name, I, Masks[I],
3473 ProcResource->NumUnits);
3474 }
3475 dbgs() << " -----------------\n";
3476 }
3477 });
3478}
3479
3481 LLVM_DEBUG({
3482 if (SwpDebugResource)
3483 dbgs() << "canReserveResources:\n";
3484 });
3485 if (UseDFA)
3486 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3487 ->canReserveResources(&SU.getInstr()->getDesc());
3488
3489 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3490 if (!SCDesc->isValid()) {
3491 LLVM_DEBUG({
3492 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3493 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3494 });
3495 return true;
3496 }
3497
3498 reserveResources(SCDesc, Cycle);
3499 bool Result = !isOverbooked();
3500 unreserveResources(SCDesc, Cycle);
3501
3502 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3503 return Result;
3504}
3505
3506void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3507 LLVM_DEBUG({
3508 if (SwpDebugResource)
3509 dbgs() << "reserveResources:\n";
3510 });
3511 if (UseDFA)
3512 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3513 ->reserveResources(&SU.getInstr()->getDesc());
3514
3515 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3516 if (!SCDesc->isValid()) {
3517 LLVM_DEBUG({
3518 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3519 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3520 });
3521 return;
3522 }
3523
3524 reserveResources(SCDesc, Cycle);
3525
3526 LLVM_DEBUG({
3527 if (SwpDebugResource) {
3528 dumpMRT();
3529 dbgs() << "reserveResources: done!\n\n";
3530 }
3531 });
3532}
3533
3534void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3535 int Cycle) {
3536 assert(!UseDFA);
3537 for (const MCWriteProcResEntry &PRE : make_range(
3538 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3539 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3540 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3541
3542 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3543 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3544}
3545
3546void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3547 int Cycle) {
3548 assert(!UseDFA);
3549 for (const MCWriteProcResEntry &PRE : make_range(
3550 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3551 for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3552 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3553
3554 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3555 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3556}
3557
3558bool ResourceManager::isOverbooked() const {
3559 assert(!UseDFA);
3560 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3561 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3563 if (MRT[Slot][I] > Desc->NumUnits)
3564 return true;
3565 }
3566 if (NumScheduledMops[Slot] > IssueWidth)
3567 return true;
3568 }
3569 return false;
3570}
3571
3572int ResourceManager::calculateResMIIDFA() const {
3573 assert(UseDFA);
3574
3575 // Sort the instructions by the number of available choices for scheduling,
3576 // least to most. Use the number of critical resources as the tie breaker.
3577 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3578 for (SUnit &SU : DAG->SUnits)
3579 FUS.calcCriticalResources(*SU.getInstr());
3581 FuncUnitOrder(FUS);
3582
3583 for (SUnit &SU : DAG->SUnits)
3584 FuncUnitOrder.push(SU.getInstr());
3585
3587 Resources.push_back(
3588 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3589
3590 while (!FuncUnitOrder.empty()) {
3591 MachineInstr *MI = FuncUnitOrder.top();
3592 FuncUnitOrder.pop();
3593 if (TII->isZeroCost(MI->getOpcode()))
3594 continue;
3595
3596 // Attempt to reserve the instruction in an existing DFA. At least one
3597 // DFA is needed for each cycle.
3598 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3599 unsigned ReservedCycles = 0;
3600 auto *RI = Resources.begin();
3601 auto *RE = Resources.end();
3602 LLVM_DEBUG({
3603 dbgs() << "Trying to reserve resource for " << NumCycles
3604 << " cycles for \n";
3605 MI->dump();
3606 });
3607 for (unsigned C = 0; C < NumCycles; ++C)
3608 while (RI != RE) {
3609 if ((*RI)->canReserveResources(*MI)) {
3610 (*RI)->reserveResources(*MI);
3611 ++ReservedCycles;
3612 break;
3613 }
3614 RI++;
3615 }
3616 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3617 << ", NumCycles:" << NumCycles << "\n");
3618 // Add new DFAs, if needed, to reserve resources.
3619 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3621 << "NewResource created to reserve resources"
3622 << "\n");
3623 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3624 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3625 NewResource->reserveResources(*MI);
3626 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3627 }
3628 }
3629
3630 int Resmii = Resources.size();
3631 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3632 return Resmii;
3633}
3634
3636 if (UseDFA)
3637 return calculateResMIIDFA();
3638
3639 // Count each resource consumption and divide it by the number of units.
3640 // ResMII is the max value among them.
3641
3642 int NumMops = 0;
3644 for (SUnit &SU : DAG->SUnits) {
3645 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3646 continue;
3647
3648 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3649 if (!SCDesc->isValid())
3650 continue;
3651
3652 LLVM_DEBUG({
3653 if (SwpDebugResource) {
3654 DAG->dumpNode(SU);
3655 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3656 << " WriteProcRes: ";
3657 }
3658 });
3659 NumMops += SCDesc->NumMicroOps;
3660 for (const MCWriteProcResEntry &PRE :
3661 make_range(STI->getWriteProcResBegin(SCDesc),
3662 STI->getWriteProcResEnd(SCDesc))) {
3663 LLVM_DEBUG({
3664 if (SwpDebugResource) {
3665 const MCProcResourceDesc *Desc =
3666 SM.getProcResource(PRE.ProcResourceIdx);
3667 dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
3668 }
3669 });
3670 ResourceCount[PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
3671 }
3672 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3673 }
3674
3675 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3676 LLVM_DEBUG({
3677 if (SwpDebugResource)
3678 dbgs() << "#Mops: " << NumMops << ", "
3679 << "IssueWidth: " << IssueWidth << ", "
3680 << "Cycles: " << Result << "\n";
3681 });
3682
3683 LLVM_DEBUG({
3684 if (SwpDebugResource) {
3685 std::stringstream SS;
3686 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3687 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3688 << "\n";
3689 dbgs() << SS.str();
3690 }
3691 });
3692 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3694 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3695 LLVM_DEBUG({
3696 if (SwpDebugResource) {
3697 std::stringstream SS;
3698 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3699 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3700 << std::setw(10) << Cycles << "\n";
3701 dbgs() << SS.str();
3702 }
3703 });
3704 if (Cycles > Result)
3705 Result = Cycles;
3706 }
3707 return Result;
3708}
3709
3711 InitiationInterval = II;
3712 DFAResources.clear();
3713 DFAResources.resize(II);
3714 for (auto &I : DFAResources)
3715 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3716 MRT.clear();
3718 NumScheduledMops.clear();
3719 NumScheduledMops.resize(II);
3720}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const LLT S1
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:529
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
bool End
Definition: ELF_riscv.cpp:480
SmallVector< uint32_t, 0 > Writes
Definition: ELF_riscv.cpp:497
Rewrite Partial Register Uses
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static cl::opt< int > RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, cl::init(5), cl::desc("Margin representing the unused percentage of " "the register pressure limit"))
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static void computeScheduledInsts(const SwingSchedulerDAG *SSD, SMSchedule &Schedule, std::vector< MachineInstr * > &OrderedInsts, DenseMap< MachineInstr *, unsigned > &Stages)
Create an instruction stream that represents a single iteration and stage of each instruction.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static unsigned getLoopPhiReg(const MachineInstr &Phi, const MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static cl::opt< int > SwpIISearchRange("pipeliner-ii-search-range", cl::desc("Range to search for II"), cl::Hidden, cl::init(10))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
static cl::opt< bool > LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), cl::desc("Limit register pressure of scheduled loop"))
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:220
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:350
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:359
bool hasValue() const
TypeSize getValue() const
BlockT * getHeader() const
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:600
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:1067
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1434
Tracking metadata reference owned by Metadata.
Definition: Metadata.h:889
A single uniqued string.
Definition: Metadata.h:720
StringRef getString() const
Definition: Metadata.cpp:610
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
instr_iterator instr_end()
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:543
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:777
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:898
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:733
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
void clear()
Definition: MapVector.h:88
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
unsigned getRegPressureSetLimit(unsigned Idx) const
Get the register unit limit for the given pressure set index.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
std::deque< SUnit * > reorderInstructions(const SwingSchedulerDAG *SSD, const std::deque< SUnit * > &Instrs) const
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts) const
Order the instructions within a cycle so that the definitions occur before the uses.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO) const
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const
Return true if the scheduled Phi has a loop carried operand.
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:266
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:366
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:431
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:273
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:344
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:280
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an edge to be removed from the specified node N from ...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:560
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
typename vector_type::const_iterator iterator
Definition: SetVector.h:69
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getInstrBaseReg(SUnit *SU) const
Return the new base register that was stored away for the changed instruction.
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:2004
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2082
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2068
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1738
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:428
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1656
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1963
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleInfo.h:24
cl::opt< bool > SwpEnableCopyToPhi
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Description of the encoding of one expression Op.
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
bool isValid() const
Definition: MCSchedule.h:136
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:360
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:349
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:334
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:353
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.