LLVM 17.0.0git
MachinePipeliner.cpp
Go to the documentation of this file.
1//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10//
11// This SMS implementation is a target-independent back-end pass. When enabled,
12// the pass runs just prior to the register allocation pass, while the machine
13// IR is in SSA form. If software pipelining is successful, then the original
14// loop is replaced by the optimized loop. The optimized loop contains one or
15// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16// the instructions cannot be scheduled in a given MII, we increase the MII by
17// one and try again.
18//
19// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20// represent loop carried dependences in the DAG as order edges to the Phi
21// nodes. We also perform several passes over the DAG to eliminate unnecessary
22// edges that inhibit the ability to pipeline. The implementation uses the
23// DFAPacketizer class to compute the minimum initiation interval and the check
24// where an instruction may be inserted in the pipelined schedule.
25//
26// In order for the SMS pass to work, several target specific hooks need to be
27// implemented to get information about the loop structure and to rewrite
28// instructions.
29//
30//===----------------------------------------------------------------------===//
31
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/DenseMap.h"
36#include "llvm/ADT/MapVector.h"
39#include "llvm/ADT/SetVector.h"
41#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
69#include "llvm/Config/llvm-config.h"
70#include "llvm/IR/Attributes.h"
71#include "llvm/IR/Function.h"
72#include "llvm/MC/LaneBitmask.h"
73#include "llvm/MC/MCInstrDesc.h"
76#include "llvm/Pass.h"
79#include "llvm/Support/Debug.h"
82#include <algorithm>
83#include <cassert>
84#include <climits>
85#include <cstdint>
86#include <deque>
87#include <functional>
88#include <iomanip>
89#include <iterator>
90#include <map>
91#include <memory>
92#include <sstream>
93#include <tuple>
94#include <utility>
95#include <vector>
96
97using namespace llvm;
98
99#define DEBUG_TYPE "pipeliner"
100
101STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
102STATISTIC(NumPipelined, "Number of loops software pipelined");
103STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
104STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
105STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
106STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
107STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
108STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
109STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
110STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
111STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
112
113/// A command line option to turn software pipelining on or off.
114static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
115 cl::desc("Enable Software Pipelining"));
116
117/// A command line option to enable SWP at -Os.
118static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
119 cl::desc("Enable SWP at Os."), cl::Hidden,
120 cl::init(false));
121
122/// A command line argument to limit minimum initial interval for pipelining.
123static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
124 cl::desc("Size limit for the MII."),
125 cl::Hidden, cl::init(27));
126
127/// A command line argument to force pipeliner to use specified initial
128/// interval.
129static cl::opt<int> SwpForceII("pipeliner-force-ii",
130 cl::desc("Force pipeliner to use specified II."),
131 cl::Hidden, cl::init(-1));
132
133/// A command line argument to limit the number of stages in the pipeline.
134static cl::opt<int>
135 SwpMaxStages("pipeliner-max-stages",
136 cl::desc("Maximum stages allowed in the generated scheduled."),
137 cl::Hidden, cl::init(3));
138
139/// A command line option to disable the pruning of chain dependences due to
140/// an unrelated Phi.
141static cl::opt<bool>
142 SwpPruneDeps("pipeliner-prune-deps",
143 cl::desc("Prune dependences between unrelated Phi nodes."),
144 cl::Hidden, cl::init(true));
145
146/// A command line option to disable the pruning of loop carried order
147/// dependences.
148static cl::opt<bool>
149 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
150 cl::desc("Prune loop carried order dependences."),
151 cl::Hidden, cl::init(true));
152
153#ifndef NDEBUG
154static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
155#endif
156
157static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
159 cl::desc("Ignore RecMII"));
160
161static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
162 cl::init(false));
163static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
164 cl::init(false));
165
167 "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
168 cl::desc("Instead of emitting the pipelined code, annotate instructions "
169 "with the generated schedule for feeding into the "
170 "-modulo-schedule-test pass"));
171
173 "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
174 cl::desc(
175 "Use the experimental peeling code generator for software pipelining"));
176
177namespace llvm {
178
179// A command line option to enable the CopyToPhi DAG mutation.
181 cl::init(true),
182 cl::desc("Enable CopyToPhi DAG Mutation"));
183
184/// A command line argument to force pipeliner to use specified issue
185/// width.
187 "pipeliner-force-issue-width",
188 cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
189 cl::init(-1));
190
191} // end namespace llvm
192
193unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
194char MachinePipeliner::ID = 0;
195#ifndef NDEBUG
197#endif
199
201 "Modulo Software Pipelining", false, false)
207 "Modulo Software Pipelining", false, false)
208
209/// The "main" function for implementing Swing Modulo Scheduling.
210bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
211 if (skipFunction(mf.getFunction()))
212 return false;
213
214 if (!EnableSWP)
215 return false;
216
217 if (mf.getFunction().getAttributes().hasFnAttr(Attribute::OptimizeForSize) &&
219 return false;
220
221 if (!mf.getSubtarget().enableMachinePipeliner())
222 return false;
223
224 // Cannot pipeline loops without instruction itineraries if we are using
225 // DFA for the pipeliner.
226 if (mf.getSubtarget().useDFAforSMS() &&
227 (!mf.getSubtarget().getInstrItineraryData() ||
228 mf.getSubtarget().getInstrItineraryData()->isEmpty()))
229 return false;
230
231 MF = &mf;
232 MLI = &getAnalysis<MachineLoopInfo>();
233 MDT = &getAnalysis<MachineDominatorTree>();
234 ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
235 TII = MF->getSubtarget().getInstrInfo();
236 RegClassInfo.runOnMachineFunction(*MF);
237
238 for (const auto &L : *MLI)
239 scheduleLoop(*L);
240
241 return false;
242}
243
244/// Attempt to perform the SMS algorithm on the specified loop. This function is
245/// the main entry point for the algorithm. The function identifies candidate
246/// loops, calculates the minimum initiation interval, and attempts to schedule
247/// the loop.
248bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
249 bool Changed = false;
250 for (const auto &InnerLoop : L)
251 Changed |= scheduleLoop(*InnerLoop);
252
253#ifndef NDEBUG
254 // Stop trying after reaching the limit (if any).
255 int Limit = SwpLoopLimit;
256 if (Limit >= 0) {
257 if (NumTries >= SwpLoopLimit)
258 return Changed;
259 NumTries++;
260 }
261#endif
262
263 setPragmaPipelineOptions(L);
264 if (!canPipelineLoop(L)) {
265 LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
266 ORE->emit([&]() {
267 return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
268 L.getStartLoc(), L.getHeader())
269 << "Failed to pipeline loop";
270 });
271
272 LI.LoopPipelinerInfo.reset();
273 return Changed;
274 }
275
276 ++NumTrytoPipeline;
277
278 Changed = swingModuloScheduler(L);
279
280 LI.LoopPipelinerInfo.reset();
281 return Changed;
282}
283
284void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
285 // Reset the pragma for the next loop in iteration.
286 disabledByPragma = false;
287 II_setByPragma = 0;
288
289 MachineBasicBlock *LBLK = L.getTopBlock();
290
291 if (LBLK == nullptr)
292 return;
293
294 const BasicBlock *BBLK = LBLK->getBasicBlock();
295 if (BBLK == nullptr)
296 return;
297
298 const Instruction *TI = BBLK->getTerminator();
299 if (TI == nullptr)
300 return;
301
302 MDNode *LoopID = TI->getMetadata(LLVMContext::MD_loop);
303 if (LoopID == nullptr)
304 return;
305
306 assert(LoopID->getNumOperands() > 0 && "requires atleast one operand");
307 assert(LoopID->getOperand(0) == LoopID && "invalid loop");
308
309 for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
310 MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
311
312 if (MD == nullptr)
313 continue;
314
315 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
316
317 if (S == nullptr)
318 continue;
319
320 if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
321 assert(MD->getNumOperands() == 2 &&
322 "Pipeline initiation interval hint metadata should have two operands.");
324 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
325 assert(II_setByPragma >= 1 && "Pipeline initiation interval must be positive.");
326 } else if (S->getString() == "llvm.loop.pipeline.disable") {
327 disabledByPragma = true;
328 }
329 }
330}
331
332/// Return true if the loop can be software pipelined. The algorithm is
333/// restricted to loops with a single basic block. Make sure that the
334/// branch in the loop can be analyzed.
335bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
336 if (L.getNumBlocks() != 1) {
337 ORE->emit([&]() {
338 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
339 L.getStartLoc(), L.getHeader())
340 << "Not a single basic block: "
341 << ore::NV("NumBlocks", L.getNumBlocks());
342 });
343 return false;
344 }
345
346 if (disabledByPragma) {
347 ORE->emit([&]() {
348 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
349 L.getStartLoc(), L.getHeader())
350 << "Disabled by Pragma.";
351 });
352 return false;
353 }
354
355 // Check if the branch can't be understood because we can't do pipelining
356 // if that's the case.
357 LI.TBB = nullptr;
358 LI.FBB = nullptr;
359 LI.BrCond.clear();
360 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
361 LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
362 NumFailBranch++;
363 ORE->emit([&]() {
364 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
365 L.getStartLoc(), L.getHeader())
366 << "The branch can't be understood";
367 });
368 return false;
369 }
370
371 LI.LoopInductionVar = nullptr;
372 LI.LoopCompare = nullptr;
374 if (!LI.LoopPipelinerInfo) {
375 LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
376 NumFailLoop++;
377 ORE->emit([&]() {
378 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
379 L.getStartLoc(), L.getHeader())
380 << "The loop structure is not supported";
381 });
382 return false;
383 }
384
385 if (!L.getLoopPreheader()) {
386 LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
387 NumFailPreheader++;
388 ORE->emit([&]() {
389 return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
390 L.getStartLoc(), L.getHeader())
391 << "No loop preheader found";
392 });
393 return false;
394 }
395
396 // Remove any subregisters from inputs to phi nodes.
397 preprocessPhiNodes(*L.getHeader());
398 return true;
399}
400
401void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
403 SlotIndexes &Slots = *getAnalysis<LiveIntervals>().getSlotIndexes();
404
405 for (MachineInstr &PI : B.phis()) {
406 MachineOperand &DefOp = PI.getOperand(0);
407 assert(DefOp.getSubReg() == 0);
408 auto *RC = MRI.getRegClass(DefOp.getReg());
409
410 for (unsigned i = 1, n = PI.getNumOperands(); i != n; i += 2) {
411 MachineOperand &RegOp = PI.getOperand(i);
412 if (RegOp.getSubReg() == 0)
413 continue;
414
415 // If the operand uses a subregister, replace it with a new register
416 // without subregisters, and generate a copy to the new register.
417 Register NewReg = MRI.createVirtualRegister(RC);
418 MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
420 const DebugLoc &DL = PredB.findDebugLoc(At);
421 auto Copy = BuildMI(PredB, At, DL, TII->get(TargetOpcode::COPY), NewReg)
422 .addReg(RegOp.getReg(), getRegState(RegOp),
423 RegOp.getSubReg());
424 Slots.insertMachineInstrInMaps(*Copy);
425 RegOp.setReg(NewReg);
426 RegOp.setSubReg(0);
427 }
428 }
429}
430
431/// The SMS algorithm consists of the following main steps:
432/// 1. Computation and analysis of the dependence graph.
433/// 2. Ordering of the nodes (instructions).
434/// 3. Attempt to Schedule the loop.
435bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
436 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
437
438 SwingSchedulerDAG SMS(*this, L, getAnalysis<LiveIntervals>(), RegClassInfo,
440
442 // The kernel should not include any terminator instructions. These
443 // will be added back later.
444 SMS.startBlock(MBB);
445
446 // Compute the number of 'real' instructions in the basic block by
447 // ignoring terminators.
448 unsigned size = MBB->size();
450 E = MBB->instr_end();
451 I != E; ++I, --size)
452 ;
453
454 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
455 SMS.schedule();
456 SMS.exitRegion();
457
458 SMS.finishBlock();
459 return SMS.hasNewSchedule();
460}
461
470}
471
472void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
473 if (SwpForceII > 0)
474 MII = SwpForceII;
475 else if (II_setByPragma > 0)
476 MII = II_setByPragma;
477 else
478 MII = std::max(ResMII, RecMII);
479}
480
481void SwingSchedulerDAG::setMAX_II() {
482 if (SwpForceII > 0)
483 MAX_II = SwpForceII;
484 else if (II_setByPragma > 0)
485 MAX_II = II_setByPragma;
486 else
487 MAX_II = MII + 10;
488}
489
490/// We override the schedule function in ScheduleDAGInstrs to implement the
491/// scheduling part of the Swing Modulo Scheduling algorithm.
493 AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
494 buildSchedGraph(AA);
495 addLoopCarriedDependences(AA);
496 updatePhiDependences();
498 changeDependences();
499 postprocessDAG();
500 LLVM_DEBUG(dump());
501
502 NodeSetType NodeSets;
503 findCircuits(NodeSets);
504 NodeSetType Circuits = NodeSets;
505
506 // Calculate the MII.
507 unsigned ResMII = calculateResMII();
508 unsigned RecMII = calculateRecMII(NodeSets);
509
510 fuseRecs(NodeSets);
511
512 // This flag is used for testing and can cause correctness problems.
513 if (SwpIgnoreRecMII)
514 RecMII = 0;
515
516 setMII(ResMII, RecMII);
517 setMAX_II();
518
519 LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
520 << " (rec=" << RecMII << ", res=" << ResMII << ")\n");
521
522 // Can't schedule a loop without a valid MII.
523 if (MII == 0) {
524 LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
525 NumFailZeroMII++;
526 Pass.ORE->emit([&]() {
528 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
529 << "Invalid Minimal Initiation Interval: 0";
530 });
531 return;
532 }
533
534 // Don't pipeline large loops.
535 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii) {
536 LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
537 << ", we don't pipeline large loops\n");
538 NumFailLargeMaxMII++;
539 Pass.ORE->emit([&]() {
541 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
542 << "Minimal Initiation Interval too large: "
543 << ore::NV("MII", (int)MII) << " > "
544 << ore::NV("SwpMaxMii", SwpMaxMii) << "."
545 << "Refer to -pipeliner-max-mii.";
546 });
547 return;
548 }
549
550 computeNodeFunctions(NodeSets);
551
552 registerPressureFilter(NodeSets);
553
554 colocateNodeSets(NodeSets);
555
556 checkNodeSets(NodeSets);
557
558 LLVM_DEBUG({
559 for (auto &I : NodeSets) {
560 dbgs() << " Rec NodeSet ";
561 I.dump();
562 }
563 });
564
565 llvm::stable_sort(NodeSets, std::greater<NodeSet>());
566
567 groupRemainingNodes(NodeSets);
568
569 removeDuplicateNodes(NodeSets);
570
571 LLVM_DEBUG({
572 for (auto &I : NodeSets) {
573 dbgs() << " NodeSet ";
574 I.dump();
575 }
576 });
577
578 computeNodeOrder(NodeSets);
579
580 // check for node order issues
581 checkValidNodeOrder(Circuits);
582
583 SMSchedule Schedule(Pass.MF, this);
584 Scheduled = schedulePipeline(Schedule);
585
586 if (!Scheduled){
587 LLVM_DEBUG(dbgs() << "No schedule found, return\n");
588 NumFailNoSchedule++;
589 Pass.ORE->emit([&]() {
591 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
592 << "Unable to find schedule";
593 });
594 return;
595 }
596
597 unsigned numStages = Schedule.getMaxStageCount();
598 // No need to generate pipeline if there are no overlapped iterations.
599 if (numStages == 0) {
600 LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
601 NumFailZeroStage++;
602 Pass.ORE->emit([&]() {
604 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
605 << "No need to pipeline - no overlapped iterations in schedule.";
606 });
607 return;
608 }
609 // Check that the maximum stage count is less than user-defined limit.
610 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages) {
611 LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
612 << " : too many stages, abort\n");
613 NumFailLargeMaxStage++;
614 Pass.ORE->emit([&]() {
616 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
617 << "Too many stages in schedule: "
618 << ore::NV("numStages", (int)numStages) << " > "
619 << ore::NV("SwpMaxStages", SwpMaxStages)
620 << ". Refer to -pipeliner-max-stages.";
621 });
622 return;
623 }
624
625 Pass.ORE->emit([&]() {
627 Loop.getHeader())
628 << "Pipelined succesfully!";
629 });
630
631 // Generate the schedule as a ModuloSchedule.
632 DenseMap<MachineInstr *, int> Cycles, Stages;
633 std::vector<MachineInstr *> OrderedInsts;
634 for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
635 ++Cycle) {
636 for (SUnit *SU : Schedule.getInstructions(Cycle)) {
637 OrderedInsts.push_back(SU->getInstr());
638 Cycles[SU->getInstr()] = Cycle;
639 Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
640 }
641 }
643 for (auto &KV : NewMIs) {
644 Cycles[KV.first] = Cycles[KV.second];
645 Stages[KV.first] = Stages[KV.second];
646 NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
647 }
648
649 ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
650 std::move(Stages));
652 assert(NewInstrChanges.empty() &&
653 "Cannot serialize a schedule with InstrChanges!");
655 MSTI.annotate();
656 return;
657 }
658 // The experimental code generator can't work if there are InstChanges.
659 if (ExperimentalCodeGen && NewInstrChanges.empty()) {
660 PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
661 MSE.expand();
662 } else {
663 ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
664 MSE.expand();
665 MSE.cleanup();
666 }
667 ++NumPipelined;
668}
669
670/// Clean up after the software pipeliner runs.
672 for (auto &KV : NewMIs)
673 MF.deleteMachineInstr(KV.second);
674 NewMIs.clear();
675
676 // Call the superclass.
678}
679
680/// Return the register values for the operands of a Phi instruction.
681/// This function assume the instruction is a Phi.
683 unsigned &InitVal, unsigned &LoopVal) {
684 assert(Phi.isPHI() && "Expecting a Phi.");
685
686 InitVal = 0;
687 LoopVal = 0;
688 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
689 if (Phi.getOperand(i + 1).getMBB() != Loop)
690 InitVal = Phi.getOperand(i).getReg();
691 else
692 LoopVal = Phi.getOperand(i).getReg();
693
694 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
695}
696
697/// Return the Phi register value that comes the loop block.
698static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
699 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
700 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
701 return Phi.getOperand(i).getReg();
702 return 0;
703}
704
705/// Return true if SUb can be reached from SUa following the chain edges.
706static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
709 Worklist.push_back(SUa);
710 while (!Worklist.empty()) {
711 const SUnit *SU = Worklist.pop_back_val();
712 for (const auto &SI : SU->Succs) {
713 SUnit *SuccSU = SI.getSUnit();
714 if (SI.getKind() == SDep::Order) {
715 if (Visited.count(SuccSU))
716 continue;
717 if (SuccSU == SUb)
718 return true;
719 Worklist.push_back(SuccSU);
720 Visited.insert(SuccSU);
721 }
722 }
723 }
724 return false;
725}
726
727/// Return true if the instruction causes a chain between memory
728/// references before and after it.
730 return MI.isCall() || MI.mayRaiseFPException() ||
731 MI.hasUnmodeledSideEffects() ||
732 (MI.hasOrderedMemoryRef() &&
733 (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
734}
735
736/// Return the underlying objects for the memory references of an instruction.
737/// This function calls the code in ValueTracking, but first checks that the
738/// instruction has a memory operand.
741 if (!MI->hasOneMemOperand())
742 return;
743 MachineMemOperand *MM = *MI->memoperands_begin();
744 if (!MM->getValue())
745 return;
746 getUnderlyingObjects(MM->getValue(), Objs);
747 for (const Value *V : Objs) {
748 if (!isIdentifiedObject(V)) {
749 Objs.clear();
750 return;
751 }
752 Objs.push_back(V);
753 }
754}
755
756/// Add a chain edge between a load and store if the store can be an
757/// alias of the load on a subsequent iteration, i.e., a loop carried
758/// dependence. This code is very similar to the code in ScheduleDAGInstrs
759/// but that code doesn't create loop carried dependences.
760void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
764 for (auto &SU : SUnits) {
765 MachineInstr &MI = *SU.getInstr();
767 PendingLoads.clear();
768 else if (MI.mayLoad()) {
771 if (Objs.empty())
773 for (const auto *V : Objs) {
774 SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
775 SUs.push_back(&SU);
776 }
777 } else if (MI.mayStore()) {
780 if (Objs.empty())
782 for (const auto *V : Objs) {
784 PendingLoads.find(V);
785 if (I == PendingLoads.end())
786 continue;
787 for (auto *Load : I->second) {
788 if (isSuccOrder(Load, &SU))
789 continue;
790 MachineInstr &LdMI = *Load->getInstr();
791 // First, perform the cheaper check that compares the base register.
792 // If they are the same and the load offset is less than the store
793 // offset, then mark the dependence as loop carried potentially.
794 const MachineOperand *BaseOp1, *BaseOp2;
795 int64_t Offset1, Offset2;
796 bool Offset1IsScalable, Offset2IsScalable;
797 if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
798 Offset1IsScalable, TRI) &&
799 TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
800 Offset2IsScalable, TRI)) {
801 if (BaseOp1->isIdenticalTo(*BaseOp2) &&
802 Offset1IsScalable == Offset2IsScalable &&
803 (int)Offset1 < (int)Offset2) {
805 "What happened to the chain edge?");
806 SDep Dep(Load, SDep::Barrier);
807 Dep.setLatency(1);
808 SU.addPred(Dep);
809 continue;
810 }
811 }
812 // Second, the more expensive check that uses alias analysis on the
813 // base registers. If they alias, and the load offset is less than
814 // the store offset, the mark the dependence as loop carried.
815 if (!AA) {
816 SDep Dep(Load, SDep::Barrier);
817 Dep.setLatency(1);
818 SU.addPred(Dep);
819 continue;
820 }
821 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
822 MachineMemOperand *MMO2 = *MI.memoperands_begin();
823 if (!MMO1->getValue() || !MMO2->getValue()) {
824 SDep Dep(Load, SDep::Barrier);
825 Dep.setLatency(1);
826 SU.addPred(Dep);
827 continue;
828 }
829 if (MMO1->getValue() == MMO2->getValue() &&
830 MMO1->getOffset() <= MMO2->getOffset()) {
831 SDep Dep(Load, SDep::Barrier);
832 Dep.setLatency(1);
833 SU.addPred(Dep);
834 continue;
835 }
836 if (!AA->isNoAlias(
839 MMO2->getAAInfo()))) {
840 SDep Dep(Load, SDep::Barrier);
841 Dep.setLatency(1);
842 SU.addPred(Dep);
843 }
844 }
845 }
846 }
847 }
848}
849
850/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
851/// processes dependences for PHIs. This function adds true dependences
852/// from a PHI to a use, and a loop carried dependence from the use to the
853/// PHI. The loop carried dependence is represented as an anti dependence
854/// edge. This function also removes chain dependences between unrelated
855/// PHIs.
856void SwingSchedulerDAG::updatePhiDependences() {
857 SmallVector<SDep, 4> RemoveDeps;
859
860 // Iterate over each DAG node.
861 for (SUnit &I : SUnits) {
862 RemoveDeps.clear();
863 // Set to true if the instruction has an operand defined by a Phi.
864 unsigned HasPhiUse = 0;
865 unsigned HasPhiDef = 0;
866 MachineInstr *MI = I.getInstr();
867 // Iterate over each operand, and we process the definitions.
868 for (const MachineOperand &MO : MI->operands()) {
869 if (!MO.isReg())
870 continue;
871 Register Reg = MO.getReg();
872 if (MO.isDef()) {
873 // If the register is used by a Phi, then create an anti dependence.
875 UI = MRI.use_instr_begin(Reg),
876 UE = MRI.use_instr_end();
877 UI != UE; ++UI) {
878 MachineInstr *UseMI = &*UI;
879 SUnit *SU = getSUnit(UseMI);
880 if (SU != nullptr && UseMI->isPHI()) {
881 if (!MI->isPHI()) {
882 SDep Dep(SU, SDep::Anti, Reg);
883 Dep.setLatency(1);
884 I.addPred(Dep);
885 } else {
886 HasPhiDef = Reg;
887 // Add a chain edge to a dependent Phi that isn't an existing
888 // predecessor.
889 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
890 I.addPred(SDep(SU, SDep::Barrier));
891 }
892 }
893 }
894 } else if (MO.isUse()) {
895 // If the register is defined by a Phi, then create a true dependence.
897 if (DefMI == nullptr)
898 continue;
899 SUnit *SU = getSUnit(DefMI);
900 if (SU != nullptr && DefMI->isPHI()) {
901 if (!MI->isPHI()) {
902 SDep Dep(SU, SDep::Data, Reg);
903 Dep.setLatency(0);
904 ST.adjustSchedDependency(SU, 0, &I, MO.getOperandNo(), Dep);
905 I.addPred(Dep);
906 } else {
907 HasPhiUse = Reg;
908 // Add a chain edge to a dependent Phi that isn't an existing
909 // predecessor.
910 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
911 I.addPred(SDep(SU, SDep::Barrier));
912 }
913 }
914 }
915 }
916 // Remove order dependences from an unrelated Phi.
917 if (!SwpPruneDeps)
918 continue;
919 for (auto &PI : I.Preds) {
920 MachineInstr *PMI = PI.getSUnit()->getInstr();
921 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
922 if (I.getInstr()->isPHI()) {
923 if (PMI->getOperand(0).getReg() == HasPhiUse)
924 continue;
925 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
926 continue;
927 }
928 RemoveDeps.push_back(PI);
929 }
930 }
931 for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
932 I.removePred(RemoveDeps[i]);
933 }
934}
935
936/// Iterate over each DAG node and see if we can change any dependences
937/// in order to reduce the recurrence MII.
938void SwingSchedulerDAG::changeDependences() {
939 // See if an instruction can use a value from the previous iteration.
940 // If so, we update the base and offset of the instruction and change
941 // the dependences.
942 for (SUnit &I : SUnits) {
943 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
944 int64_t NewOffset = 0;
945 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
946 NewOffset))
947 continue;
948
949 // Get the MI and SUnit for the instruction that defines the original base.
950 Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
952 if (!DefMI)
953 continue;
954 SUnit *DefSU = getSUnit(DefMI);
955 if (!DefSU)
956 continue;
957 // Get the MI and SUnit for the instruction that defins the new base.
958 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
959 if (!LastMI)
960 continue;
961 SUnit *LastSU = getSUnit(LastMI);
962 if (!LastSU)
963 continue;
964
965 if (Topo.IsReachable(&I, LastSU))
966 continue;
967
968 // Remove the dependence. The value now depends on a prior iteration.
970 for (const SDep &P : I.Preds)
971 if (P.getSUnit() == DefSU)
972 Deps.push_back(P);
973 for (int i = 0, e = Deps.size(); i != e; i++) {
974 Topo.RemovePred(&I, Deps[i].getSUnit());
975 I.removePred(Deps[i]);
976 }
977 // Remove the chain dependence between the instructions.
978 Deps.clear();
979 for (auto &P : LastSU->Preds)
980 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
981 Deps.push_back(P);
982 for (int i = 0, e = Deps.size(); i != e; i++) {
983 Topo.RemovePred(LastSU, Deps[i].getSUnit());
984 LastSU->removePred(Deps[i]);
985 }
986
987 // Add a dependence between the new instruction and the instruction
988 // that defines the new base.
989 SDep Dep(&I, SDep::Anti, NewBase);
990 Topo.AddPred(LastSU, &I);
991 LastSU->addPred(Dep);
992
993 // Remember the base and offset information so that we can update the
994 // instruction during code generation.
995 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
996 }
997}
998
999namespace {
1000
1001// FuncUnitSorter - Comparison operator used to sort instructions by
1002// the number of functional unit choices.
1003struct FuncUnitSorter {
1004 const InstrItineraryData *InstrItins;
1005 const MCSubtargetInfo *STI;
1007
1008 FuncUnitSorter(const TargetSubtargetInfo &TSI)
1009 : InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1010
1011 // Compute the number of functional unit alternatives needed
1012 // at each stage, and take the minimum value. We prioritize the
1013 // instructions by the least number of choices first.
1014 unsigned minFuncUnits(const MachineInstr *Inst,
1015 InstrStage::FuncUnits &F) const {
1016 unsigned SchedClass = Inst->getDesc().getSchedClass();
1017 unsigned min = UINT_MAX;
1018 if (InstrItins && !InstrItins->isEmpty()) {
1019 for (const InstrStage &IS :
1020 make_range(InstrItins->beginStage(SchedClass),
1021 InstrItins->endStage(SchedClass))) {
1022 InstrStage::FuncUnits funcUnits = IS.getUnits();
1023 unsigned numAlternatives = llvm::popcount(funcUnits);
1024 if (numAlternatives < min) {
1025 min = numAlternatives;
1026 F = funcUnits;
1027 }
1028 }
1029 return min;
1030 }
1031 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1032 const MCSchedClassDesc *SCDesc =
1033 STI->getSchedModel().getSchedClassDesc(SchedClass);
1034 if (!SCDesc->isValid())
1035 // No valid Schedule Class Desc for schedClass, should be
1036 // Pseudo/PostRAPseudo
1037 return min;
1038
1039 for (const MCWriteProcResEntry &PRE :
1040 make_range(STI->getWriteProcResBegin(SCDesc),
1041 STI->getWriteProcResEnd(SCDesc))) {
1042 if (!PRE.Cycles)
1043 continue;
1044 const MCProcResourceDesc *ProcResource =
1045 STI->getSchedModel().getProcResource(PRE.ProcResourceIdx);
1046 unsigned NumUnits = ProcResource->NumUnits;
1047 if (NumUnits < min) {
1048 min = NumUnits;
1049 F = PRE.ProcResourceIdx;
1050 }
1051 }
1052 return min;
1053 }
1054 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1055 }
1056
1057 // Compute the critical resources needed by the instruction. This
1058 // function records the functional units needed by instructions that
1059 // must use only one functional unit. We use this as a tie breaker
1060 // for computing the resource MII. The instrutions that require
1061 // the same, highly used, functional unit have high priority.
1062 void calcCriticalResources(MachineInstr &MI) {
1063 unsigned SchedClass = MI.getDesc().getSchedClass();
1064 if (InstrItins && !InstrItins->isEmpty()) {
1065 for (const InstrStage &IS :
1066 make_range(InstrItins->beginStage(SchedClass),
1067 InstrItins->endStage(SchedClass))) {
1068 InstrStage::FuncUnits FuncUnits = IS.getUnits();
1069 if (llvm::popcount(FuncUnits) == 1)
1070 Resources[FuncUnits]++;
1071 }
1072 return;
1073 }
1074 if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1075 const MCSchedClassDesc *SCDesc =
1076 STI->getSchedModel().getSchedClassDesc(SchedClass);
1077 if (!SCDesc->isValid())
1078 // No valid Schedule Class Desc for schedClass, should be
1079 // Pseudo/PostRAPseudo
1080 return;
1081
1082 for (const MCWriteProcResEntry &PRE :
1083 make_range(STI->getWriteProcResBegin(SCDesc),
1084 STI->getWriteProcResEnd(SCDesc))) {
1085 if (!PRE.Cycles)
1086 continue;
1087 Resources[PRE.ProcResourceIdx]++;
1088 }
1089 return;
1090 }
1091 llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1092 }
1093
1094 /// Return true if IS1 has less priority than IS2.
1095 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1096 InstrStage::FuncUnits F1 = 0, F2 = 0;
1097 unsigned MFUs1 = minFuncUnits(IS1, F1);
1098 unsigned MFUs2 = minFuncUnits(IS2, F2);
1099 if (MFUs1 == MFUs2)
1100 return Resources.lookup(F1) < Resources.lookup(F2);
1101 return MFUs1 > MFUs2;
1102 }
1103};
1104
1105} // end anonymous namespace
1106
1107/// Calculate the resource constrained minimum initiation interval for the
1108/// specified loop. We use the DFA to model the resources needed for
1109/// each instruction, and we ignore dependences. A different DFA is created
1110/// for each cycle that is required. When adding a new instruction, we attempt
1111/// to add it to each existing DFA, until a legal space is found. If the
1112/// instruction cannot be reserved in an existing DFA, we create a new one.
1113unsigned SwingSchedulerDAG::calculateResMII() {
1114 LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1116 return RM.calculateResMII();
1117}
1118
1119/// Calculate the recurrence-constrainted minimum initiation interval.
1120/// Iterate over each circuit. Compute the delay(c) and distance(c)
1121/// for each circuit. The II needs to satisfy the inequality
1122/// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1123/// II that satisfies the inequality, and the RecMII is the maximum
1124/// of those values.
1125unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1126 unsigned RecMII = 0;
1127
1128 for (NodeSet &Nodes : NodeSets) {
1129 if (Nodes.empty())
1130 continue;
1131
1132 unsigned Delay = Nodes.getLatency();
1133 unsigned Distance = 1;
1134
1135 // ii = ceil(delay / distance)
1136 unsigned CurMII = (Delay + Distance - 1) / Distance;
1137 Nodes.setRecMII(CurMII);
1138 if (CurMII > RecMII)
1139 RecMII = CurMII;
1140 }
1141
1142 return RecMII;
1143}
1144
1145/// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1146/// but we do this to find the circuits, and then change them back.
1147static void swapAntiDependences(std::vector<SUnit> &SUnits) {
1149 for (SUnit &SU : SUnits) {
1150 for (SDep &Pred : SU.Preds)
1151 if (Pred.getKind() == SDep::Anti)
1152 DepsAdded.push_back(std::make_pair(&SU, Pred));
1153 }
1154 for (std::pair<SUnit *, SDep> &P : DepsAdded) {
1155 // Remove this anti dependency and add one in the reverse direction.
1156 SUnit *SU = P.first;
1157 SDep &D = P.second;
1158 SUnit *TargetSU = D.getSUnit();
1159 unsigned Reg = D.getReg();
1160 unsigned Lat = D.getLatency();
1161 SU->removePred(D);
1162 SDep Dep(SU, SDep::Anti, Reg);
1163 Dep.setLatency(Lat);
1164 TargetSU->addPred(Dep);
1165 }
1166}
1167
1168/// Create the adjacency structure of the nodes in the graph.
1169void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1170 SwingSchedulerDAG *DAG) {
1171 BitVector Added(SUnits.size());
1172 DenseMap<int, int> OutputDeps;
1173 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1174 Added.reset();
1175 // Add any successor to the adjacency matrix and exclude duplicates.
1176 for (auto &SI : SUnits[i].Succs) {
1177 // Only create a back-edge on the first and last nodes of a dependence
1178 // chain. This records any chains and adds them later.
1179 if (SI.getKind() == SDep::Output) {
1180 int N = SI.getSUnit()->NodeNum;
1181 int BackEdge = i;
1182 auto Dep = OutputDeps.find(BackEdge);
1183 if (Dep != OutputDeps.end()) {
1184 BackEdge = Dep->second;
1185 OutputDeps.erase(Dep);
1186 }
1187 OutputDeps[N] = BackEdge;
1188 }
1189 // Do not process a boundary node, an artificial node.
1190 // A back-edge is processed only if it goes to a Phi.
1191 if (SI.getSUnit()->isBoundaryNode() || SI.isArtificial() ||
1192 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1193 continue;
1194 int N = SI.getSUnit()->NodeNum;
1195 if (!Added.test(N)) {
1196 AdjK[i].push_back(N);
1197 Added.set(N);
1198 }
1199 }
1200 // A chain edge between a store and a load is treated as a back-edge in the
1201 // adjacency matrix.
1202 for (auto &PI : SUnits[i].Preds) {
1203 if (!SUnits[i].getInstr()->mayStore() ||
1204 !DAG->isLoopCarriedDep(&SUnits[i], PI, false))
1205 continue;
1206 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1207 int N = PI.getSUnit()->NodeNum;
1208 if (!Added.test(N)) {
1209 AdjK[i].push_back(N);
1210 Added.set(N);
1211 }
1212 }
1213 }
1214 }
1215 // Add back-edges in the adjacency matrix for the output dependences.
1216 for (auto &OD : OutputDeps)
1217 if (!Added.test(OD.second)) {
1218 AdjK[OD.first].push_back(OD.second);
1219 Added.set(OD.second);
1220 }
1221}
1222
1223/// Identify an elementary circuit in the dependence graph starting at the
1224/// specified node.
1225bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1226 bool HasBackedge) {
1227 SUnit *SV = &SUnits[V];
1228 bool F = false;
1229 Stack.insert(SV);
1230 Blocked.set(V);
1231
1232 for (auto W : AdjK[V]) {
1233 if (NumPaths > MaxPaths)
1234 break;
1235 if (W < S)
1236 continue;
1237 if (W == S) {
1238 if (!HasBackedge)
1239 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1240 F = true;
1241 ++NumPaths;
1242 break;
1243 } else if (!Blocked.test(W)) {
1244 if (circuit(W, S, NodeSets,
1245 Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
1246 F = true;
1247 }
1248 }
1249
1250 if (F)
1251 unblock(V);
1252 else {
1253 for (auto W : AdjK[V]) {
1254 if (W < S)
1255 continue;
1256 B[W].insert(SV);
1257 }
1258 }
1259 Stack.pop_back();
1260 return F;
1261}
1262
1263/// Unblock a node in the circuit finding algorithm.
1264void SwingSchedulerDAG::Circuits::unblock(int U) {
1265 Blocked.reset(U);
1266 SmallPtrSet<SUnit *, 4> &BU = B[U];
1267 while (!BU.empty()) {
1269 assert(SI != BU.end() && "Invalid B set.");
1270 SUnit *W = *SI;
1271 BU.erase(W);
1272 if (Blocked.test(W->NodeNum))
1273 unblock(W->NodeNum);
1274 }
1275}
1276
1277/// Identify all the elementary circuits in the dependence graph using
1278/// Johnson's circuit algorithm.
1279void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1280 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1281 // but we do this to find the circuits, and then change them back.
1283
1284 Circuits Cir(SUnits, Topo);
1285 // Create the adjacency structure.
1286 Cir.createAdjacencyStructure(this);
1287 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1288 Cir.reset();
1289 Cir.circuit(i, i, NodeSets);
1290 }
1291
1292 // Change the dependences back so that we've created a DAG again.
1294}
1295
1296// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1297// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1298// additional copies that are needed across iterations. An artificial dependence
1299// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1300
1301// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1302// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1303// PHI-------True-Dep------> USEOfPhi
1304
1305// The mutation creates
1306// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1307
1308// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1309// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1310// late to avoid additional copies across iterations. The possible scheduling
1311// order would be
1312// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1313
1314void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1315 for (SUnit &SU : DAG->SUnits) {
1316 // Find the COPY/REG_SEQUENCE instruction.
1317 if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1318 continue;
1319
1320 // Record the loop carried PHIs.
1322 // Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
1324
1325 for (auto &Dep : SU.Preds) {
1326 SUnit *TmpSU = Dep.getSUnit();
1327 MachineInstr *TmpMI = TmpSU->getInstr();
1328 SDep::Kind DepKind = Dep.getKind();
1329 // Save the loop carried PHI.
1330 if (DepKind == SDep::Anti && TmpMI->isPHI())
1331 PHISUs.push_back(TmpSU);
1332 // Save the source of COPY/REG_SEQUENCE.
1333 // If the source has no pre-decessors, we will end up creating cycles.
1334 else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > 0)
1335 SrcSUs.push_back(TmpSU);
1336 }
1337
1338 if (PHISUs.size() == 0 || SrcSUs.size() == 0)
1339 continue;
1340
1341 // Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
1342 // SUnit to the container.
1344 // Do not use iterator based loop here as we are updating the container.
1345 for (size_t Index = 0; Index < PHISUs.size(); ++Index) {
1346 for (auto &Dep : PHISUs[Index]->Succs) {
1347 if (Dep.getKind() != SDep::Data)
1348 continue;
1349
1350 SUnit *TmpSU = Dep.getSUnit();
1351 MachineInstr *TmpMI = TmpSU->getInstr();
1352 if (TmpMI->isPHI() || TmpMI->isRegSequence()) {
1353 PHISUs.push_back(TmpSU);
1354 continue;
1355 }
1356 UseSUs.push_back(TmpSU);
1357 }
1358 }
1359
1360 if (UseSUs.size() == 0)
1361 continue;
1362
1363 SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
1364 // Add the artificial dependencies if it does not form a cycle.
1365 for (auto *I : UseSUs) {
1366 for (auto *Src : SrcSUs) {
1367 if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
1368 Src->addPred(SDep(I, SDep::Artificial));
1369 SDAG->Topo.AddPred(Src, I);
1370 }
1371 }
1372 }
1373 }
1374}
1375
1376/// Return true for DAG nodes that we ignore when computing the cost functions.
1377/// We ignore the back-edge recurrence in order to avoid unbounded recursion
1378/// in the calculation of the ASAP, ALAP, etc functions.
1379static bool ignoreDependence(const SDep &D, bool isPred) {
1380 if (D.isArtificial() || D.getSUnit()->isBoundaryNode())
1381 return true;
1382 return D.getKind() == SDep::Anti && isPred;
1383}
1384
1385/// Compute several functions need to order the nodes for scheduling.
1386/// ASAP - Earliest time to schedule a node.
1387/// ALAP - Latest time to schedule a node.
1388/// MOV - Mobility function, difference between ALAP and ASAP.
1389/// D - Depth of each node.
1390/// H - Height of each node.
1391void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1392 ScheduleInfo.resize(SUnits.size());
1393
1394 LLVM_DEBUG({
1395 for (int I : Topo) {
1396 const SUnit &SU = SUnits[I];
1397 dumpNode(SU);
1398 }
1399 });
1400
1401 int maxASAP = 0;
1402 // Compute ASAP and ZeroLatencyDepth.
1403 for (int I : Topo) {
1404 int asap = 0;
1405 int zeroLatencyDepth = 0;
1406 SUnit *SU = &SUnits[I];
1407 for (const SDep &P : SU->Preds) {
1408 SUnit *pred = P.getSUnit();
1409 if (P.getLatency() == 0)
1410 zeroLatencyDepth =
1411 std::max(zeroLatencyDepth, getZeroLatencyDepth(pred) + 1);
1412 if (ignoreDependence(P, true))
1413 continue;
1414 asap = std::max(asap, (int)(getASAP(pred) + P.getLatency() -
1415 getDistance(pred, SU, P) * MII));
1416 }
1417 maxASAP = std::max(maxASAP, asap);
1418 ScheduleInfo[I].ASAP = asap;
1419 ScheduleInfo[I].ZeroLatencyDepth = zeroLatencyDepth;
1420 }
1421
1422 // Compute ALAP, ZeroLatencyHeight, and MOV.
1423 for (int I : llvm::reverse(Topo)) {
1424 int alap = maxASAP;
1425 int zeroLatencyHeight = 0;
1426 SUnit *SU = &SUnits[I];
1427 for (const SDep &S : SU->Succs) {
1428 SUnit *succ = S.getSUnit();
1429 if (succ->isBoundaryNode())
1430 continue;
1431 if (S.getLatency() == 0)
1432 zeroLatencyHeight =
1433 std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1);
1434 if (ignoreDependence(S, true))
1435 continue;
1436 alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() +
1437 getDistance(SU, succ, S) * MII));
1438 }
1439
1440 ScheduleInfo[I].ALAP = alap;
1441 ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight;
1442 }
1443
1444 // After computing the node functions, compute the summary for each node set.
1445 for (NodeSet &I : NodeSets)
1446 I.computeNodeSetInfo(this);
1447
1448 LLVM_DEBUG({
1449 for (unsigned i = 0; i < SUnits.size(); i++) {
1450 dbgs() << "\tNode " << i << ":\n";
1451 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1452 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1453 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1454 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1455 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1456 dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
1457 dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
1458 }
1459 });
1460}
1461
1462/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1463/// as the predecessors of the elements of NodeOrder that are not also in
1464/// NodeOrder.
1467 const NodeSet *S = nullptr) {
1468 Preds.clear();
1469 for (const SUnit *SU : NodeOrder) {
1470 for (const SDep &Pred : SU->Preds) {
1471 if (S && S->count(Pred.getSUnit()) == 0)
1472 continue;
1473 if (ignoreDependence(Pred, true))
1474 continue;
1475 if (NodeOrder.count(Pred.getSUnit()) == 0)
1476 Preds.insert(Pred.getSUnit());
1477 }
1478 // Back-edges are predecessors with an anti-dependence.
1479 for (const SDep &Succ : SU->Succs) {
1480 if (Succ.getKind() != SDep::Anti)
1481 continue;
1482 if (S && S->count(Succ.getSUnit()) == 0)
1483 continue;
1484 if (NodeOrder.count(Succ.getSUnit()) == 0)
1485 Preds.insert(Succ.getSUnit());
1486 }
1487 }
1488 return !Preds.empty();
1489}
1490
1491/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1492/// as the successors of the elements of NodeOrder that are not also in
1493/// NodeOrder.
1496 const NodeSet *S = nullptr) {
1497 Succs.clear();
1498 for (const SUnit *SU : NodeOrder) {
1499 for (const SDep &Succ : SU->Succs) {
1500 if (S && S->count(Succ.getSUnit()) == 0)
1501 continue;
1502 if (ignoreDependence(Succ, false))
1503 continue;
1504 if (NodeOrder.count(Succ.getSUnit()) == 0)
1505 Succs.insert(Succ.getSUnit());
1506 }
1507 for (const SDep &Pred : SU->Preds) {
1508 if (Pred.getKind() != SDep::Anti)
1509 continue;
1510 if (S && S->count(Pred.getSUnit()) == 0)
1511 continue;
1512 if (NodeOrder.count(Pred.getSUnit()) == 0)
1513 Succs.insert(Pred.getSUnit());
1514 }
1515 }
1516 return !Succs.empty();
1517}
1518
1519/// Return true if there is a path from the specified node to any of the nodes
1520/// in DestNodes. Keep track and return the nodes in any path.
1521static bool computePath(SUnit *Cur, SetVector<SUnit *> &Path,
1522 SetVector<SUnit *> &DestNodes,
1523 SetVector<SUnit *> &Exclude,
1524 SmallPtrSet<SUnit *, 8> &Visited) {
1525 if (Cur->isBoundaryNode())
1526 return false;
1527 if (Exclude.contains(Cur))
1528 return false;
1529 if (DestNodes.contains(Cur))
1530 return true;
1531 if (!Visited.insert(Cur).second)
1532 return Path.contains(Cur);
1533 bool FoundPath = false;
1534 for (auto &SI : Cur->Succs)
1535 if (!ignoreDependence(SI, false))
1536 FoundPath |=
1537 computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1538 for (auto &PI : Cur->Preds)
1539 if (PI.getKind() == SDep::Anti)
1540 FoundPath |=
1541 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1542 if (FoundPath)
1543 Path.insert(Cur);
1544 return FoundPath;
1545}
1546
1547/// Compute the live-out registers for the instructions in a node-set.
1548/// The live-out registers are those that are defined in the node-set,
1549/// but not used. Except for use operands of Phis.
1551 NodeSet &NS) {
1556 for (SUnit *SU : NS) {
1557 const MachineInstr *MI = SU->getInstr();
1558 if (MI->isPHI())
1559 continue;
1560 for (const MachineOperand &MO : MI->operands())
1561 if (MO.isReg() && MO.isUse()) {
1562 Register Reg = MO.getReg();
1563 if (Reg.isVirtual())
1564 Uses.insert(Reg);
1565 else if (MRI.isAllocatable(Reg))
1566 for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid();
1567 ++Units)
1568 Uses.insert(*Units);
1569 }
1570 }
1571 for (SUnit *SU : NS)
1572 for (const MachineOperand &MO : SU->getInstr()->operands())
1573 if (MO.isReg() && MO.isDef() && !MO.isDead()) {
1574 Register Reg = MO.getReg();
1575 if (Reg.isVirtual()) {
1576 if (!Uses.count(Reg))
1577 LiveOutRegs.push_back(RegisterMaskPair(Reg,
1579 } else if (MRI.isAllocatable(Reg)) {
1580 for (MCRegUnitIterator Units(Reg.asMCReg(), TRI); Units.isValid();
1581 ++Units)
1582 if (!Uses.count(*Units))
1583 LiveOutRegs.push_back(RegisterMaskPair(*Units,
1585 }
1586 }
1587 RPTracker.addLiveRegs(LiveOutRegs);
1588}
1589
1590/// A heuristic to filter nodes in recurrent node-sets if the register
1591/// pressure of a set is too high.
1592void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1593 for (auto &NS : NodeSets) {
1594 // Skip small node-sets since they won't cause register pressure problems.
1595 if (NS.size() <= 2)
1596 continue;
1597 IntervalPressure RecRegPressure;
1598 RegPressureTracker RecRPTracker(RecRegPressure);
1599 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1600 computeLiveOuts(MF, RecRPTracker, NS);
1601 RecRPTracker.closeBottom();
1602
1603 std::vector<SUnit *> SUnits(NS.begin(), NS.end());
1604 llvm::sort(SUnits, [](const SUnit *A, const SUnit *B) {
1605 return A->NodeNum > B->NodeNum;
1606 });
1607
1608 for (auto &SU : SUnits) {
1609 // Since we're computing the register pressure for a subset of the
1610 // instructions in a block, we need to set the tracker for each
1611 // instruction in the node-set. The tracker is set to the instruction
1612 // just after the one we're interested in.
1614 RecRPTracker.setPos(std::next(CurInstI));
1615
1616 RegPressureDelta RPDelta;
1617 ArrayRef<PressureChange> CriticalPSets;
1618 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1619 CriticalPSets,
1620 RecRegPressure.MaxSetPressure);
1621 if (RPDelta.Excess.isValid()) {
1622 LLVM_DEBUG(
1623 dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1625 << ":" << RPDelta.Excess.getUnitInc() << "\n");
1626 NS.setExceedPressure(SU);
1627 break;
1628 }
1629 RecRPTracker.recede();
1630 }
1631 }
1632}
1633
1634/// A heuristic to colocate node sets that have the same set of
1635/// successors.
1636void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1637 unsigned Colocate = 0;
1638 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1639 NodeSet &N1 = NodeSets[i];
1641 if (N1.empty() || !succ_L(N1, S1))
1642 continue;
1643 for (int j = i + 1; j < e; ++j) {
1644 NodeSet &N2 = NodeSets[j];
1645 if (N1.compareRecMII(N2) != 0)
1646 continue;
1648 if (N2.empty() || !succ_L(N2, S2))
1649 continue;
1650 if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
1651 N1.setColocate(++Colocate);
1652 N2.setColocate(Colocate);
1653 break;
1654 }
1655 }
1656 }
1657}
1658
1659/// Check if the existing node-sets are profitable. If not, then ignore the
1660/// recurrent node-sets, and attempt to schedule all nodes together. This is
1661/// a heuristic. If the MII is large and all the recurrent node-sets are small,
1662/// then it's best to try to schedule all instructions together instead of
1663/// starting with the recurrent node-sets.
1664void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1665 // Look for loops with a large MII.
1666 if (MII < 17)
1667 return;
1668 // Check if the node-set contains only a simple add recurrence.
1669 for (auto &NS : NodeSets) {
1670 if (NS.getRecMII() > 2)
1671 return;
1672 if (NS.getMaxDepth() > MII)
1673 return;
1674 }
1675 NodeSets.clear();
1676 LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
1677}
1678
1679/// Add the nodes that do not belong to a recurrence set into groups
1680/// based upon connected components.
1681void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
1682 SetVector<SUnit *> NodesAdded;
1684 // Add the nodes that are on a path between the previous node sets and
1685 // the current node set.
1686 for (NodeSet &I : NodeSets) {
1688 // Add the nodes from the current node set to the previous node set.
1689 if (succ_L(I, N)) {
1691 for (SUnit *NI : N) {
1692 Visited.clear();
1693 computePath(NI, Path, NodesAdded, I, Visited);
1694 }
1695 if (!Path.empty())
1696 I.insert(Path.begin(), Path.end());
1697 }
1698 // Add the nodes from the previous node set to the current node set.
1699 N.clear();
1700 if (succ_L(NodesAdded, N)) {
1702 for (SUnit *NI : N) {
1703 Visited.clear();
1704 computePath(NI, Path, I, NodesAdded, Visited);
1705 }
1706 if (!Path.empty())
1707 I.insert(Path.begin(), Path.end());
1708 }
1709 NodesAdded.insert(I.begin(), I.end());
1710 }
1711
1712 // Create a new node set with the connected nodes of any successor of a node
1713 // in a recurrent set.
1714 NodeSet NewSet;
1716 if (succ_L(NodesAdded, N))
1717 for (SUnit *I : N)
1718 addConnectedNodes(I, NewSet, NodesAdded);
1719 if (!NewSet.empty())
1720 NodeSets.push_back(NewSet);
1721
1722 // Create a new node set with the connected nodes of any predecessor of a node
1723 // in a recurrent set.
1724 NewSet.clear();
1725 if (pred_L(NodesAdded, N))
1726 for (SUnit *I : N)
1727 addConnectedNodes(I, NewSet, NodesAdded);
1728 if (!NewSet.empty())
1729 NodeSets.push_back(NewSet);
1730
1731 // Create new nodes sets with the connected nodes any remaining node that
1732 // has no predecessor.
1733 for (SUnit &SU : SUnits) {
1734 if (NodesAdded.count(&SU) == 0) {
1735 NewSet.clear();
1736 addConnectedNodes(&SU, NewSet, NodesAdded);
1737 if (!NewSet.empty())
1738 NodeSets.push_back(NewSet);
1739 }
1740 }
1741}
1742
1743/// Add the node to the set, and add all of its connected nodes to the set.
1744void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1745 SetVector<SUnit *> &NodesAdded) {
1746 NewSet.insert(SU);
1747 NodesAdded.insert(SU);
1748 for (auto &SI : SU->Succs) {
1749 SUnit *Successor = SI.getSUnit();
1750 if (!SI.isArtificial() && !Successor->isBoundaryNode() &&
1751 NodesAdded.count(Successor) == 0)
1752 addConnectedNodes(Successor, NewSet, NodesAdded);
1753 }
1754 for (auto &PI : SU->Preds) {
1755 SUnit *Predecessor = PI.getSUnit();
1756 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1757 addConnectedNodes(Predecessor, NewSet, NodesAdded);
1758 }
1759}
1760
1761/// Return true if Set1 contains elements in Set2. The elements in common
1762/// are returned in a different container.
1763static bool isIntersect(SmallSetVector<SUnit *, 8> &Set1, const NodeSet &Set2,
1765 Result.clear();
1766 for (SUnit *SU : Set1) {
1767 if (Set2.count(SU) != 0)
1768 Result.insert(SU);
1769 }
1770 return !Result.empty();
1771}
1772
1773/// Merge the recurrence node sets that have the same initial node.
1774void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1775 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1776 ++I) {
1777 NodeSet &NI = *I;
1778 for (NodeSetType::iterator J = I + 1; J != E;) {
1779 NodeSet &NJ = *J;
1780 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1781 if (NJ.compareRecMII(NI) > 0)
1782 NI.setRecMII(NJ.getRecMII());
1783 for (SUnit *SU : *J)
1784 I->insert(SU);
1785 NodeSets.erase(J);
1786 E = NodeSets.end();
1787 } else {
1788 ++J;
1789 }
1790 }
1791 }
1792}
1793
1794/// Remove nodes that have been scheduled in previous NodeSets.
1795void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1796 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1797 ++I)
1798 for (NodeSetType::iterator J = I + 1; J != E;) {
1799 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1800
1801 if (J->empty()) {
1802 NodeSets.erase(J);
1803 E = NodeSets.end();
1804 } else {
1805 ++J;
1806 }
1807 }
1808}
1809
1810/// Compute an ordered list of the dependence graph nodes, which
1811/// indicates the order that the nodes will be scheduled. This is a
1812/// two-level algorithm. First, a partial order is created, which
1813/// consists of a list of sets ordered from highest to lowest priority.
1814void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
1816 NodeOrder.clear();
1817
1818 for (auto &Nodes : NodeSets) {
1819 LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
1820 OrderKind Order;
1822 if (pred_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
1823 R.insert(N.begin(), N.end());
1824 Order = BottomUp;
1825 LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
1826 } else if (succ_L(NodeOrder, N) && llvm::set_is_subset(N, Nodes)) {
1827 R.insert(N.begin(), N.end());
1828 Order = TopDown;
1829 LLVM_DEBUG(dbgs() << " Top down (succs) ");
1830 } else if (isIntersect(N, Nodes, R)) {
1831 // If some of the successors are in the existing node-set, then use the
1832 // top-down ordering.
1833 Order = TopDown;
1834 LLVM_DEBUG(dbgs() << " Top down (intersect) ");
1835 } else if (NodeSets.size() == 1) {
1836 for (const auto &N : Nodes)
1837 if (N->Succs.size() == 0)
1838 R.insert(N);
1839 Order = BottomUp;
1840 LLVM_DEBUG(dbgs() << " Bottom up (all) ");
1841 } else {
1842 // Find the node with the highest ASAP.
1843 SUnit *maxASAP = nullptr;
1844 for (SUnit *SU : Nodes) {
1845 if (maxASAP == nullptr || getASAP(SU) > getASAP(maxASAP) ||
1846 (getASAP(SU) == getASAP(maxASAP) && SU->NodeNum > maxASAP->NodeNum))
1847 maxASAP = SU;
1848 }
1849 R.insert(maxASAP);
1850 Order = BottomUp;
1851 LLVM_DEBUG(dbgs() << " Bottom up (default) ");
1852 }
1853
1854 while (!R.empty()) {
1855 if (Order == TopDown) {
1856 // Choose the node with the maximum height. If more than one, choose
1857 // the node wiTH the maximum ZeroLatencyHeight. If still more than one,
1858 // choose the node with the lowest MOV.
1859 while (!R.empty()) {
1860 SUnit *maxHeight = nullptr;
1861 for (SUnit *I : R) {
1862 if (maxHeight == nullptr || getHeight(I) > getHeight(maxHeight))
1863 maxHeight = I;
1864 else if (getHeight(I) == getHeight(maxHeight) &&
1866 maxHeight = I;
1867 else if (getHeight(I) == getHeight(maxHeight) &&
1869 getZeroLatencyHeight(maxHeight) &&
1870 getMOV(I) < getMOV(maxHeight))
1871 maxHeight = I;
1872 }
1873 NodeOrder.insert(maxHeight);
1874 LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
1875 R.remove(maxHeight);
1876 for (const auto &I : maxHeight->Succs) {
1877 if (Nodes.count(I.getSUnit()) == 0)
1878 continue;
1879 if (NodeOrder.contains(I.getSUnit()))
1880 continue;
1881 if (ignoreDependence(I, false))
1882 continue;
1883 R.insert(I.getSUnit());
1884 }
1885 // Back-edges are predecessors with an anti-dependence.
1886 for (const auto &I : maxHeight->Preds) {
1887 if (I.getKind() != SDep::Anti)
1888 continue;
1889 if (Nodes.count(I.getSUnit()) == 0)
1890 continue;
1891 if (NodeOrder.contains(I.getSUnit()))
1892 continue;
1893 R.insert(I.getSUnit());
1894 }
1895 }
1896 Order = BottomUp;
1897 LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
1899 if (pred_L(NodeOrder, N, &Nodes))
1900 R.insert(N.begin(), N.end());
1901 } else {
1902 // Choose the node with the maximum depth. If more than one, choose
1903 // the node with the maximum ZeroLatencyDepth. If still more than one,
1904 // choose the node with the lowest MOV.
1905 while (!R.empty()) {
1906 SUnit *maxDepth = nullptr;
1907 for (SUnit *I : R) {
1908 if (maxDepth == nullptr || getDepth(I) > getDepth(maxDepth))
1909 maxDepth = I;
1910 else if (getDepth(I) == getDepth(maxDepth) &&
1912 maxDepth = I;
1913 else if (getDepth(I) == getDepth(maxDepth) &&
1915 getMOV(I) < getMOV(maxDepth))
1916 maxDepth = I;
1917 }
1918 NodeOrder.insert(maxDepth);
1919 LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
1920 R.remove(maxDepth);
1921 if (Nodes.isExceedSU(maxDepth)) {
1922 Order = TopDown;
1923 R.clear();
1924 R.insert(Nodes.getNode(0));
1925 break;
1926 }
1927 for (const auto &I : maxDepth->Preds) {
1928 if (Nodes.count(I.getSUnit()) == 0)
1929 continue;
1930 if (NodeOrder.contains(I.getSUnit()))
1931 continue;
1932 R.insert(I.getSUnit());
1933 }
1934 // Back-edges are predecessors with an anti-dependence.
1935 for (const auto &I : maxDepth->Succs) {
1936 if (I.getKind() != SDep::Anti)
1937 continue;
1938 if (Nodes.count(I.getSUnit()) == 0)
1939 continue;
1940 if (NodeOrder.contains(I.getSUnit()))
1941 continue;
1942 R.insert(I.getSUnit());
1943 }
1944 }
1945 Order = TopDown;
1946 LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
1948 if (succ_L(NodeOrder, N, &Nodes))
1949 R.insert(N.begin(), N.end());
1950 }
1951 }
1952 LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
1953 }
1954
1955 LLVM_DEBUG({
1956 dbgs() << "Node order: ";
1957 for (SUnit *I : NodeOrder)
1958 dbgs() << " " << I->NodeNum << " ";
1959 dbgs() << "\n";
1960 });
1961}
1962
1963/// Process the nodes in the computed order and create the pipelined schedule
1964/// of the instructions, if possible. Return true if a schedule is found.
1965bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
1966
1967 if (NodeOrder.empty()){
1968 LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
1969 return false;
1970 }
1971
1972 bool scheduleFound = false;
1973 // Keep increasing II until a valid schedule is found.
1974 for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
1975 Schedule.reset();
1976 Schedule.setInitiationInterval(II);
1977 LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
1978
1979 SetVector<SUnit *>::iterator NI = NodeOrder.begin();
1980 SetVector<SUnit *>::iterator NE = NodeOrder.end();
1981 do {
1982 SUnit *SU = *NI;
1983
1984 // Compute the schedule time for the instruction, which is based
1985 // upon the scheduled time for any predecessors/successors.
1986 int EarlyStart = INT_MIN;
1987 int LateStart = INT_MAX;
1988 // These values are set when the size of the schedule window is limited
1989 // due to chain dependences.
1990 int SchedEnd = INT_MAX;
1991 int SchedStart = INT_MIN;
1992 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
1993 II, this);
1994 LLVM_DEBUG({
1995 dbgs() << "\n";
1996 dbgs() << "Inst (" << SU->NodeNum << ") ";
1997 SU->getInstr()->dump();
1998 dbgs() << "\n";
1999 });
2000 LLVM_DEBUG({
2001 dbgs() << format("\tes: %8x ls: %8x me: %8x ms: %8x\n", EarlyStart,
2002 LateStart, SchedEnd, SchedStart);
2003 });
2004
2005 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2006 SchedStart > LateStart)
2007 scheduleFound = false;
2008 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2009 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2010 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2011 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2012 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2013 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
2014 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2015 SchedEnd =
2016 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2017 // When scheduling a Phi it is better to start at the late cycle and go
2018 // backwards. The default order may insert the Phi too far away from
2019 // its first dependence.
2020 if (SU->getInstr()->isPHI())
2021 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2022 else
2023 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
2024 } else {
2025 int FirstCycle = Schedule.getFirstCycle();
2026 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2027 FirstCycle + getASAP(SU) + II - 1, II);
2028 }
2029 // Even if we find a schedule, make sure the schedule doesn't exceed the
2030 // allowable number of stages. We keep trying if this happens.
2031 if (scheduleFound)
2032 if (SwpMaxStages > -1 &&
2033 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2034 scheduleFound = false;
2035
2036 LLVM_DEBUG({
2037 if (!scheduleFound)
2038 dbgs() << "\tCan't schedule\n";
2039 });
2040 } while (++NI != NE && scheduleFound);
2041
2042 // If a schedule is found, ensure non-pipelined instructions are in stage 0
2043 if (scheduleFound)
2044 scheduleFound =
2045 Schedule.normalizeNonPipelinedInstructions(this, LoopPipelinerInfo);
2046
2047 // If a schedule is found, check if it is a valid schedule too.
2048 if (scheduleFound)
2049 scheduleFound = Schedule.isValidSchedule(this);
2050 }
2051
2052 LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2053 << " (II=" << Schedule.getInitiationInterval()
2054 << ")\n");
2055
2056 if (scheduleFound) {
2057 scheduleFound = LoopPipelinerInfo->shouldUseSchedule(*this, Schedule);
2058 if (!scheduleFound)
2059 LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2060 }
2061
2062 if (scheduleFound) {
2063 Schedule.finalizeSchedule(this);
2064 Pass.ORE->emit([&]() {
2066 DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2067 << "Schedule found with Initiation Interval: "
2068 << ore::NV("II", Schedule.getInitiationInterval())
2069 << ", MaxStageCount: "
2070 << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
2071 });
2072 } else
2073 Schedule.reset();
2074
2075 return scheduleFound && Schedule.getMaxStageCount() > 0;
2076}
2077
2078/// Return true if we can compute the amount the instruction changes
2079/// during each iteration. Set Delta to the amount of the change.
2080bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
2082 const MachineOperand *BaseOp;
2083 int64_t Offset;
2084 bool OffsetIsScalable;
2085 if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2086 return false;
2087
2088 // FIXME: This algorithm assumes instructions have fixed-size offsets.
2089 if (OffsetIsScalable)
2090 return false;
2091
2092 if (!BaseOp->isReg())
2093 return false;
2094
2095 Register BaseReg = BaseOp->getReg();
2096
2098 // Check if there is a Phi. If so, get the definition in the loop.
2099 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
2100 if (BaseDef && BaseDef->isPHI()) {
2101 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
2102 BaseDef = MRI.getVRegDef(BaseReg);
2103 }
2104 if (!BaseDef)
2105 return false;
2106
2107 int D = 0;
2108 if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
2109 return false;
2110
2111 Delta = D;
2112 return true;
2113}
2114
2115/// Check if we can change the instruction to use an offset value from the
2116/// previous iteration. If so, return true and set the base and offset values
2117/// so that we can rewrite the load, if necessary.
2118/// v1 = Phi(v0, v3)
2119/// v2 = load v1, 0
2120/// v3 = post_store v1, 4, x
2121/// This function enables the load to be rewritten as v2 = load v3, 4.
2122bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2123 unsigned &BasePos,
2124 unsigned &OffsetPos,
2125 unsigned &NewBase,
2126 int64_t &Offset) {
2127 // Get the load instruction.
2128 if (TII->isPostIncrement(*MI))
2129 return false;
2130 unsigned BasePosLd, OffsetPosLd;
2131 if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
2132 return false;
2133 Register BaseReg = MI->getOperand(BasePosLd).getReg();
2134
2135 // Look for the Phi instruction.
2136 MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2137 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
2138 if (!Phi || !Phi->isPHI())
2139 return false;
2140 // Get the register defined in the loop block.
2141 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
2142 if (!PrevReg)
2143 return false;
2144
2145 // Check for the post-increment load/store instruction.
2146 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
2147 if (!PrevDef || PrevDef == MI)
2148 return false;
2149
2150 if (!TII->isPostIncrement(*PrevDef))
2151 return false;
2152
2153 unsigned BasePos1 = 0, OffsetPos1 = 0;
2154 if (!TII->getBaseAndOffsetPosition(*PrevDef, BasePos1, OffsetPos1))
2155 return false;
2156
2157 // Make sure that the instructions do not access the same memory location in
2158 // the next iteration.
2159 int64_t LoadOffset = MI->getOperand(OffsetPosLd).getImm();
2160 int64_t StoreOffset = PrevDef->getOperand(OffsetPos1).getImm();
2162 NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset);
2163 bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef);
2164 MF.deleteMachineInstr(NewMI);
2165 if (!Disjoint)
2166 return false;
2167
2168 // Set the return value once we determine that we return true.
2169 BasePos = BasePosLd;
2170 OffsetPos = OffsetPosLd;
2171 NewBase = PrevReg;
2172 Offset = StoreOffset;
2173 return true;
2174}
2175
2176/// Apply changes to the instruction if needed. The changes are need
2177/// to improve the scheduling and depend up on the final schedule.
2179 SMSchedule &Schedule) {
2180 SUnit *SU = getSUnit(MI);
2182 InstrChanges.find(SU);
2183 if (It != InstrChanges.end()) {
2184 std::pair<unsigned, int64_t> RegAndOffset = It->second;
2185 unsigned BasePos, OffsetPos;
2186 if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2187 return;
2188 Register BaseReg = MI->getOperand(BasePos).getReg();
2189 MachineInstr *LoopDef = findDefInLoop(BaseReg);
2190 int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
2191 int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
2192 int BaseStageNum = Schedule.stageScheduled(SU);
2193 int BaseCycleNum = Schedule.cycleScheduled(SU);
2194 if (BaseStageNum < DefStageNum) {
2196 int OffsetDiff = DefStageNum - BaseStageNum;
2197 if (DefCycleNum < BaseCycleNum) {
2198 NewMI->getOperand(BasePos).setReg(RegAndOffset.first);
2199 if (OffsetDiff > 0)
2200 --OffsetDiff;
2201 }
2202 int64_t NewOffset =
2203 MI->getOperand(OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2204 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2205 SU->setInstr(NewMI);
2206 MISUnitMap[NewMI] = SU;
2207 NewMIs[MI] = NewMI;
2208 }
2209 }
2210}
2211
2212/// Return the instruction in the loop that defines the register.
2213/// If the definition is a Phi, then follow the Phi operand to
2214/// the instruction in the loop.
2215MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
2217 MachineInstr *Def = MRI.getVRegDef(Reg);
2218 while (Def->isPHI()) {
2219 if (!Visited.insert(Def).second)
2220 break;
2221 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
2222 if (Def->getOperand(i + 1).getMBB() == BB) {
2223 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
2224 break;
2225 }
2226 }
2227 return Def;
2228}
2229
2230/// Return true for an order or output dependence that is loop carried
2231/// potentially. A dependence is loop carried if the destination defines a valu
2232/// that may be used or defined by the source in a subsequent iteration.
2234 bool isSucc) {
2235 if ((Dep.getKind() != SDep::Order && Dep.getKind() != SDep::Output) ||
2236 Dep.isArtificial() || Dep.getSUnit()->isBoundaryNode())
2237 return false;
2238
2240 return true;
2241
2242 if (Dep.getKind() == SDep::Output)
2243 return true;
2244
2245 MachineInstr *SI = Source->getInstr();
2246 MachineInstr *DI = Dep.getSUnit()->getInstr();
2247 if (!isSucc)
2248 std::swap(SI, DI);
2249 assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
2250
2251 // Assume ordered loads and stores may have a loop carried dependence.
2252 if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
2253 SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
2254 SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
2255 return true;
2256
2257 // Only chain dependences between a load and store can be loop carried.
2258 if (!DI->mayStore() || !SI->mayLoad())
2259 return false;
2260
2261 unsigned DeltaS, DeltaD;
2262 if (!computeDelta(*SI, DeltaS) || !computeDelta(*DI, DeltaD))
2263 return true;
2264
2265 const MachineOperand *BaseOpS, *BaseOpD;
2266 int64_t OffsetS, OffsetD;
2267 bool OffsetSIsScalable, OffsetDIsScalable;
2269 if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
2270 TRI) ||
2271 !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
2272 TRI))
2273 return true;
2274
2275 assert(!OffsetSIsScalable && !OffsetDIsScalable &&
2276 "Expected offsets to be byte offsets");
2277
2278 MachineInstr *DefS = MRI.getVRegDef(BaseOpS->getReg());
2279 MachineInstr *DefD = MRI.getVRegDef(BaseOpD->getReg());
2280 if (!DefS || !DefD || !DefS->isPHI() || !DefD->isPHI())
2281 return true;
2282
2283 unsigned InitValS = 0;
2284 unsigned LoopValS = 0;
2285 unsigned InitValD = 0;
2286 unsigned LoopValD = 0;
2287 getPhiRegs(*DefS, BB, InitValS, LoopValS);
2288 getPhiRegs(*DefD, BB, InitValD, LoopValD);
2289 MachineInstr *InitDefS = MRI.getVRegDef(InitValS);
2290 MachineInstr *InitDefD = MRI.getVRegDef(InitValD);
2291
2292 if (!InitDefS->isIdenticalTo(*InitDefD))
2293 return true;
2294
2295 // Check that the base register is incremented by a constant value for each
2296 // iteration.
2297 MachineInstr *LoopDefS = MRI.getVRegDef(LoopValS);
2298 int D = 0;
2299 if (!LoopDefS || !TII->getIncrementValue(*LoopDefS, D))
2300 return true;
2301
2302 uint64_t AccessSizeS = (*SI->memoperands_begin())->getSize();
2303 uint64_t AccessSizeD = (*DI->memoperands_begin())->getSize();
2304
2305 // This is the main test, which checks the offset values and the loop
2306 // increment value to determine if the accesses may be loop carried.
2307 if (AccessSizeS == MemoryLocation::UnknownSize ||
2308 AccessSizeD == MemoryLocation::UnknownSize)
2309 return true;
2310
2311 if (DeltaS != DeltaD || DeltaS < AccessSizeS || DeltaD < AccessSizeD)
2312 return true;
2313
2314 return (OffsetS + (int64_t)AccessSizeS < OffsetD + (int64_t)AccessSizeD);
2315}
2316
2317void SwingSchedulerDAG::postprocessDAG() {
2318 for (auto &M : Mutations)
2319 M->apply(this);
2320}
2321
2322/// Try to schedule the node at the specified StartCycle and continue
2323/// until the node is schedule or the EndCycle is reached. This function
2324/// returns true if the node is scheduled. This routine may search either
2325/// forward or backward for a place to insert the instruction based upon
2326/// the relative values of StartCycle and EndCycle.
2327bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
2328 bool forward = true;
2329 LLVM_DEBUG({
2330 dbgs() << "Trying to insert node between " << StartCycle << " and "
2331 << EndCycle << " II: " << II << "\n";
2332 });
2333 if (StartCycle > EndCycle)
2334 forward = false;
2335
2336 // The terminating condition depends on the direction.
2337 int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
2338 for (int curCycle = StartCycle; curCycle != termCycle;
2339 forward ? ++curCycle : --curCycle) {
2340
2341 if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
2342 ProcItinResources.canReserveResources(*SU, curCycle)) {
2343 LLVM_DEBUG({
2344 dbgs() << "\tinsert at cycle " << curCycle << " ";
2345 SU->getInstr()->dump();
2346 });
2347
2348 if (!ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()))
2349 ProcItinResources.reserveResources(*SU, curCycle);
2350 ScheduledInstrs[curCycle].push_back(SU);
2351 InstrToCycle.insert(std::make_pair(SU, curCycle));
2352 if (curCycle > LastCycle)
2353 LastCycle = curCycle;
2354 if (curCycle < FirstCycle)
2355 FirstCycle = curCycle;
2356 return true;
2357 }
2358 LLVM_DEBUG({
2359 dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
2360 SU->getInstr()->dump();
2361 });
2362 }
2363 return false;
2364}
2365
2366// Return the cycle of the earliest scheduled instruction in the chain.
2369 SmallVector<SDep, 8> Worklist;
2370 Worklist.push_back(Dep);
2371 int EarlyCycle = INT_MAX;
2372 while (!Worklist.empty()) {
2373 const SDep &Cur = Worklist.pop_back_val();
2374 SUnit *PrevSU = Cur.getSUnit();
2375 if (Visited.count(PrevSU))
2376 continue;
2377 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(PrevSU);
2378 if (it == InstrToCycle.end())
2379 continue;
2380 EarlyCycle = std::min(EarlyCycle, it->second);
2381 for (const auto &PI : PrevSU->Preds)
2382 if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
2383 Worklist.push_back(PI);
2384 Visited.insert(PrevSU);
2385 }
2386 return EarlyCycle;
2387}
2388
2389// Return the cycle of the latest scheduled instruction in the chain.
2392 SmallVector<SDep, 8> Worklist;
2393 Worklist.push_back(Dep);
2394 int LateCycle = INT_MIN;
2395 while (!Worklist.empty()) {
2396 const SDep &Cur = Worklist.pop_back_val();
2397 SUnit *SuccSU = Cur.getSUnit();
2398 if (Visited.count(SuccSU) || SuccSU->isBoundaryNode())
2399 continue;
2400 std::map<SUnit *, int>::const_iterator it = InstrToCycle.find(SuccSU);
2401 if (it == InstrToCycle.end())
2402 continue;
2403 LateCycle = std::max(LateCycle, it->second);
2404 for (const auto &SI : SuccSU->Succs)
2405 if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
2406 Worklist.push_back(SI);
2407 Visited.insert(SuccSU);
2408 }
2409 return LateCycle;
2410}
2411
2412/// If an instruction has a use that spans multiple iterations, then
2413/// return true. These instructions are characterized by having a back-ege
2414/// to a Phi, which contains a reference to another Phi.
2416 for (auto &P : SU->Preds)
2417 if (DAG->isBackedge(SU, P) && P.getSUnit()->getInstr()->isPHI())
2418 for (auto &S : P.getSUnit()->Succs)
2419 if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
2420 return P.getSUnit();
2421 return nullptr;
2422}
2423
2424/// Compute the scheduling start slot for the instruction. The start slot
2425/// depends on any predecessor or successor nodes scheduled already.
2426void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
2427 int *MinEnd, int *MaxStart, int II,
2428 SwingSchedulerDAG *DAG) {
2429 // Iterate over each instruction that has been scheduled already. The start
2430 // slot computation depends on whether the previously scheduled instruction
2431 // is a predecessor or successor of the specified instruction.
2432 for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
2433
2434 // Iterate over each instruction in the current cycle.
2435 for (SUnit *I : getInstructions(cycle)) {
2436 // Because we're processing a DAG for the dependences, we recognize
2437 // the back-edge in recurrences by anti dependences.
2438 for (unsigned i = 0, e = (unsigned)SU->Preds.size(); i != e; ++i) {
2439 const SDep &Dep = SU->Preds[i];
2440 if (Dep.getSUnit() == I) {
2441 if (!DAG->isBackedge(SU, Dep)) {
2442 int EarlyStart = cycle + Dep.getLatency() -
2443 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2444 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2445 if (DAG->isLoopCarriedDep(SU, Dep, false)) {
2446 int End = earliestCycleInChain(Dep) + (II - 1);
2447 *MinEnd = std::min(*MinEnd, End);
2448 }
2449 } else {
2450 int LateStart = cycle - Dep.getLatency() +
2451 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2452 *MinLateStart = std::min(*MinLateStart, LateStart);
2453 }
2454 }
2455 // For instruction that requires multiple iterations, make sure that
2456 // the dependent instruction is not scheduled past the definition.
2457 SUnit *BE = multipleIterations(I, DAG);
2458 if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
2459 !SU->isPred(I))
2460 *MinLateStart = std::min(*MinLateStart, cycle);
2461 }
2462 for (unsigned i = 0, e = (unsigned)SU->Succs.size(); i != e; ++i) {
2463 if (SU->Succs[i].getSUnit() == I) {
2464 const SDep &Dep = SU->Succs[i];
2465 if (!DAG->isBackedge(SU, Dep)) {
2466 int LateStart = cycle - Dep.getLatency() +
2467 DAG->getDistance(SU, Dep.getSUnit(), Dep) * II;
2468 *MinLateStart = std::min(*MinLateStart, LateStart);
2469 if (DAG->isLoopCarriedDep(SU, Dep)) {
2470 int Start = latestCycleInChain(Dep) + 1 - II;
2471 *MaxStart = std::max(*MaxStart, Start);
2472 }
2473 } else {
2474 int EarlyStart = cycle + Dep.getLatency() -
2475 DAG->getDistance(Dep.getSUnit(), SU, Dep) * II;
2476 *MaxEarlyStart = std::max(*MaxEarlyStart, EarlyStart);
2477 }
2478 }
2479 }
2480 }
2481 }
2482}
2483
2484/// Order the instructions within a cycle so that the definitions occur
2485/// before the uses. Returns true if the instruction is added to the start
2486/// of the list, or false if added to the end.
2488 std::deque<SUnit *> &Insts) {
2489 MachineInstr *MI = SU->getInstr();
2490 bool OrderBeforeUse = false;
2491 bool OrderAfterDef = false;
2492 bool OrderBeforeDef = false;
2493 unsigned MoveDef = 0;
2494 unsigned MoveUse = 0;
2495 int StageInst1 = stageScheduled(SU);
2496
2497 unsigned Pos = 0;
2498 for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
2499 ++I, ++Pos) {
2500 for (MachineOperand &MO : MI->operands()) {
2501 if (!MO.isReg() || !MO.getReg().isVirtual())
2502 continue;
2503
2504 Register Reg = MO.getReg();
2505 unsigned BasePos, OffsetPos;
2506 if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
2507 if (MI->getOperand(BasePos).getReg() == Reg)
2508 if (unsigned NewReg = SSD->getInstrBaseReg(SU))
2509 Reg = NewReg;
2510 bool Reads, Writes;
2511 std::tie(Reads, Writes) =
2512 (*I)->getInstr()->readsWritesVirtualRegister(Reg);
2513 if (MO.isDef() && Reads && stageScheduled(*I) <= StageInst1) {
2514 OrderBeforeUse = true;
2515 if (MoveUse == 0)
2516 MoveUse = Pos;
2517 } else if (MO.isDef() && Reads && stageScheduled(*I) > StageInst1) {
2518 // Add the instruction after the scheduled instruction.
2519 OrderAfterDef = true;
2520 MoveDef = Pos;
2521 } else if (MO.isUse() && Writes && stageScheduled(*I) == StageInst1) {
2522 if (cycleScheduled(*I) == cycleScheduled(SU) && !(*I)->isSucc(SU)) {
2523 OrderBeforeUse = true;
2524 if (MoveUse == 0)
2525 MoveUse = Pos;
2526 } else {
2527 OrderAfterDef = true;
2528 MoveDef = Pos;
2529 }
2530 } else if (MO.isUse() && Writes && stageScheduled(*I) > StageInst1) {
2531 OrderBeforeUse = true;
2532 if (MoveUse == 0)
2533 MoveUse = Pos;
2534 if (MoveUse != 0) {
2535 OrderAfterDef = true;
2536 MoveDef = Pos - 1;
2537 }
2538 } else if (MO.isUse() && Writes && stageScheduled(*I) < StageInst1) {
2539 // Add the instruction before the scheduled instruction.
2540 OrderBeforeUse = true;
2541 if (MoveUse == 0)
2542 MoveUse = Pos;
2543 } else if (MO.isUse() && stageScheduled(*I) == StageInst1 &&
2544 isLoopCarriedDefOfUse(SSD, (*I)->getInstr(), MO)) {
2545 if (MoveUse == 0) {
2546 OrderBeforeDef = true;
2547 MoveUse = Pos;
2548 }
2549 }
2550 }
2551 // Check for order dependences between instructions. Make sure the source
2552 // is ordered before the destination.
2553 for (auto &S : SU->Succs) {
2554 if (S.getSUnit() != *I)
2555 continue;
2556 if (S.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2557 OrderBeforeUse = true;
2558 if (Pos < MoveUse)
2559 MoveUse = Pos;
2560 }
2561 // We did not handle HW dependences in previous for loop,
2562 // and we normally set Latency = 0 for Anti deps,
2563 // so may have nodes in same cycle with Anti denpendent on HW regs.
2564 else if (S.getKind() == SDep::Anti && stageScheduled(*I) == StageInst1) {
2565 OrderBeforeUse = true;
2566 if ((MoveUse == 0) || (Pos < MoveUse))
2567 MoveUse = Pos;
2568 }
2569 }
2570 for (auto &P : SU->Preds) {
2571 if (P.getSUnit() != *I)
2572 continue;
2573 if (P.getKind() == SDep::Order && stageScheduled(*I) == StageInst1) {
2574 OrderAfterDef = true;
2575 MoveDef = Pos;
2576 }
2577 }
2578 }
2579
2580 // A circular dependence.
2581 if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
2582 OrderBeforeUse = false;
2583
2584 // OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
2585 // to a loop-carried dependence.
2586 if (OrderBeforeDef)
2587 OrderBeforeUse = !OrderAfterDef || (MoveUse > MoveDef);
2588
2589 // The uncommon case when the instruction order needs to be updated because
2590 // there is both a use and def.
2591 if (OrderBeforeUse && OrderAfterDef) {
2592 SUnit *UseSU = Insts.at(MoveUse);
2593 SUnit *DefSU = Insts.at(MoveDef);
2594 if (MoveUse > MoveDef) {
2595 Insts.erase(Insts.begin() + MoveUse);
2596 Insts.erase(Insts.begin() + MoveDef);
2597 } else {
2598 Insts.erase(Insts.begin() + MoveDef);
2599 Insts.erase(Insts.begin() + MoveUse);
2600 }
2601 orderDependence(SSD, UseSU, Insts);
2602 orderDependence(SSD, SU, Insts);
2603 orderDependence(SSD, DefSU, Insts);
2604 return;
2605 }
2606 // Put the new instruction first if there is a use in the list. Otherwise,
2607 // put it at the end of the list.
2608 if (OrderBeforeUse)
2609 Insts.push_front(SU);
2610 else
2611 Insts.push_back(SU);
2612}
2613
2614/// Return true if the scheduled Phi has a loop carried operand.
2616 if (!Phi.isPHI())
2617 return false;
2618 assert(Phi.isPHI() && "Expecting a Phi.");
2619 SUnit *DefSU = SSD->getSUnit(&Phi);
2620 unsigned DefCycle = cycleScheduled(DefSU);
2621 int DefStage = stageScheduled(DefSU);
2622
2623 unsigned InitVal = 0;
2624 unsigned LoopVal = 0;
2625 getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
2626 SUnit *UseSU = SSD->getSUnit(MRI.getVRegDef(LoopVal));
2627 if (!UseSU)
2628 return true;
2629 if (UseSU->getInstr()->isPHI())
2630 return true;
2631 unsigned LoopCycle = cycleScheduled(UseSU);
2632 int LoopStage = stageScheduled(UseSU);
2633 return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
2634}
2635
2636/// Return true if the instruction is a definition that is loop carried
2637/// and defines the use on the next iteration.
2638/// v1 = phi(v2, v3)
2639/// (Def) v3 = op v1
2640/// (MO) = v1
2641/// If MO appears before Def, then then v1 and v3 may get assigned to the same
2642/// register.
2644 MachineInstr *Def, MachineOperand &MO) {
2645 if (!MO.isReg())
2646 return false;
2647 if (Def->isPHI())
2648 return false;
2649 MachineInstr *Phi = MRI.getVRegDef(MO.getReg());
2650 if (!Phi || !Phi->isPHI() || Phi->getParent() != Def->getParent())
2651 return false;
2652 if (!isLoopCarried(SSD, *Phi))
2653 return false;
2654 unsigned LoopReg = getLoopPhiReg(*Phi, Phi->getParent());
2655 for (unsigned i = 0, e = Def->getNumOperands(); i != e; ++i) {
2656 MachineOperand &DMO = Def->getOperand(i);
2657 if (!DMO.isReg() || !DMO.isDef())
2658 continue;
2659 if (DMO.getReg() == LoopReg)
2660 return true;
2661 }
2662 return false;
2663}
2664
2665/// Determine transitive dependences of unpipelineable instructions
2668 SmallSet<SUnit *, 8> DoNotPipeline;
2669 SmallVector<SUnit *, 8> Worklist;
2670
2671 for (auto &SU : SSD->SUnits)
2672 if (SU.isInstr() && PLI->shouldIgnoreForPipelining(SU.getInstr()))
2673 Worklist.push_back(&SU);
2674
2675 while (!Worklist.empty()) {
2676 auto SU = Worklist.pop_back_val();
2677 if (DoNotPipeline.count(SU))
2678 continue;
2679 LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
2680 DoNotPipeline.insert(SU);
2681 for (auto &Dep : SU->Preds)
2682 Worklist.push_back(Dep.getSUnit());
2683 if (SU->getInstr()->isPHI())
2684 for (auto &Dep : SU->Succs)
2685 if (Dep.getKind() == SDep::Anti)
2686 Worklist.push_back(Dep.getSUnit());
2687 }
2688 return DoNotPipeline;
2689}
2690
2691// Determine all instructions upon which any unpipelineable instruction depends
2692// and ensure that they are in stage 0. If unable to do so, return false.
2696
2697 int NewLastCycle = INT_MIN;
2698 for (SUnit &SU : SSD->SUnits) {
2699 if (!SU.isInstr())
2700 continue;
2701 if (!DNP.contains(&SU) || stageScheduled(&SU) == 0) {
2702 NewLastCycle = std::max(NewLastCycle, InstrToCycle[&SU]);
2703 continue;
2704 }
2705
2706 // Put the non-pipelined instruction as early as possible in the schedule
2707 int NewCycle = getFirstCycle();
2708 for (auto &Dep : SU.Preds)
2709 NewCycle = std::max(InstrToCycle[Dep.getSUnit()], NewCycle);
2710
2711 int OldCycle = InstrToCycle[&SU];
2712 if (OldCycle != NewCycle) {
2713 InstrToCycle[&SU] = NewCycle;
2714 auto &OldS = getInstructions(OldCycle);
2715 llvm::erase_value(OldS, &SU);
2716 getInstructions(NewCycle).emplace_back(&SU);
2717 LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
2718 << ") is not pipelined; moving from cycle " << OldCycle
2719 << " to " << NewCycle << " Instr:" << *SU.getInstr());
2720 }
2721 NewLastCycle = std::max(NewLastCycle, NewCycle);
2722 }
2723 LastCycle = NewLastCycle;
2724 return true;
2725}
2726
2727// Check if the generated schedule is valid. This function checks if
2728// an instruction that uses a physical register is scheduled in a
2729// different stage than the definition. The pipeliner does not handle
2730// physical register values that may cross a basic block boundary.
2731// Furthermore, if a physical def/use pair is assigned to the same
2732// cycle, orderDependence does not guarantee def/use ordering, so that
2733// case should be considered invalid. (The test checks for both
2734// earlier and same-cycle use to be more robust.)
2736 for (SUnit &SU : SSD->SUnits) {
2737 if (!SU.hasPhysRegDefs)
2738 continue;
2739 int StageDef = stageScheduled(&SU);
2740 int CycleDef = InstrToCycle[&SU];
2741 assert(StageDef != -1 && "Instruction should have been scheduled.");
2742 for (auto &SI : SU.Succs)
2743 if (SI.isAssignedRegDep() && !SI.getSUnit()->isBoundaryNode())
2744 if (Register::isPhysicalRegister(SI.getReg())) {
2745 if (stageScheduled(SI.getSUnit()) != StageDef)
2746 return false;
2747 if (InstrToCycle[SI.getSUnit()] <= CycleDef)
2748 return false;
2749 }
2750 }
2751 return true;
2752}
2753
2754/// A property of the node order in swing-modulo-scheduling is
2755/// that for nodes outside circuits the following holds:
2756/// none of them is scheduled after both a successor and a
2757/// predecessor.
2758/// The method below checks whether the property is met.
2759/// If not, debug information is printed and statistics information updated.
2760/// Note that we do not use an assert statement.
2761/// The reason is that although an invalid node oder may prevent
2762/// the pipeliner from finding a pipelined schedule for arbitrary II,
2763/// it does not lead to the generation of incorrect code.
2764void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
2765
2766 // a sorted vector that maps each SUnit to its index in the NodeOrder
2767 typedef std::pair<SUnit *, unsigned> UnitIndex;
2768 std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(nullptr, 0));
2769
2770 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i)
2771 Indices.push_back(std::make_pair(NodeOrder[i], i));
2772
2773 auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
2774 return std::get<0>(i1) < std::get<0>(i2);
2775 };
2776
2777 // sort, so that we can perform a binary search
2778 llvm::sort(Indices, CompareKey);
2779
2780 bool Valid = true;
2781 (void)Valid;
2782 // for each SUnit in the NodeOrder, check whether
2783 // it appears after both a successor and a predecessor
2784 // of the SUnit. If this is the case, and the SUnit
2785 // is not part of circuit, then the NodeOrder is not
2786 // valid.
2787 for (unsigned i = 0, s = NodeOrder.size(); i < s; ++i) {
2788 SUnit *SU = NodeOrder[i];
2789 unsigned Index = i;
2790
2791 bool PredBefore = false;
2792 bool SuccBefore = false;
2793
2794 SUnit *Succ;
2795 SUnit *Pred;
2796 (void)Succ;
2797 (void)Pred;
2798
2799 for (SDep &PredEdge : SU->Preds) {
2800 SUnit *PredSU = PredEdge.getSUnit();
2801 unsigned PredIndex = std::get<1>(
2802 *llvm::lower_bound(Indices, std::make_pair(PredSU, 0), CompareKey));
2803 if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
2804 PredBefore = true;
2805 Pred = PredSU;
2806 break;
2807 }
2808 }
2809
2810 for (SDep &SuccEdge : SU->Succs) {
2811 SUnit *SuccSU = SuccEdge.getSUnit();
2812 // Do not process a boundary node, it was not included in NodeOrder,
2813 // hence not in Indices either, call to std::lower_bound() below will
2814 // return Indices.end().
2815 if (SuccSU->isBoundaryNode())
2816 continue;
2817 unsigned SuccIndex = std::get<1>(
2818 *llvm::lower_bound(Indices, std::make_pair(SuccSU, 0), CompareKey));
2819 if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
2820 SuccBefore = true;
2821 Succ = SuccSU;
2822 break;
2823 }
2824 }
2825
2826 if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
2827 // instructions in circuits are allowed to be scheduled
2828 // after both a successor and predecessor.
2829 bool InCircuit = llvm::any_of(
2830 Circuits, [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
2831 if (InCircuit)
2832 LLVM_DEBUG(dbgs() << "In a circuit, predecessor ";);
2833 else {
2834 Valid = false;
2835 NumNodeOrderIssues++;
2836 LLVM_DEBUG(dbgs() << "Predecessor ";);
2837 }
2838 LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
2839 << " are scheduled before node " << SU->NodeNum
2840 << "\n";);
2841 }
2842 }
2843
2844 LLVM_DEBUG({
2845 if (!Valid)
2846 dbgs() << "Invalid node order found!\n";
2847 });
2848}
2849
2850/// Attempt to fix the degenerate cases when the instruction serialization
2851/// causes the register lifetimes to overlap. For example,
2852/// p' = store_pi(p, b)
2853/// = load p, offset
2854/// In this case p and p' overlap, which means that two registers are needed.
2855/// Instead, this function changes the load to use p' and updates the offset.
2856void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
2857 unsigned OverlapReg = 0;
2858 unsigned NewBaseReg = 0;
2859 for (SUnit *SU : Instrs) {
2860 MachineInstr *MI = SU->getInstr();
2861 for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
2862 const MachineOperand &MO = MI->getOperand(i);
2863 // Look for an instruction that uses p. The instruction occurs in the
2864 // same cycle but occurs later in the serialized order.
2865 if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
2866 // Check that the instruction appears in the InstrChanges structure,
2867 // which contains instructions that can have the offset updated.
2869 InstrChanges.find(SU);
2870 if (It != InstrChanges.end()) {
2871 unsigned BasePos, OffsetPos;
2872 // Update the base register and adjust the offset.
2873 if (TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos)) {
2875 NewMI->getOperand(BasePos).setReg(NewBaseReg);
2876 int64_t NewOffset =
2877 MI->getOperand(OffsetPos).getImm() - It->second.second;
2878 NewMI->getOperand(OffsetPos).setImm(NewOffset);
2879 SU->setInstr(NewMI);
2880 MISUnitMap[NewMI] = SU;
2881 NewMIs[MI] = NewMI;
2882 }
2883 }
2884 OverlapReg = 0;
2885 NewBaseReg = 0;
2886 break;
2887 }
2888 // Look for an instruction of the form p' = op(p), which uses and defines
2889 // two virtual registers that get allocated to the same physical register.
2890 unsigned TiedUseIdx = 0;
2891 if (MI->isRegTiedToUseOperand(i, &TiedUseIdx)) {
2892 // OverlapReg is p in the example above.
2893 OverlapReg = MI->getOperand(TiedUseIdx).getReg();
2894 // NewBaseReg is p' in the example above.
2895 NewBaseReg = MI->getOperand(i).getReg();
2896 break;
2897 }
2898 }
2899 }
2900}
2901
2902/// After the schedule has been formed, call this function to combine
2903/// the instructions from the different stages/cycles. That is, this
2904/// function creates a schedule that represents a single iteration.
2906 // Move all instructions to the first stage from later stages.
2907 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2908 for (int stage = 1, lastStage = getMaxStageCount(); stage <= lastStage;
2909 ++stage) {
2910 std::deque<SUnit *> &cycleInstrs =
2911 ScheduledInstrs[cycle + (stage * InitiationInterval)];
2912 for (SUnit *SU : llvm::reverse(cycleInstrs))
2913 ScheduledInstrs[cycle].push_front(SU);
2914 }
2915 }
2916
2917 // Erase all the elements in the later stages. Only one iteration should
2918 // remain in the scheduled list, and it contains all the instructions.
2919 for (int cycle = getFinalCycle() + 1; cycle <= LastCycle; ++cycle)
2920 ScheduledInstrs.erase(cycle);
2921
2922 // Change the registers in instruction as specified in the InstrChanges
2923 // map. We need to use the new registers to create the correct order.
2924 for (const SUnit &SU : SSD->SUnits)
2925 SSD->applyInstrChange(SU.getInstr(), *this);
2926
2927 // Reorder the instructions in each cycle to fix and improve the
2928 // generated code.
2929 for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
2930 std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[Cycle];
2931 std::deque<SUnit *> newOrderPhi;
2932 for (SUnit *SU : cycleInstrs) {
2933 if (SU->getInstr()->isPHI())
2934 newOrderPhi.push_back(SU);
2935 }
2936 std::deque<SUnit *> newOrderI;
2937 for (SUnit *SU : cycleInstrs) {
2938 if (!SU->getInstr()->isPHI())
2939 orderDependence(SSD, SU, newOrderI);
2940 }
2941 // Replace the old order with the new order.
2942 cycleInstrs.swap(newOrderPhi);
2943 llvm::append_range(cycleInstrs, newOrderI);
2944 SSD->fixupRegisterOverlaps(cycleInstrs);
2945 }
2946
2947 LLVM_DEBUG(dump(););
2948}
2949
2951 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
2952 << " depth " << MaxDepth << " col " << Colocate << "\n";
2953 for (const auto &I : Nodes)
2954 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
2955 os << "\n";
2956}
2957
2958#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2959/// Print the schedule information to the given output.
2961 // Iterate over each cycle.
2962 for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
2963 // Iterate over each instruction in the cycle.
2964 const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
2965 for (SUnit *CI : cycleInstrs->second) {
2966 os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
2967 os << "(" << CI->NodeNum << ") ";
2968 CI->getInstr()->print(os);
2969 os << "\n";
2970 }
2971 }
2972}
2973
2974/// Utility function used for debugging to print the schedule.
2977
2978void ResourceManager::dumpMRT() const {
2979 LLVM_DEBUG({
2980 if (UseDFA)
2981 return;
2982 std::stringstream SS;
2983 SS << "MRT:\n";
2984 SS << std::setw(4) << "Slot";
2985 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
2986 SS << std::setw(3) << I;
2987 SS << std::setw(7) << "#Mops"
2988 << "\n";
2989 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
2990 SS << std::setw(4) << Slot;
2991 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I)
2992 SS << std::setw(3) << MRT[Slot][I];
2993 SS << std::setw(7) << NumScheduledMops[Slot] << "\n";
2994 }
2995 dbgs() << SS.str();
2996 });
2997}
2998#endif
2999
3001 const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3002 unsigned ProcResourceID = 0;
3003
3004 // We currently limit the resource kinds to 64 and below so that we can use
3005 // uint64_t for Masks
3006 assert(SM.getNumProcResourceKinds() < 64 &&
3007 "Too many kinds of resources, unsupported");
3008 // Create a unique bitmask for every processor resource unit.
3009 // Skip resource at index 0, since it always references 'InvalidUnit'.
3010 Masks.resize(SM.getNumProcResourceKinds());
3011 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3012 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3013 if (Desc.SubUnitsIdxBegin)
3014 continue;
3015 Masks[I] = 1ULL << ProcResourceID;
3016 ProcResourceID++;
3017 }
3018 // Create a unique bitmask for every processor resource group.
3019 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3020 const MCProcResourceDesc &Desc = *SM.getProcResource(I);
3021 if (!Desc.SubUnitsIdxBegin)
3022 continue;
3023 Masks[I] = 1ULL << ProcResourceID;
3024 for (unsigned U = 0; U < Desc.NumUnits; ++U)
3025 Masks[I] |= Masks[Desc.SubUnitsIdxBegin[U]];
3026 ProcResourceID++;
3027 }
3028 LLVM_DEBUG({
3029 if (SwpShowResMask) {
3030 dbgs() << "ProcResourceDesc:\n";
3031 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3032 const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3033 dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3034 ProcResource->Name, I, Masks[I],
3035 ProcResource->NumUnits);
3036 }
3037 dbgs() << " -----------------\n";
3038 }
3039 });
3040}
3041
3043 LLVM_DEBUG({
3044 if (SwpDebugResource)
3045 dbgs() << "canReserveResources:\n";
3046 });
3047 if (UseDFA)
3048 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3049 ->canReserveResources(&SU.getInstr()->getDesc());
3050
3051 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3052 if (!SCDesc->isValid()) {
3053 LLVM_DEBUG({
3054 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3055 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3056 });
3057 return true;
3058 }
3059
3060 reserveResources(SCDesc, Cycle);
3061 bool Result = !isOverbooked();
3062 unreserveResources(SCDesc, Cycle);
3063
3064 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n";);
3065 return Result;
3066}
3067
3068void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3069 LLVM_DEBUG({
3070 if (SwpDebugResource)
3071 dbgs() << "reserveResources:\n";
3072 });
3073 if (UseDFA)
3074 return DFAResources[positiveModulo(Cycle, InitiationInterval)]
3075 ->reserveResources(&SU.getInstr()->getDesc());
3076
3077 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3078 if (!SCDesc->isValid()) {
3079 LLVM_DEBUG({
3080 dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3081 dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3082 });
3083 return;
3084 }
3085
3086 reserveResources(SCDesc, Cycle);
3087
3088 LLVM_DEBUG({
3089 if (SwpDebugResource) {
3090 dumpMRT();
3091 dbgs() << "reserveResources: done!\n\n";
3092 }
3093 });
3094}
3095
3096void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3097 int Cycle) {
3098 assert(!UseDFA);
3099 for (const MCWriteProcResEntry &PRE : make_range(
3100 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3101 for (int C = Cycle; C < Cycle + PRE.Cycles; ++C)
3102 ++MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3103
3104 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3105 ++NumScheduledMops[positiveModulo(C, InitiationInterval)];
3106}
3107
3108void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3109 int Cycle) {
3110 assert(!UseDFA);
3111 for (const MCWriteProcResEntry &PRE : make_range(
3112 STI->getWriteProcResBegin(SCDesc), STI->getWriteProcResEnd(SCDesc)))
3113 for (int C = Cycle; C < Cycle + PRE.Cycles; ++C)
3114 --MRT[positiveModulo(C, InitiationInterval)][PRE.ProcResourceIdx];
3115
3116 for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3117 --NumScheduledMops[positiveModulo(C, InitiationInterval)];
3118}
3119
3120bool ResourceManager::isOverbooked() const {
3121 assert(!UseDFA);
3122 for (int Slot = 0; Slot < InitiationInterval; ++Slot) {
3123 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3124 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3125 if (MRT[Slot][I] > Desc->NumUnits)
3126 return true;
3127 }
3128 if (NumScheduledMops[Slot] > IssueWidth)
3129 return true;
3130 }
3131 return false;
3132}
3133
3134int ResourceManager::calculateResMIIDFA() const {
3135 assert(UseDFA);
3136
3137 // Sort the instructions by the number of available choices for scheduling,
3138 // least to most. Use the number of critical resources as the tie breaker.
3139 FuncUnitSorter FUS = FuncUnitSorter(*ST);
3140 for (SUnit &SU : DAG->SUnits)
3141 FUS.calcCriticalResources(*SU.getInstr());
3143 FuncUnitOrder(FUS);
3144
3145 for (SUnit &SU : DAG->SUnits)
3146 FuncUnitOrder.push(SU.getInstr());
3147
3149 Resources.push_back(
3150 std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
3151
3152 while (!FuncUnitOrder.empty()) {
3153 MachineInstr *MI = FuncUnitOrder.top();
3154 FuncUnitOrder.pop();
3155 if (TII->isZeroCost(MI->getOpcode()))
3156 continue;
3157
3158 // Attempt to reserve the instruction in an existing DFA. At least one
3159 // DFA is needed for each cycle.
3160 unsigned NumCycles = DAG->getSUnit(MI)->Latency;
3161 unsigned ReservedCycles = 0;
3162 auto *RI = Resources.begin();
3163 auto *RE = Resources.end();
3164 LLVM_DEBUG({
3165 dbgs() << "Trying to reserve resource for " << NumCycles
3166 << " cycles for \n";
3167 MI->dump();
3168 });
3169 for (unsigned C = 0; C < NumCycles; ++C)
3170 while (RI != RE) {
3171 if ((*RI)->canReserveResources(*MI)) {
3172 (*RI)->reserveResources(*MI);
3173 ++ReservedCycles;
3174 break;
3175 }
3176 RI++;
3177 }
3178 LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
3179 << ", NumCycles:" << NumCycles << "\n");
3180 // Add new DFAs, if needed, to reserve resources.
3181 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
3183 << "NewResource created to reserve resources"
3184 << "\n");
3185 auto *NewResource = TII->CreateTargetScheduleState(*ST);
3186 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
3187 NewResource->reserveResources(*MI);
3188 Resources.push_back(std::unique_ptr<DFAPacketizer>(NewResource));
3189 }
3190 }
3191
3192 int Resmii = Resources.size();
3193 LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
3194 return Resmii;
3195}
3196
3198 if (UseDFA)
3199 return calculateResMIIDFA();
3200
3201 // Count each resource consumption and divide it by the number of units.
3202 // ResMII is the max value among them.
3203
3204 int NumMops = 0;
3206 for (SUnit &SU : DAG->SUnits) {
3207 if (TII->isZeroCost(SU.getInstr()->getOpcode()))
3208 continue;
3209
3210 const MCSchedClassDesc *SCDesc = DAG->getSchedClass(&SU);
3211 if (!SCDesc->isValid())
3212 continue;
3213
3214 LLVM_DEBUG({
3215 if (SwpDebugResource) {
3216 DAG->dumpNode(SU);
3217 dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
3218 << " WriteProcRes: ";
3219 }
3220 });
3221 NumMops += SCDesc->NumMicroOps;
3222 for (const MCWriteProcResEntry &PRE :
3223 make_range(STI->getWriteProcResBegin(SCDesc),
3224 STI->getWriteProcResEnd(SCDesc))) {
3225 LLVM_DEBUG({
3226 if (SwpDebugResource) {
3227 const MCProcResourceDesc *Desc =
3228 SM.getProcResource(PRE.ProcResourceIdx);
3229 dbgs() << Desc->Name << ": " << PRE.Cycles << ", ";
3230 }
3231 });
3232 ResourceCount[PRE.ProcResourceIdx] += PRE.Cycles;
3233 }
3234 LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
3235 }
3236
3237 int Result = (NumMops + IssueWidth - 1) / IssueWidth;
3238 LLVM_DEBUG({
3239 if (SwpDebugResource)
3240 dbgs() << "#Mops: " << NumMops << ", "
3241 << "IssueWidth: " << IssueWidth << ", "
3242 << "Cycles: " << Result << "\n";
3243 });
3244
3245 LLVM_DEBUG({
3246 if (SwpDebugResource) {
3247 std::stringstream SS;
3248 SS << std::setw(2) << "ID" << std::setw(16) << "Name" << std::setw(10)
3249 << "Units" << std::setw(10) << "Consumed" << std::setw(10) << "Cycles"
3250 << "\n";
3251 dbgs() << SS.str();
3252 }
3253 });
3254 for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3255 const MCProcResourceDesc *Desc = SM.getProcResource(I);
3256 int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits;
3257 LLVM_DEBUG({
3258 if (SwpDebugResource) {
3259 std::stringstream SS;
3260 SS << std::setw(2) << I << std::setw(16) << Desc->Name << std::setw(10)
3261 << Desc->NumUnits << std::setw(10) << ResourceCount[I]
3262 << std::setw(10) << Cycles << "\n";
3263 dbgs() << SS.str();
3264 }
3265 });
3266 if (Cycles > Result)
3267 Result = Cycles;
3268 }
3269 return Result;
3270}
3271
3273 InitiationInterval = II;
3274 DFAResources.clear();
3275 DFAResources.resize(II);
3276 for (auto &I : DFAResources)
3277 I.reset(ST->getInstrInfo()->CreateTargetScheduleState(*ST));
3278 MRT.clear();
3280 NumScheduledMops.clear();
3281 NumScheduledMops.resize(II);
3282}
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
SmallPtrSet< MachineInstr *, 2 > Uses
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:492
This file declares an analysis pass that computes CycleInfo for LLVM IR, specialized from GenericCycl...
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
hexagon gen pred
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static cl::opt< int > SwpForceII("pipeliner-force-ii", cl::desc("Force pipeliner to use specified II."), cl::Hidden, cl::init(-1))
A command line argument to force pipeliner to use specified initial interval.
static cl::opt< bool > ExperimentalCodeGen("pipeliner-experimental-cg", cl::Hidden, cl::init(false), cl::desc("Use the experimental peeling code generator for software pipelining"))
static bool pred_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Preds, const NodeSet *S=nullptr)
Compute the Pred_L(O) set, as defined in the paper.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB)
Return the Phi register value that comes the loop block.
static cl::opt< bool > SwpDebugResource("pipeliner-dbg-res", cl::Hidden, cl::init(false))
static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker, NodeSet &NS)
Compute the live-out registers for the instructions in a node-set.
static cl::opt< bool > EmitTestAnnotations("pipeliner-annotate-for-testing", cl::Hidden, cl::init(false), cl::desc("Instead of emitting the pipelined code, annotate instructions " "with the generated schedule for feeding into the " "-modulo-schedule-test pass"))
static bool isIntersect(SmallSetVector< SUnit *, 8 > &Set1, const NodeSet &Set2, SmallSetVector< SUnit *, 8 > &Result)
Return true if Set1 contains elements in Set2.
static cl::opt< bool > SwpIgnoreRecMII("pipeliner-ignore-recmii", cl::ReallyHidden, cl::desc("Ignore RecMII"))
static cl::opt< int > SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1))
static bool succ_L(SetVector< SUnit * > &NodeOrder, SmallSetVector< SUnit *, 8 > &Succs, const NodeSet *S=nullptr)
Compute the Succ_L(O) set, as defined in the paper.
Modulo Software Pipelining
static cl::opt< bool > SwpPruneLoopCarried("pipeliner-prune-loop-carried", cl::desc("Prune loop carried order dependences."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of loop carried order dependences.
static bool isDependenceBarrier(MachineInstr &MI)
Return true if the instruction causes a chain between memory references before and after it.
static cl::opt< int > SwpMaxMii("pipeliner-max-mii", cl::desc("Size limit for the MII."), cl::Hidden, cl::init(27))
A command line argument to limit minimum initial interval for pipelining.
static void swapAntiDependences(std::vector< SUnit > &SUnits)
Swap all the anti dependences in the DAG.
static bool isSuccOrder(SUnit *SUa, SUnit *SUb)
Return true if SUb can be reached from SUa following the chain edges.
static cl::opt< int > SwpMaxStages("pipeliner-max-stages", cl::desc("Maximum stages allowed in the generated scheduled."), cl::Hidden, cl::init(3))
A command line argument to limit the number of stages in the pipeline.
static cl::opt< bool > EnableSWPOptSize("enable-pipeliner-opt-size", cl::desc("Enable SWP at Os."), cl::Hidden, cl::init(false))
A command line option to enable SWP at -Os.
static cl::opt< bool > SwpShowResMask("pipeliner-show-mask", cl::Hidden, cl::init(false))
static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, unsigned &InitVal, unsigned &LoopVal)
Return the register values for the operands of a Phi instruction.
#define DEBUG_TYPE
static cl::opt< bool > EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true), cl::desc("Enable Software Pipelining"))
A command line option to turn software pipelining on or off.
static bool ignoreDependence(const SDep &D, bool isPred)
Return true for DAG nodes that we ignore when computing the cost functions.
static cl::opt< bool > SwpPruneDeps("pipeliner-prune-deps", cl::desc("Prune dependences between unrelated Phi nodes."), cl::Hidden, cl::init(true))
A command line option to disable the pruning of chain dependences due to an unrelated Phi.
static SUnit * multipleIterations(SUnit *SU, SwingSchedulerDAG *DAG)
If an instruction has a use that spans multiple iterations, then return true.
static bool computePath(SUnit *Cur, SetVector< SUnit * > &Path, SetVector< SUnit * > &DestNodes, SetVector< SUnit * > &Exclude, SmallPtrSet< SUnit *, 8 > &Visited)
Return true if there is a path from the specified node to any of the nodes in DestNodes.
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
This file provides utility analysis objects describing memory locations.
return ToRemove size() > 0
#define P(N)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PriorityQueue class.
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static unsigned getSize(unsigned Kind)
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
bool isNoAlias(const MemoryLocation &LocA, const MemoryLocation &LocB)
A trivial helper function to check to see if the specified pointers are no-alias.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:197
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
bool erase(const KeyT &Val)
Definition: DenseMap.h:302
bool empty() const
Definition: DenseMap.h:98
iterator end()
Definition: DenseMap.h:84
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:315
A possibly irreducible generalization of a Loop.
Itinerary data supplied by a subtarget to be used by a target.
const InstrStage * beginStage(unsigned ItinClassIndx) const
Return the first stage of the itinerary.
const InstrStage * endStage(unsigned ItinClassIndx) const
Return the last+1 stage of the itinerary.
bool isEmpty() const
Returns true if there are no itineraries.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:275
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
Definition: LoopInfo.h:202
BlockT * getHeader() const
Definition: LoopInfo.h:105
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:183
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:631
unsigned getSchedClass() const
Return the scheduling class for this instruction.
Definition: MCInstrDesc.h:596
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition: MCInstrInfo.h:63
bool isValid() const
isValid - returns true if this iterator is not yet at the end.
Generic base class for all target subtargets.
const MCWriteProcResEntry * getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCWriteProcResEntry * getWriteProcResBegin(const MCSchedClassDesc *SC) const
Return an iterator at the first process resource consumed by the given scheduling class.
const MCSchedModel & getSchedModel() const
Get the machine model for this subtarget's CPU.
Metadata node.
Definition: Metadata.h:943
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1291
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1297
A single uniqued string.
Definition: Metadata.h:611
StringRef getString() const
Definition: Metadata.cpp:507
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE and DBG_LABEL instructions.
instr_iterator instr_end()
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:68
bool mayRaiseFPException() const
Return true if this instruction could possibly raise a floating-point exception.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:516
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:313
unsigned getNumOperands() const
Retuns the total number of operands.
Definition: MachineInstr.h:519
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:513
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
bool isRegSequence() const
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:641
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:731
bool isIdenticalTo(const MachineInstr &Other, MICheckType Check=CheckDefs) const
Return true if this instruction is identical to Other.
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
bool isPseudo(QueryType Type=IgnoreBundle) const
Return true if this is a pseudo instruction that doesn't correspond to a real machine instruction.
Definition: MachineInstr.h:852
bool isPHI() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:526
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
MachineBasicBlock * getTopBlock()
Return the "top" block in the loop, which is the first block in the linear layout,...
A description of a memory reference used in the backend.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
const Value * getValue() const
Return the base address of the memory access.
int64_t getOffset() const
For normal values, this is a byte offset added to the base address.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
Register getReg() const
getReg - Returns the register number.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
The main class in the implementation of the target independent software pipeliner pass.
const TargetInstrInfo * TII
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFunction * MF
MachineOptimizationRemarkEmitter * ORE
RegisterClassInfo RegClassInfo
defusechain_iterator - This class provides iterator support for machine operands in the function that...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
use_instr_iterator use_instr_begin(Register RegNo) const
static use_instr_iterator use_instr_end()
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:37
iterator end()
Definition: MapVector.h:72
iterator find(const KeyT &Key)
Definition: MapVector.h:148
void clear()
Definition: MapVector.h:89
static MemoryLocation getAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location after Ptr, while remaining within the underlying objec...
The ModuloScheduleExpander takes a ModuloSchedule and expands it in-place, rewriting the old loop and...
void cleanup()
Performs final cleanup after expansion.
void expand()
Performs the actual expansion.
Expander that simply annotates each scheduled instruction with a post-instr symbol that can be consum...
void annotate()
Performs the annotation.
Represents a schedule for a single-block loop.
A NodeSet contains a set of SUnit DAG nodes with additional information that assigns a priority to th...
SUnit * getNode(unsigned i) const
void print(raw_ostream &os) const
void setRecMII(unsigned mii)
unsigned count(SUnit *SU) const
void setColocate(unsigned c)
int compareRecMII(NodeSet &RHS)
unsigned size() const
bool insert(SUnit *SU)
LLVM_DUMP_METHOD void dump() const
bool empty() const
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
void dump() const
Definition: Pass.cpp:136
A reimplementation of ModuloScheduleExpander.
unsigned getPSet() const
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
Track the current register pressure at some position in the instruction stream, and remember the high...
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
void initProcResourceVectors(const MCSchedModel &SM, SmallVectorImpl< uint64_t > &Masks)
void init(int II)
Initialize resources with the initiation interval II.
bool canReserveResources(SUnit &SU, int Cycle)
Check if the resources occupied by a machine instruction are available in the current state.
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
Kind getKind() const
Returns an enum value representing the kind of the dependence.
Definition: ScheduleDAG.h:486
Kind
These are the different kinds of scheduling dependencies.
Definition: ScheduleDAG.h:52
@ Output
A register output-dependence (aka WAW).
Definition: ScheduleDAG.h:55
@ Order
Any other ordering dependency.
Definition: ScheduleDAG.h:56
@ Anti
A register anti-dependence (aka WAR).
Definition: ScheduleDAG.h:54
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition: ScheduleDAG.h:147
@ Barrier
An unknown scheduling barrier.
Definition: ScheduleDAG.h:69
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition: ScheduleDAG.h:142
bool isArtificial() const
Tests if this is an Order dependence that is marked as "artificial", meaning it isn't necessary for c...
Definition: ScheduleDAG.h:200
This class represents the scheduled code.
int earliestCycleInChain(const SDep &Dep)
Return the cycle of the earliest scheduled instruction in the dependence chain.
void setInitiationInterval(int ii)
Set the initiation interval for this schedule.
SmallSet< SUnit *, 8 > computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
Determine transitive dependences of unpipelineable instructions.
bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def, MachineOperand &MO)
Return true if the instruction is a definition that is loop carried and defines the use on the next i...
void dump() const
Utility function used for debugging to print the schedule.
bool insert(SUnit *SU, int StartCycle, int EndCycle, int II)
Try to schedule the node at the specified StartCycle and continue until the node is schedule or the E...
unsigned getMaxStageCount()
Return the maximum stage count needed for this schedule.
void print(raw_ostream &os) const
Print the schedule information to the given output.
int latestCycleInChain(const SDep &Dep)
Return the cycle of the latest scheduled instruction in the dependence chain.
int stageScheduled(SUnit *SU) const
Return the stage for a scheduled instruction.
void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart, int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG)
Compute the scheduling start slot for the instruction.
bool isValidSchedule(SwingSchedulerDAG *SSD)
int getInitiationInterval() const
Return the initiation interval for this schedule.
std::deque< SUnit * > & getInstructions(int cycle)
Return the instructions that are scheduled at the specified cycle.
int getFirstCycle() const
Return the first cycle in the completed schedule.
unsigned cycleScheduled(SUnit *SU) const
Return the cycle for a scheduled instruction.
bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi)
Return true if the scheduled Phi has a loop carried operand.
bool normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI)
int getFinalCycle() const
Return the last cycle in the finalized schedule.
void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU, std::deque< SUnit * > &Insts)
Order the instructions within a cycle so that the definitions occur before the uses.
void finalizeSchedule(SwingSchedulerDAG *SSD)
After the schedule has been formed, call this function to combine the instructions from the different...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NumPreds
Definition: ScheduleDAG.h:266
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
Definition: ScheduleDAG.h:362
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
void setInstr(MachineInstr *MI)
Assigns the instruction for the SUnit.
Definition: ScheduleDAG.h:366
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
bool isPred(const SUnit *N) const
Tests if node N is a predecessor of this node.
Definition: ScheduleDAG.h:431
unsigned short Latency
Node latency.
Definition: ScheduleDAG.h:273
bool isBoundaryNode() const
Boundary nodes are placeholders for the boundary of the scheduling region.
Definition: ScheduleDAG.h:344
bool hasPhysRegDefs
Has physreg defs that are being used.
Definition: ScheduleDAG.h:280
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
bool addPred(const SDep &D, bool Required=true)
Adds the specified edge as a pred of the current node if not already.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
A ScheduleDAG for scheduling lists of MachineInstr.
DenseMap< MachineInstr *, SUnit * > MISUnitMap
After calling BuildSchedGraph, each machine instruction in the current scheduling region is mapped to...
virtual void finishBlock()
Cleans up after scheduling in the given block.
MachineBasicBlock * BB
The block in which to insert instructions.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void dumpNode(const SUnit &SU) const override
UndefValue * UnknownValue
For an unanalyzable memory access, this Value is used in maps.
void buildSchedGraph(AAResults *AA, RegPressureTracker *RPTracker=nullptr, PressureDiffs *PDiffs=nullptr, LiveIntervals *LIS=nullptr, bool TrackLaneMasks=false)
Builds SUnits for the current region.
SUnit * getSUnit(MachineInstr *MI) const
Returns an existing SUnit for this MI, or nullptr.
void dump() const override
void RemovePred(SUnit *M, SUnit *N)
Updates the topological ordering to accommodate an an edge to be removed from the specified node N fr...
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
void AddPred(SUnit *Y, SUnit *X)
Updates the topological ordering to accommodate an edge to be added from SUnit X to SUnit Y.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU)
Checks if SU is reachable from TargetSU.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:560
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:561
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
A vector that has set insertion semantics.
Definition: SetVector.h:40
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:208
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:202
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
void clear()
Completely clear the SetVector.
Definition: SetVector.h:213
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:72
typename vector_type::const_iterator iterator
Definition: SetVector.h:48
SlotIndexes pass.
Definition: SlotIndexes.h:319
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:540
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:379
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
iterator end() const
Definition: SmallPtrSet.h:408
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
iterator begin() const
Definition: SmallPtrSet.h:403
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:301
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:234
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:177
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
This class builds the dependence graph for the instructions in a loop, and attempts to schedule the i...
unsigned getDepth(SUnit *Node)
The depth, in the dependence graph, for a node.
unsigned getInstrBaseReg(SUnit *SU)
Return the new base register that was stored away for the changed instruction.
int getASAP(SUnit *Node)
Return the earliest time an instruction may be scheduled.
void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule)
Apply changes to the instruction if needed.
void finishBlock() override
Clean up after the software pipeliner runs.
void fixupRegisterOverlaps(std::deque< SUnit * > &Instrs)
Attempt to fix the degenerate cases when the instruction serialization causes the register lifetimes ...
int getZeroLatencyDepth(SUnit *Node)
The maximum unweighted length of a path from an arbitrary node to the given node in which each edge h...
bool isLoopCarriedDep(SUnit *Source, const SDep &Dep, bool isSucc=true)
Return true for an order or output dependence that is loop carried potentially.
unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep)
The distance function, which indicates that operation V of iteration I depends on operations U of ite...
void schedule() override
We override the schedule function in ScheduleDAGInstrs to implement the scheduling part of the Swing ...
int getMOV(SUnit *Node)
The mobility function, which the number of slots in which an instruction may be scheduled.
int getZeroLatencyHeight(SUnit *Node)
The maximum unweighted length of a path from the given node to an arbitrary node in which each edge h...
bool isBackedge(SUnit *Source, const SDep &Dep)
Return true if the dependence is a back-edge in the data dependence graph.
unsigned getHeight(SUnit *Node)
The height, in the dependence graph, for a node.
int getALAP(SUnit *Node)
Return the latest time an instruction my be scheduled.
Object returned by analyzeLoopForPipelining.
virtual bool shouldIgnoreForPipelining(const MachineInstr *MI) const =0
Return true if the given instruction should not be pipelined and should be ignored.
virtual bool shouldUseSchedule(SwingSchedulerDAG &SSD, SMSchedule &SMS)
Return true if the proposed schedule should used.
virtual std::unique_ptr< PipelinerLoopInfo > analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const
Analyze loop L, which must be a single-basic-block loop, and if the conditions can be understood enou...
bool isZeroCost(unsigned Opcode) const
Return true for pseudo instructions that don't consume any machine resources in their current form.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const
Analyze the branching code at the end of MBB, returning true if it cannot be understood (e....
virtual DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const
Create machine specific model for scheduling.
virtual bool isPostIncrement(const MachineInstr &MI) const
Return true for post-incremented instructions.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI, unsigned &BasePos, unsigned &OffsetPos) const
Return true if the instruction contains a base register and offset.
virtual bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const
Sometimes, it is possible for the target to tell, even without aliasing information,...
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const
If the instruction is an increment of a constant value, return the amount.
bool getMemOperandWithOffset(const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const
Get the base operand and byte offset of an instruction that reads/writes memory.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const char * getRegPressureSetName(unsigned Idx) const =0
Get the name of this register unit pressure set.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
static Type * getVoidTy(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1740
LLVM Value Representation.
Definition: Value.h:74
unsigned getPosition() const
Definition: CommandLine.h:304
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
constexpr double e
Definition: MathExtras.h:31
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:406
void stable_sort(R &&Range)
Definition: STLExtras.h:1948
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:349
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
Definition: SetOperations.h:72
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:2014
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:484
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1683
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
unsigned getRegState(const MachineOperand &RegOp)
Get all register state flags from machine operand RegOp.
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1923
void erase_value(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2006
char & MachinePipelinerID
This pass performs software pipelining on machine instructions.
CycleInfo::CycleT Cycle
Definition: CycleAnalysis.h:28
cl::opt< bool > SwpEnableCopyToPhi
bool isIdentifiedObject(const Value *V)
Return true if this pointer refers to a distinct and identifiable object.
cl::opt< int > SwpForceIssueWidth
A command line argument to force pipeliner to use specified issue width.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
#define N
These values represent a non-pipelined step in the execution of an instruction.
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx.
static constexpr LaneBitmask getNone()
Definition: LaneBitmask.h:81
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
const unsigned * SubUnitsIdxBegin
Definition: MCSchedule.h:53
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:109
bool isValid() const
Definition: MCSchedule.h:127
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:244
const MCSchedClassDesc * getSchedClassDesc(unsigned SchedClassIdx) const
Definition: MCSchedule.h:346
unsigned getNumProcResourceKinds() const
Definition: MCSchedule.h:335
bool hasInstrSchedModel() const
Does this machine model include instruction-level scheduling.
Definition: MCSchedule.h:320
const MCProcResourceDesc * getProcResource(unsigned ProcResourceIdx) const
Definition: MCSchedule.h:339
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63
SmallVector< MachineOperand, 4 > BrCond
std::unique_ptr< TargetInstrInfo::PipelinerLoopInfo > LoopPipelinerInfo
Store the effects of a change in pressure on things that MI scheduler cares about.
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.