LLVM 19.0.0git
MachineCombiner.cpp
Go to the documentation of this file.
1//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The machine combiner pass uses machine trace metrics to ensure the combined
10// instructions do not lengthen the critical path or the resource depth.
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/DenseMap.h"
14#include "llvm/ADT/Statistic.h"
32#include "llvm/Support/Debug.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "machine-combiner"
38
39STATISTIC(NumInstCombined, "Number of machineinst combined");
40
42inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(500));
45
46static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
47 cl::desc("Dump all substituted intrs"),
48 cl::init(false));
49
50#ifdef EXPENSIVE_CHECKS
52 "machine-combiner-verify-pattern-order", cl::Hidden,
54 "Verify that the generated patterns are ordered by increasing latency"),
55 cl::init(true));
56#else
58 "machine-combiner-verify-pattern-order", cl::Hidden,
60 "Verify that the generated patterns are ordered by increasing latency"),
61 cl::init(false));
62#endif
63
64namespace {
65class MachineCombiner : public MachineFunctionPass {
66 const TargetSubtargetInfo *STI = nullptr;
67 const TargetInstrInfo *TII = nullptr;
68 const TargetRegisterInfo *TRI = nullptr;
69 MCSchedModel SchedModel;
70 MachineRegisterInfo *MRI = nullptr;
71 MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
72 MachineTraceMetrics *Traces = nullptr;
73 MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
74 MachineBlockFrequencyInfo *MBFI = nullptr;
75 ProfileSummaryInfo *PSI = nullptr;
76 RegisterClassInfo RegClassInfo;
77
78 TargetSchedModel TSchedModel;
79
80 /// True if optimizing for code size.
81 bool OptSize = false;
82
83public:
84 static char ID;
85 MachineCombiner() : MachineFunctionPass(ID) {
87 }
88 void getAnalysisUsage(AnalysisUsage &AU) const override;
89 bool runOnMachineFunction(MachineFunction &MF) override;
90 StringRef getPassName() const override { return "Machine InstCombiner"; }
91
92private:
93 bool combineInstructions(MachineBasicBlock *);
94 MachineInstr *getOperandDef(const MachineOperand &MO);
95 bool isTransientMI(const MachineInstr *MI);
96 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
97 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
99 const MachineBasicBlock &MBB);
100 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
101 MachineTraceMetrics::Trace BlockTrace);
102 bool
103 improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
107 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
108 MachineCombinerPattern Pattern, bool SlackIsAccurate);
109 bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
113 bool preservesResourceLen(MachineBasicBlock *MBB,
117 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
119 std::pair<unsigned, unsigned>
120 getLatenciesForInstrSequences(MachineInstr &MI,
123 MachineTraceMetrics::Trace BlockTrace);
124
125 void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
127};
128}
129
130char MachineCombiner::ID = 0;
131char &llvm::MachineCombinerID = MachineCombiner::ID;
132
134 "Machine InstCombiner", false, false)
139
140void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
141 AU.setPreservesCFG();
142 AU.addPreserved<MachineDominatorTree>();
143 AU.addRequired<MachineLoopInfo>();
144 AU.addPreserved<MachineLoopInfo>();
145 AU.addRequired<MachineTraceMetrics>();
146 AU.addPreserved<MachineTraceMetrics>();
147 AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
148 AU.addRequired<ProfileSummaryInfoWrapperPass>();
150}
151
153MachineCombiner::getOperandDef(const MachineOperand &MO) {
154 MachineInstr *DefInstr = nullptr;
155 // We need a virtual register definition.
156 if (MO.isReg() && MO.getReg().isVirtual())
157 DefInstr = MRI->getUniqueVRegDef(MO.getReg());
158 return DefInstr;
159}
160
161/// Return true if MI is unlikely to generate an actual target instruction.
162bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
163 if (!MI->isCopy())
164 return MI->isTransient();
165
166 // If MI is a COPY, check if its src and dst registers can be coalesced.
167 Register Dst = MI->getOperand(0).getReg();
168 Register Src = MI->getOperand(1).getReg();
169
170 if (!MI->isFullCopy()) {
171 // If src RC contains super registers of dst RC, it can also be coalesced.
172 if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
173 return false;
174
175 auto SrcSub = MI->getOperand(1).getSubReg();
176 auto SrcRC = MRI->getRegClass(Src);
177 auto DstRC = MRI->getRegClass(Dst);
178 return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
179 }
180
181 if (Src.isPhysical() && Dst.isPhysical())
182 return Src == Dst;
183
184 if (Src.isVirtual() && Dst.isVirtual()) {
185 auto SrcRC = MRI->getRegClass(Src);
186 auto DstRC = MRI->getRegClass(Dst);
187 return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
188 }
189
190 if (Src.isVirtual())
191 std::swap(Src, Dst);
192
193 // Now Src is physical register, Dst is virtual register.
194 auto DstRC = MRI->getRegClass(Dst);
195 return DstRC->contains(Src);
196}
197
198/// Computes depth of instructions in vector \InsInstr.
199///
200/// \param InsInstrs is a vector of machine instructions
201/// \param InstrIdxForVirtReg is a dense map of virtual register to index
202/// of defining machine instruction in \p InsInstrs
203/// \param BlockTrace is a trace of machine instructions
204///
205/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
206unsigned
207MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
208 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
210 const MachineBasicBlock &MBB) {
211 SmallVector<unsigned, 16> InstrDepth;
212 // For each instruction in the new sequence compute the depth based on the
213 // operands. Use the trace information when possible. For new operands which
214 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
215 for (auto *InstrPtr : InsInstrs) { // for each Use
216 unsigned IDepth = 0;
217 for (const MachineOperand &MO : InstrPtr->all_uses()) {
218 // Check for virtual register operand.
219 if (!MO.getReg().isVirtual())
220 continue;
221 unsigned DepthOp = 0;
222 unsigned LatencyOp = 0;
224 InstrIdxForVirtReg.find(MO.getReg());
225 if (II != InstrIdxForVirtReg.end()) {
226 // Operand is new virtual register not in trace
227 assert(II->second < InstrDepth.size() && "Bad Index");
228 MachineInstr *DefInstr = InsInstrs[II->second];
229 assert(DefInstr &&
230 "There must be a definition for a new virtual register");
231 DepthOp = InstrDepth[II->second];
232 int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
233 int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
234 LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
235 InstrPtr, UseIdx);
236 } else {
237 MachineInstr *DefInstr = getOperandDef(MO);
238 if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
239 MachineTraceStrategy::TS_Local ||
240 DefInstr->getParent() == &MBB)) {
241 DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
242 if (!isTransientMI(DefInstr))
243 LatencyOp = TSchedModel.computeOperandLatency(
244 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
245 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
246 }
247 }
248 IDepth = std::max(IDepth, DepthOp + LatencyOp);
249 }
250 InstrDepth.push_back(IDepth);
251 }
252 unsigned NewRootIdx = InsInstrs.size() - 1;
253 return InstrDepth[NewRootIdx];
254}
255
256/// Computes instruction latency as max of latency of defined operands.
257///
258/// \param Root is a machine instruction that could be replaced by NewRoot.
259/// It is used to compute a more accurate latency information for NewRoot in
260/// case there is a dependent instruction in the same trace (\p BlockTrace)
261/// \param NewRoot is the instruction for which the latency is computed
262/// \param BlockTrace is a trace of machine instructions
263///
264/// \returns Latency of \p NewRoot
265unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
266 MachineTraceMetrics::Trace BlockTrace) {
267 // Check each definition in NewRoot and compute the latency
268 unsigned NewRootLatency = 0;
269
270 for (const MachineOperand &MO : NewRoot->all_defs()) {
271 // Check for virtual register operand.
272 if (!MO.getReg().isVirtual())
273 continue;
274 // Get the first instruction that uses MO
275 MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
276 RI++;
277 if (RI == MRI->reg_end())
278 continue;
279 MachineInstr *UseMO = RI->getParent();
280 unsigned LatencyOp = 0;
281 if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
282 LatencyOp = TSchedModel.computeOperandLatency(
283 NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
284 UseMO->findRegisterUseOperandIdx(MO.getReg()));
285 } else {
286 LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
287 }
288 NewRootLatency = std::max(NewRootLatency, LatencyOp);
289 }
290 return NewRootLatency;
291}
292
/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth,            // The data dependency chain must be improved.
  MustReduceRegisterPressure, // The register pressure must be reduced.
  Default                     // The critical path must not be lengthened.
};
300
302 // TODO: If C++ ever gets a real enum class, make this part of the
303 // MachineCombinerPattern class.
304 switch (P) {
305 case MachineCombinerPattern::REASSOC_AX_BY:
306 case MachineCombinerPattern::REASSOC_AX_YB:
307 case MachineCombinerPattern::REASSOC_XA_BY:
308 case MachineCombinerPattern::REASSOC_XA_YB:
309 case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
310 case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
311 case MachineCombinerPattern::SUBADD_OP1:
312 case MachineCombinerPattern::SUBADD_OP2:
313 case MachineCombinerPattern::FMADD_AX:
314 case MachineCombinerPattern::FMADD_XA:
315 case MachineCombinerPattern::FMSUB:
316 case MachineCombinerPattern::FNMSUB:
318 case MachineCombinerPattern::REASSOC_XY_BCA:
319 case MachineCombinerPattern::REASSOC_XY_BAC:
321 default:
323 }
324}
325
326/// Estimate the latency of the new and original instruction sequence by summing
327/// up the latencies of the inserted and deleted instructions. This assumes
328/// that the inserted and deleted instructions are dependent instruction chains,
329/// which might not hold in all cases.
330std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
333 MachineTraceMetrics::Trace BlockTrace) {
334 assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
335 unsigned NewRootLatency = 0;
336 // NewRoot is the last instruction in the \p InsInstrs vector.
337 MachineInstr *NewRoot = InsInstrs.back();
338 for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
339 NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
340 NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
341
342 unsigned RootLatency = 0;
343 for (auto *I : DelInstrs)
344 RootLatency += TSchedModel.computeInstrLatency(I);
345
346 return {NewRootLatency, RootLatency};
347}
348
349bool MachineCombiner::reduceRegisterPressure(
354 // FIXME: for now, we don't do any check for the register pressure patterns.
355 // We treat them as always profitable. But we can do better if we make
356 // RegPressureTracker class be aware of TIE attribute. Then we can get an
357 // accurate compare of register pressure with DelInstrs or InsInstrs.
358 return true;
359}
360
361/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
362/// The new code sequence ends in MI NewRoot. A necessary condition for the new
363/// sequence to replace the old sequence is that it cannot lengthen the critical
364/// path. The definition of "improve" may be restricted by specifying that the
365/// new path improves the data dependency chain (MustReduceDepth).
366bool MachineCombiner::improvesCriticalPathLen(
371 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
373 bool SlackIsAccurate) {
374 // Get depth and latency of NewRoot and Root.
375 unsigned NewRootDepth =
376 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
377 unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
378
379 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
380 << NewRootDepth << "\tRootDepth: " << RootDepth);
381
382 // For a transform such as reassociation, the cost equation is
383 // conservatively calculated so that we must improve the depth (data
384 // dependency cycles) in the critical path to proceed with the transform.
385 // Being conservative also protects against inaccuracies in the underlying
386 // machine trace metrics and CPU models.
388 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
389 LLVM_DEBUG(NewRootDepth < RootDepth
390 ? dbgs() << "\t and it does it\n"
391 : dbgs() << "\t but it does NOT do it\n");
392 return NewRootDepth < RootDepth;
393 }
394
395 // A more flexible cost calculation for the critical path includes the slack
396 // of the original code sequence. This may allow the transform to proceed
397 // even if the instruction depths (data dependency cycles) become worse.
398
399 // Account for the latency of the inserted and deleted instructions by
400 unsigned NewRootLatency, RootLatency;
401 if (TII->accumulateInstrSeqToRootLatency(*Root)) {
402 std::tie(NewRootLatency, RootLatency) =
403 getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
404 } else {
405 NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
406 RootLatency = TSchedModel.computeInstrLatency(Root);
407 }
408
409 unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
410 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
411 unsigned OldCycleCount =
412 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
413 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
414 << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
415 << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
416 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
417 << "\n\tRootDepth + RootLatency + RootSlack = "
418 << OldCycleCount;);
419 LLVM_DEBUG(NewCycleCount <= OldCycleCount
420 ? dbgs() << "\n\t It IMPROVES PathLen because"
421 : dbgs() << "\n\t It DOES NOT improve PathLen because");
422 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
423 << ", OldCycleCount = " << OldCycleCount << "\n");
424
425 return NewCycleCount <= OldCycleCount;
426}
427
428/// helper routine to convert instructions into SC
429void MachineCombiner::instr2instrSC(
432 for (auto *InstrPtr : Instrs) {
433 unsigned Opc = InstrPtr->getOpcode();
434 unsigned Idx = TII->get(Opc).getSchedClass();
435 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
436 InstrsSC.push_back(SC);
437 }
438}
439
440/// True when the new instructions do not increase resource length
441bool MachineCombiner::preservesResourceLen(
445 if (!TSchedModel.hasInstrSchedModel())
446 return true;
447
448 // Compute current resource length
449
450 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
451 SmallVector <const MachineBasicBlock *, 1> MBBarr;
452 MBBarr.push_back(MBB);
453 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
454
455 // Deal with SC rather than Instructions.
458
459 instr2instrSC(InsInstrs, InsInstrsSC);
460 instr2instrSC(DelInstrs, DelInstrsSC);
461
462 ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
463 ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
464
465 // Compute new resource length.
466 unsigned ResLenAfterCombine =
467 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
468
469 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
470 << ResLenBeforeCombine
471 << " and after: " << ResLenAfterCombine << "\n";);
473 ResLenAfterCombine <=
474 ResLenBeforeCombine + TII->getExtendResourceLenLimit()
475 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
476 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
477 "Length\n");
478
479 return ResLenAfterCombine <=
480 ResLenBeforeCombine + TII->getExtendResourceLenLimit();
481}
482
483/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
484/// depths if requested.
485///
486/// \param MBB basic block to insert instructions in
487/// \param MI current machine instruction
488/// \param InsInstrs new instructions to insert in \p MBB
489/// \param DelInstrs instruction to delete from \p MBB
490/// \param TraceEnsemble is a pointer to the machine trace information
491/// \param RegUnits set of live registers, needed to compute instruction depths
492/// \param TII is target instruction info, used to call target hook
493/// \param Pattern is used to call target hook finalizeInsInstrs
494/// \param IncrementalUpdate if true, compute instruction depths incrementally,
495/// otherwise invalidate the trace
500 MachineTraceMetrics::Ensemble *TraceEnsemble,
502 MachineCombinerPattern Pattern, bool IncrementalUpdate) {
503 // If we want to fix up some placeholder for some target, do it now.
504 // We need this because in genAlternativeCodeSequence, we have not decided the
505 // better pattern InsInstrs or DelInstrs, so we don't want generate some
506 // sideeffect to the function. For example we need to delay the constant pool
507 // entry creation here after InsInstrs is selected as better pattern.
508 // Otherwise the constant pool entry created for InsInstrs will not be deleted
509 // even if InsInstrs is not the better pattern.
510 TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
511
512 for (auto *InstrPtr : InsInstrs)
514
515 for (auto *InstrPtr : DelInstrs) {
516 InstrPtr->eraseFromParent();
517 // Erase all LiveRegs defined by the removed instruction
518 for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
519 if (I->MI == InstrPtr)
520 I = RegUnits.erase(I);
521 else
522 I++;
523 }
524 }
525
526 if (IncrementalUpdate)
527 for (auto *InstrPtr : InsInstrs)
528 TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
529 else
530 TraceEnsemble->invalidate(MBB);
531
532 NumInstCombined++;
533}
534
535// Check that the difference between original and new latency is decreasing for
536// later patterns. This helps to discover sub-optimal pattern orderings.
537void MachineCombiner::verifyPatternOrder(
540 long PrevLatencyDiff = std::numeric_limits<long>::max();
541 (void)PrevLatencyDiff; // Variable is used in assert only.
542 for (auto P : Patterns) {
545 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
546 TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
547 InstrIdxForVirtReg);
548 // Found pattern, but did not generate alternative sequence.
549 // This can happen e.g. when an immediate could not be materialized
550 // in a single instruction.
551 if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
552 continue;
553
554 unsigned NewRootLatency, RootLatency;
555 std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
556 Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
557 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
558 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
559 "Current pattern is better than previous pattern.");
560 PrevLatencyDiff = CurrentLatencyDiff;
561 }
562}
563
564/// Substitute a slow code sequence with a faster one by
565/// evaluating instruction combining pattern.
566/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
567/// combining based on machine trace metrics. Only combine a sequence of
568/// instructions when this neither lengthens the critical path nor increases
569/// resource pressure. When optimizing for codesize always combine when the new
570/// sequence is shorter.
571bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
572 bool Changed = false;
573 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
574
575 bool IncrementalUpdate = false;
576 auto BlockIter = MBB->begin();
577 decltype(BlockIter) LastUpdate;
578 // Check if the block is in a loop.
579 const MachineLoop *ML = MLI->getLoopFor(MBB);
580 if (!TraceEnsemble)
581 TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
582
583 SparseSet<LiveRegUnit> RegUnits;
584 RegUnits.setUniverse(TRI->getNumRegUnits());
585
586 bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
587
588 bool DoRegPressureReduce =
589 TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
590
591 while (BlockIter != MBB->end()) {
592 auto &MI = *BlockIter++;
594 // The motivating example is:
595 //
596 // MUL Other MUL_op1 MUL_op2 Other
597 // \ / \ | /
598 // ADD/SUB => MADD/MSUB
599 // (=Root) (=NewRoot)
600
601 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
602 // usually beneficial for code size it unfortunately can hurt performance
603 // when the ADD is on the critical path, but the MUL is not. With the
604 // substitution the MUL becomes part of the critical path (in form of the
605 // MADD) and can lengthen it on architectures where the MADD latency is
606 // longer than the ADD latency.
607 //
608 // For each instruction we check if it can be the root of a combiner
609 // pattern. Then for each pattern the new code sequence in form of MI is
610 // generated and evaluated. When the efficiency criteria (don't lengthen
611 // critical path, don't use more resources) is met the new sequence gets
612 // hooked up into the basic block before the old sequence is removed.
613 //
614 // The algorithm does not try to evaluate all patterns and pick the best.
615 // This is only an artificial restriction though. In practice there is
616 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
617 // based on an internal cost heuristic. If
618 // machine-combiner-verify-pattern-order is enabled, all patterns are
619 // checked to ensure later patterns do not provide better latency savings.
620
621 if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
622 continue;
623
625 verifyPatternOrder(MBB, MI, Patterns);
626
627 for (const auto P : Patterns) {
630 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
631 TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
632 InstrIdxForVirtReg);
633 // Found pattern, but did not generate alternative sequence.
634 // This can happen e.g. when an immediate could not be materialized
635 // in a single instruction.
636 if (InsInstrs.empty())
637 continue;
638
640 dbgs() << "\tFor the Pattern (" << (int)P
641 << ") these instructions could be removed\n";
642 for (auto const *InstrPtr : DelInstrs)
643 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
644 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
645 dbgs() << "\tThese instructions could replace the removed ones\n";
646 for (auto const *InstrPtr : InsInstrs)
647 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
648 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
649 });
650
651 if (IncrementalUpdate && LastUpdate != BlockIter) {
652 // Update depths since the last incremental update.
653 TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
654 LastUpdate = BlockIter;
655 }
656
657 if (DoRegPressureReduce &&
660 if (MBB->size() > inc_threshold) {
661 // Use incremental depth updates for basic blocks above threshold
662 IncrementalUpdate = true;
663 LastUpdate = BlockIter;
664 }
665 if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
666 // Replace DelInstrs with InsInstrs.
667 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
668 RegUnits, TII, P, IncrementalUpdate);
669 Changed |= true;
670
671 // Go back to previous instruction as it may have ILP reassociation
672 // opportunity.
673 BlockIter--;
674 break;
675 }
676 }
677
678 if (ML && TII->isThroughputPattern(P)) {
679 LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
680 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
681 RegUnits, TII, P, IncrementalUpdate);
682 // Eagerly stop after the first pattern fires.
683 Changed = true;
684 break;
685 } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
686 LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
687 << InsInstrs.size() << " < "
688 << DelInstrs.size() << ")\n");
689 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
690 RegUnits, TII, P, IncrementalUpdate);
691 // Eagerly stop after the first pattern fires.
692 Changed = true;
693 break;
694 } else {
695 // For big basic blocks, we only compute the full trace the first time
696 // we hit this. We do not invalidate the trace, but instead update the
697 // instruction depths incrementally.
698 // NOTE: Only the instruction depths up to MI are accurate. All other
699 // trace information is not updated.
700 MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
701 Traces->verifyAnalysis();
702 if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
703 InstrIdxForVirtReg, P,
704 !IncrementalUpdate) &&
705 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
706 if (MBB->size() > inc_threshold) {
707 // Use incremental depth updates for basic blocks above treshold
708 IncrementalUpdate = true;
709 LastUpdate = BlockIter;
710 }
711
712 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
713 RegUnits, TII, P, IncrementalUpdate);
714
715 // Eagerly stop after the first pattern fires.
716 Changed = true;
717 break;
718 }
719 // Cleanup instructions of the alternative code sequence. There is no
720 // use for them.
722 for (auto *InstrPtr : InsInstrs)
723 MF->deleteMachineInstr(InstrPtr);
724 }
725 InstrIdxForVirtReg.clear();
726 }
727 }
728
729 if (Changed && IncrementalUpdate)
730 Traces->invalidate(MBB);
731 return Changed;
732}
733
734bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
735 STI = &MF.getSubtarget();
736 TII = STI->getInstrInfo();
737 TRI = STI->getRegisterInfo();
738 SchedModel = STI->getSchedModel();
739 TSchedModel.init(STI);
740 MRI = &MF.getRegInfo();
741 MLI = &getAnalysis<MachineLoopInfo>();
742 Traces = &getAnalysis<MachineTraceMetrics>();
743 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
744 MBFI = (PSI && PSI->hasProfileSummary()) ?
745 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
746 nullptr;
747 TraceEnsemble = nullptr;
748 OptSize = MF.getFunction().hasOptSize();
749 RegClassInfo.runOnMachineFunction(MF);
750
751 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
752 if (!TII->useMachineCombiner()) {
754 dbgs()
755 << " Skipping pass: Target does not support machine combiner\n");
756 return false;
757 }
758
759 bool Changed = false;
760
761 // Try to combine instructions.
762 for (auto &MBB : MF)
763 Changed |= combineInstructions(&MBB);
764
765 return Changed;
766}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
COFF::MachineTypes Machine
Definition: COFFYAML.cpp:371
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst)
Gets latency information for Inst, based on DC information.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*–===//
#define I(x, y, z)
Definition: MD5.cpp:58
static CombinerObjective getCombinerObjective(MachineCombinerPattern P)
static cl::opt< bool > VerifyPatternOrder("machine-combiner-verify-pattern-order", cl::Hidden, cl::desc("Verify that the generated patterns are ordered by increasing latency"), cl::init(false))
static cl::opt< unsigned > inc_threshold("machine-combiner-inc-threshold", cl::Hidden, cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500))
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVectorImpl< MachineInstr * > &InsInstrs, SmallVectorImpl< MachineInstr * > &DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, SparseSet< LiveRegUnit > &RegUnits, const TargetInstrInfo *TII, MachineCombinerPattern Pattern, bool IncrementalUpdate)
Inserts InsInstrs and deletes DelInstrs.
static cl::opt< bool > dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden, cl::desc("Dump all substituted intrs"), cl::init(false))
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
#define DEBUG_TYPE
unsigned const TargetRegisterInfo * TRI
#define P(N)
if(VerifyEach)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Represent the analysis usage information of a pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:680
The core instruction combiner logic.
Definition: InstCombiner.h:47
This is an alternative analysis pass to MachineBlockFrequencyInfo.
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Representation of each machine instruction.
Definition: MachineInstr.h:69
int findRegisterUseOperandIdx(Register Reg, bool isKill=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
int findRegisterDefOperandIdx(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:733
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register within the MachineFunction that corresponds to this MachineRegisterInfo object.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A trace ensemble is a collection of traces selected using the same strategy, for example 'minimum resource height'.
void invalidate(const MachineBasicBlock *MBB)
Invalidate traces through BadMBB.
void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, SparseSet< LiveRegUnit > &RegUnits)
Updates the depth of a machine instruction, given RegUnits.
A trace represents a plausible sequence of executed basic blocks that passes through the current basi...
InstrCycles getInstrCycles(const MachineInstr &MI) const
Return the depth and height of MI.
unsigned getInstrSlack(const MachineInstr &MI) const
Return the slack of MI.
bool isDepInTrace(const MachineInstr &DefMI, const MachineInstr &UseMI) const
A dependence is useful if the basic block of the defining instruction is part of the trace of the user instruction.
unsigned getResourceLength(ArrayRef< const MachineBasicBlock * > Extrablocks=std::nullopt, ArrayRef< const MCSchedClassDesc * > ExtraInstrs=std::nullopt, ArrayRef< const MCSchedClassDesc * > RemoveInstrs=std::nullopt) const
Return the resource length of the trace.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application-load time.
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
SparseSet - Fast set implementation for objects that can be identified by small unsigned keys.
Definition: SparseSet.h:124
iterator erase(iterator I)
erase - Erases an existing element identified by a valid iterator.
Definition: SparseSet.h:289
const_iterator begin() const
Definition: SparseSet.h:174
const_iterator end() const
Definition: SparseSet.h:175
void setUniverse(unsigned U)
setUniverse - Set the universe size which determines the largest key the set can hold.
Definition: SparseSet.h:155
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
char & MachineCombinerID
This pass performs instruction combining using trace metrics to estimate critical-path and resource depth.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineCombinerPattern
These are instruction patterns matched by the machine combiner pass.
void initializeMachineCombinerPass(PassRegistry &)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:253
unsigned Depth
Earliest issue cycle as determined by data dependencies and instruction latencies from the beginning of the trace.