LLVM 17.0.0git
MachineCombiner.cpp
Go to the documentation of this file.
1//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// The machine combiner pass uses machine trace metrics to ensure the combined
10// instructions do not lengthen the critical path or the resource depth.
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/DenseMap.h"
14#include "llvm/ADT/Statistic.h"
32#include "llvm/Support/Debug.h"
34
35using namespace llvm;
36
37#define DEBUG_TYPE "machine-combiner"
38
39STATISTIC(NumInstCombined, "Number of machineinst combined");
40
42inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
43 cl::desc("Incremental depth computation will be used for basic "
44 "blocks with more instructions."), cl::init(500));
45
46static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
47 cl::desc("Dump all substituted intrs"),
48 cl::init(false));
49
50#ifdef EXPENSIVE_CHECKS
52 "machine-combiner-verify-pattern-order", cl::Hidden,
54 "Verify that the generated patterns are ordered by increasing latency"),
55 cl::init(true));
56#else
58 "machine-combiner-verify-pattern-order", cl::Hidden,
60 "Verify that the generated patterns are ordered by increasing latency"),
61 cl::init(false));
62#endif
63
64namespace {
65class MachineCombiner : public MachineFunctionPass {
66 const TargetSubtargetInfo *STI;
67 const TargetInstrInfo *TII;
69 MCSchedModel SchedModel;
71 MachineLoopInfo *MLI; // Current MachineLoopInfo
72 MachineTraceMetrics *Traces;
73 MachineTraceMetrics::Ensemble *TraceEnsemble;
76 RegisterClassInfo RegClassInfo;
77
78 TargetSchedModel TSchedModel;
79
80 /// True if optimizing for code size.
81 bool OptSize;
82
83public:
84 static char ID;
85 MachineCombiner() : MachineFunctionPass(ID) {
87 }
88 void getAnalysisUsage(AnalysisUsage &AU) const override;
89 bool runOnMachineFunction(MachineFunction &MF) override;
90 StringRef getPassName() const override { return "Machine InstCombiner"; }
91
92private:
93 bool combineInstructions(MachineBasicBlock *);
94 MachineInstr *getOperandDef(const MachineOperand &MO);
95 bool isTransientMI(const MachineInstr *MI);
96 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
97 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
99 const MachineBasicBlock &MBB);
100 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
101 MachineTraceMetrics::Trace BlockTrace);
102 bool
103 improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
107 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
108 MachineCombinerPattern Pattern, bool SlackIsAccurate);
109 bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
113 bool preservesResourceLen(MachineBasicBlock *MBB,
117 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
119 std::pair<unsigned, unsigned>
120 getLatenciesForInstrSequences(MachineInstr &MI,
123 MachineTraceMetrics::Trace BlockTrace);
124
125 void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
127};
128}
129
130char MachineCombiner::ID = 0;
131char &llvm::MachineCombinerID = MachineCombiner::ID;
132
134 "Machine InstCombiner", false, false)
137INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
139
140void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
141 AU.setPreservesCFG();
142 AU.addPreserved<MachineDominatorTree>();
143 AU.addRequired<MachineLoopInfo>();
144 AU.addPreserved<MachineLoopInfo>();
145 AU.addRequired<MachineTraceMetrics>();
146 AU.addPreserved<MachineTraceMetrics>();
147 AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
148 AU.addRequired<ProfileSummaryInfoWrapperPass>();
150}
151
152MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
153 MachineInstr *DefInstr = nullptr;
154 // We need a virtual register definition.
155 if (MO.isReg() && MO.getReg().isVirtual())
156 DefInstr = MRI->getUniqueVRegDef(MO.getReg());
157 // PHI's have no depth etc.
158 if (DefInstr && DefInstr->isPHI())
159 DefInstr = nullptr;
160 return DefInstr;
161}
162
163/// Return true if MI is unlikely to generate an actual target instruction.
164bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
165 if (!MI->isCopy())
166 return MI->isTransient();
167
168 // If MI is a COPY, check if its src and dst registers can be coalesced.
169 Register Dst = MI->getOperand(0).getReg();
170 Register Src = MI->getOperand(1).getReg();
171
172 if (!MI->isFullCopy()) {
173 // If src RC contains super registers of dst RC, it can also be coalesced.
174 if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
175 return false;
176
177 auto SrcSub = MI->getOperand(1).getSubReg();
178 auto SrcRC = MRI->getRegClass(Src);
179 auto DstRC = MRI->getRegClass(Dst);
180 return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
181 }
182
183 if (Src.isPhysical() && Dst.isPhysical())
184 return Src == Dst;
185
186 if (Src.isVirtual() && Dst.isVirtual()) {
187 auto SrcRC = MRI->getRegClass(Src);
188 auto DstRC = MRI->getRegClass(Dst);
189 return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
190 }
191
192 if (Src.isVirtual())
193 std::swap(Src, Dst);
194
195 // Now Src is physical register, Dst is virtual register.
196 auto DstRC = MRI->getRegClass(Dst);
197 return DstRC->contains(Src);
198}
199
200/// Computes depth of instructions in vector \InsInstr.
201///
202/// \param InsInstrs is a vector of machine instructions
203/// \param InstrIdxForVirtReg is a dense map of virtual register to index
204/// of defining machine instruction in \p InsInstrs
205/// \param BlockTrace is a trace of machine instructions
206///
207/// \returns Depth of last instruction in \InsInstrs ("NewRoot")
208unsigned
209MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
210 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
212 const MachineBasicBlock &MBB) {
213 SmallVector<unsigned, 16> InstrDepth;
214 // For each instruction in the new sequence compute the depth based on the
215 // operands. Use the trace information when possible. For new operands which
216 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
217 for (auto *InstrPtr : InsInstrs) { // for each Use
218 unsigned IDepth = 0;
219 for (const MachineOperand &MO : InstrPtr->operands()) {
220 // Check for virtual register operand.
221 if (!(MO.isReg() && MO.getReg().isVirtual()))
222 continue;
223 if (!MO.isUse())
224 continue;
225 unsigned DepthOp = 0;
226 unsigned LatencyOp = 0;
228 InstrIdxForVirtReg.find(MO.getReg());
229 if (II != InstrIdxForVirtReg.end()) {
230 // Operand is new virtual register not in trace
231 assert(II->second < InstrDepth.size() && "Bad Index");
232 MachineInstr *DefInstr = InsInstrs[II->second];
233 assert(DefInstr &&
234 "There must be a definition for a new virtual register");
235 DepthOp = InstrDepth[II->second];
236 int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
237 int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
238 LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
239 InstrPtr, UseIdx);
240 } else {
241 MachineInstr *DefInstr = getOperandDef(MO);
242 if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
243 MachineTraceStrategy::TS_Local ||
244 DefInstr->getParent() == &MBB)) {
245 DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
246 if (!isTransientMI(DefInstr))
247 LatencyOp = TSchedModel.computeOperandLatency(
248 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
249 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
250 }
251 }
252 IDepth = std::max(IDepth, DepthOp + LatencyOp);
253 }
254 InstrDepth.push_back(IDepth);
255 }
256 unsigned NewRootIdx = InsInstrs.size() - 1;
257 return InstrDepth[NewRootIdx];
258}
259
260/// Computes instruction latency as max of latency of defined operands.
261///
262/// \param Root is a machine instruction that could be replaced by NewRoot.
263/// It is used to compute a more accurate latency information for NewRoot in
264/// case there is a dependent instruction in the same trace (\p BlockTrace)
265/// \param NewRoot is the instruction for which the latency is computed
266/// \param BlockTrace is a trace of machine instructions
267///
268/// \returns Latency of \p NewRoot
269unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
270 MachineTraceMetrics::Trace BlockTrace) {
271 // Check each definition in NewRoot and compute the latency
272 unsigned NewRootLatency = 0;
273
274 for (const MachineOperand &MO : NewRoot->operands()) {
275 // Check for virtual register operand.
276 if (!(MO.isReg() && MO.getReg().isVirtual()))
277 continue;
278 if (!MO.isDef())
279 continue;
280 // Get the first instruction that uses MO
281 MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
282 RI++;
283 if (RI == MRI->reg_end())
284 continue;
285 MachineInstr *UseMO = RI->getParent();
286 unsigned LatencyOp = 0;
287 if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
288 LatencyOp = TSchedModel.computeOperandLatency(
289 NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
290 UseMO->findRegisterUseOperandIdx(MO.getReg()));
291 } else {
292 LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
293 }
294 NewRootLatency = std::max(NewRootLatency, LatencyOp);
295 }
296 return NewRootLatency;
297}
298
299/// The combiner's goal may differ based on which pattern it is attempting
300/// to optimize.
302 MustReduceDepth, // The data dependency chain must be improved.
303 MustReduceRegisterPressure, // The register pressure must be reduced.
304 Default // The critical path must not be lengthened.
305};
306
308 // TODO: If C++ ever gets a real enum class, make this part of the
309 // MachineCombinerPattern class.
310 switch (P) {
311 case MachineCombinerPattern::REASSOC_AX_BY:
312 case MachineCombinerPattern::REASSOC_AX_YB:
313 case MachineCombinerPattern::REASSOC_XA_BY:
314 case MachineCombinerPattern::REASSOC_XA_YB:
315 case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
316 case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
317 case MachineCombinerPattern::SUBADD_OP1:
318 case MachineCombinerPattern::SUBADD_OP2:
319 case MachineCombinerPattern::FMADD_AX:
320 case MachineCombinerPattern::FMADD_XA:
321 case MachineCombinerPattern::FMSUB:
322 case MachineCombinerPattern::FNMSUB:
324 case MachineCombinerPattern::REASSOC_XY_BCA:
325 case MachineCombinerPattern::REASSOC_XY_BAC:
327 default:
329 }
330}
331
332/// Estimate the latency of the new and original instruction sequence by summing
333/// up the latencies of the inserted and deleted instructions. This assumes
334/// that the inserted and deleted instructions are dependent instruction chains,
335/// which might not hold in all cases.
336std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
339 MachineTraceMetrics::Trace BlockTrace) {
340 assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
341 unsigned NewRootLatency = 0;
342 // NewRoot is the last instruction in the \p InsInstrs vector.
343 MachineInstr *NewRoot = InsInstrs.back();
344 for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
345 NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
346 NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
347
348 unsigned RootLatency = 0;
349 for (auto *I : DelInstrs)
350 RootLatency += TSchedModel.computeInstrLatency(I);
351
352 return {NewRootLatency, RootLatency};
353}
354
355bool MachineCombiner::reduceRegisterPressure(
360 // FIXME: for now, we don't do any check for the register pressure patterns.
361 // We treat them as always profitable. But we can do better if we make
362 // RegPressureTracker class be aware of TIE attribute. Then we can get an
363 // accurate compare of register pressure with DelInstrs or InsInstrs.
364 return true;
365}
366
367/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
368/// The new code sequence ends in MI NewRoot. A necessary condition for the new
369/// sequence to replace the old sequence is that it cannot lengthen the critical
370/// path. The definition of "improve" may be restricted by specifying that the
371/// new path improves the data dependency chain (MustReduceDepth).
372bool MachineCombiner::improvesCriticalPathLen(
377 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
379 bool SlackIsAccurate) {
380 // Get depth and latency of NewRoot and Root.
381 unsigned NewRootDepth =
382 getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
383 unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
384
385 LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
386 << NewRootDepth << "\tRootDepth: " << RootDepth);
387
388 // For a transform such as reassociation, the cost equation is
389 // conservatively calculated so that we must improve the depth (data
390 // dependency cycles) in the critical path to proceed with the transform.
391 // Being conservative also protects against inaccuracies in the underlying
392 // machine trace metrics and CPU models.
394 LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
395 LLVM_DEBUG(NewRootDepth < RootDepth
396 ? dbgs() << "\t and it does it\n"
397 : dbgs() << "\t but it does NOT do it\n");
398 return NewRootDepth < RootDepth;
399 }
400
401 // A more flexible cost calculation for the critical path includes the slack
402 // of the original code sequence. This may allow the transform to proceed
403 // even if the instruction depths (data dependency cycles) become worse.
404
405 // Account for the latency of the inserted and deleted instructions by
406 unsigned NewRootLatency, RootLatency;
407 std::tie(NewRootLatency, RootLatency) =
408 getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
409
410 unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
411 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
412 unsigned OldCycleCount =
413 RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
414 LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
415 << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
416 << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
417 << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
418 << "\n\tRootDepth + RootLatency + RootSlack = "
419 << OldCycleCount;);
420 LLVM_DEBUG(NewCycleCount <= OldCycleCount
421 ? dbgs() << "\n\t It IMPROVES PathLen because"
422 : dbgs() << "\n\t It DOES NOT improve PathLen because");
423 LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
424 << ", OldCycleCount = " << OldCycleCount << "\n");
425
426 return NewCycleCount <= OldCycleCount;
427}
428
429/// helper routine to convert instructions into SC
430void MachineCombiner::instr2instrSC(
433 for (auto *InstrPtr : Instrs) {
434 unsigned Opc = InstrPtr->getOpcode();
435 unsigned Idx = TII->get(Opc).getSchedClass();
436 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
437 InstrsSC.push_back(SC);
438 }
439}
440
441/// True when the new instructions do not increase resource length
442bool MachineCombiner::preservesResourceLen(
446 if (!TSchedModel.hasInstrSchedModel())
447 return true;
448
449 // Compute current resource length
450
451 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
452 SmallVector <const MachineBasicBlock *, 1> MBBarr;
453 MBBarr.push_back(MBB);
454 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
455
456 // Deal with SC rather than Instructions.
459
460 instr2instrSC(InsInstrs, InsInstrsSC);
461 instr2instrSC(DelInstrs, DelInstrsSC);
462
463 ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
464 ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};
465
466 // Compute new resource length.
467 unsigned ResLenAfterCombine =
468 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
469
470 LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
471 << ResLenBeforeCombine
472 << " and after: " << ResLenAfterCombine << "\n";);
474 ResLenAfterCombine <=
475 ResLenBeforeCombine + TII->getExtendResourceLenLimit()
476 ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
477 : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
478 "Length\n");
479
480 return ResLenAfterCombine <=
481 ResLenBeforeCombine + TII->getExtendResourceLenLimit();
482}
483
484/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
485/// depths if requested.
486///
487/// \param MBB basic block to insert instructions in
488/// \param MI current machine instruction
489/// \param InsInstrs new instructions to insert in \p MBB
490/// \param DelInstrs instruction to delete from \p MBB
491/// \param TraceEnsemble is a pointer to the machine trace information
492/// \param RegUnits set of live registers, needed to compute instruction depths
493/// \param TII is target instruction info, used to call target hook
494/// \param Pattern is used to call target hook finalizeInsInstrs
495/// \param IncrementalUpdate if true, compute instruction depths incrementally,
496/// otherwise invalidate the trace
501 MachineTraceMetrics::Ensemble *TraceEnsemble,
503 MachineCombinerPattern Pattern, bool IncrementalUpdate) {
504 // If we want to fix up some placeholder for some target, do it now.
505 // We need this because in genAlternativeCodeSequence, we have not decided the
506 // better pattern InsInstrs or DelInstrs, so we don't want generate some
507 // sideeffect to the function. For example we need to delay the constant pool
508 // entry creation here after InsInstrs is selected as better pattern.
509 // Otherwise the constant pool entry created for InsInstrs will not be deleted
510 // even if InsInstrs is not the better pattern.
511 TII->finalizeInsInstrs(MI, Pattern, InsInstrs);
512
513 for (auto *InstrPtr : InsInstrs)
515
516 for (auto *InstrPtr : DelInstrs) {
517 InstrPtr->eraseFromParent();
518 // Erase all LiveRegs defined by the removed instruction
519 for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
520 if (I->MI == InstrPtr)
521 I = RegUnits.erase(I);
522 else
523 I++;
524 }
525 }
526
527 if (IncrementalUpdate)
528 for (auto *InstrPtr : InsInstrs)
529 TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
530 else
531 TraceEnsemble->invalidate(MBB);
532
533 NumInstCombined++;
534}
535
536// Check that the difference between original and new latency is decreasing for
537// later patterns. This helps to discover sub-optimal pattern orderings.
538void MachineCombiner::verifyPatternOrder(
541 long PrevLatencyDiff = std::numeric_limits<long>::max();
542 (void)PrevLatencyDiff; // Variable is used in assert only.
543 for (auto P : Patterns) {
546 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
547 TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
548 InstrIdxForVirtReg);
549 // Found pattern, but did not generate alternative sequence.
550 // This can happen e.g. when an immediate could not be materialized
551 // in a single instruction.
552 if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
553 continue;
554
555 unsigned NewRootLatency, RootLatency;
556 std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
557 Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
558 long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
559 assert(CurrentLatencyDiff <= PrevLatencyDiff &&
560 "Current pattern is better than previous pattern.");
561 PrevLatencyDiff = CurrentLatencyDiff;
562 }
563}
564
565/// Substitute a slow code sequence with a faster one by
566/// evaluating instruction combining pattern.
567/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
568/// combining based on machine trace metrics. Only combine a sequence of
569/// instructions when this neither lengthens the critical path nor increases
570/// resource pressure. When optimizing for codesize always combine when the new
571/// sequence is shorter.
572bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
573 bool Changed = false;
574 LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
575
576 bool IncrementalUpdate = false;
577 auto BlockIter = MBB->begin();
578 decltype(BlockIter) LastUpdate;
579 // Check if the block is in a loop.
580 const MachineLoop *ML = MLI->getLoopFor(MBB);
581 if (!TraceEnsemble)
582 TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
583
584 SparseSet<LiveRegUnit> RegUnits;
585 RegUnits.setUniverse(TRI->getNumRegUnits());
586
587 bool OptForSize = OptSize || llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
588
589 bool DoRegPressureReduce =
590 TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);
591
592 while (BlockIter != MBB->end()) {
593 auto &MI = *BlockIter++;
595 // The motivating example is:
596 //
597 // MUL Other MUL_op1 MUL_op2 Other
598 // \ / \ | /
599 // ADD/SUB => MADD/MSUB
600 // (=Root) (=NewRoot)
601
602 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
603 // usually beneficial for code size it unfortunately can hurt performance
604 // when the ADD is on the critical path, but the MUL is not. With the
605 // substitution the MUL becomes part of the critical path (in form of the
606 // MADD) and can lengthen it on architectures where the MADD latency is
607 // longer than the ADD latency.
608 //
609 // For each instruction we check if it can be the root of a combiner
610 // pattern. Then for each pattern the new code sequence in form of MI is
611 // generated and evaluated. When the efficiency criteria (don't lengthen
612 // critical path, don't use more resources) is met the new sequence gets
613 // hooked up into the basic block before the old sequence is removed.
614 //
615 // The algorithm does not try to evaluate all patterns and pick the best.
616 // This is only an artificial restriction though. In practice there is
617 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
618 // based on an internal cost heuristic. If
619 // machine-combiner-verify-pattern-order is enabled, all patterns are
620 // checked to ensure later patterns do not provide better latency savings.
621
622 if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
623 continue;
624
626 verifyPatternOrder(MBB, MI, Patterns);
627
628 for (const auto P : Patterns) {
631 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
632 TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
633 InstrIdxForVirtReg);
634 // Found pattern, but did not generate alternative sequence.
635 // This can happen e.g. when an immediate could not be materialized
636 // in a single instruction.
637 if (InsInstrs.empty())
638 continue;
639
641 dbgs() << "\tFor the Pattern (" << (int)P
642 << ") these instructions could be removed\n";
643 for (auto const *InstrPtr : DelInstrs)
644 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
645 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
646 dbgs() << "\tThese instructions could replace the removed ones\n";
647 for (auto const *InstrPtr : InsInstrs)
648 InstrPtr->print(dbgs(), /*IsStandalone*/false, /*SkipOpers*/false,
649 /*SkipDebugLoc*/false, /*AddNewLine*/true, TII);
650 });
651
652 if (IncrementalUpdate && LastUpdate != BlockIter) {
653 // Update depths since the last incremental update.
654 TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
655 LastUpdate = BlockIter;
656 }
657
658 if (DoRegPressureReduce &&
661 if (MBB->size() > inc_threshold) {
662 // Use incremental depth updates for basic blocks above threshold
663 IncrementalUpdate = true;
664 LastUpdate = BlockIter;
665 }
666 if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
667 // Replace DelInstrs with InsInstrs.
668 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
669 RegUnits, TII, P, IncrementalUpdate);
670 Changed |= true;
671
672 // Go back to previous instruction as it may have ILP reassociation
673 // opportunity.
674 BlockIter--;
675 break;
676 }
677 }
678
679 if (ML && TII->isThroughputPattern(P)) {
680 LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
681 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
682 RegUnits, TII, P, IncrementalUpdate);
683 // Eagerly stop after the first pattern fires.
684 Changed = true;
685 break;
686 } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
687 LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
688 << InsInstrs.size() << " < "
689 << DelInstrs.size() << ")\n");
690 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
691 RegUnits, TII, P, IncrementalUpdate);
692 // Eagerly stop after the first pattern fires.
693 Changed = true;
694 break;
695 } else {
696 // For big basic blocks, we only compute the full trace the first time
697 // we hit this. We do not invalidate the trace, but instead update the
698 // instruction depths incrementally.
699 // NOTE: Only the instruction depths up to MI are accurate. All other
700 // trace information is not updated.
701 MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
702 Traces->verifyAnalysis();
703 if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
704 InstrIdxForVirtReg, P,
705 !IncrementalUpdate) &&
706 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
707 if (MBB->size() > inc_threshold) {
708 // Use incremental depth updates for basic blocks above treshold
709 IncrementalUpdate = true;
710 LastUpdate = BlockIter;
711 }
712
713 insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
714 RegUnits, TII, P, IncrementalUpdate);
715
716 // Eagerly stop after the first pattern fires.
717 Changed = true;
718 break;
719 }
720 // Cleanup instructions of the alternative code sequence. There is no
721 // use for them.
723 for (auto *InstrPtr : InsInstrs)
724 MF->deleteMachineInstr(InstrPtr);
725 }
726 InstrIdxForVirtReg.clear();
727 }
728 }
729
730 if (Changed && IncrementalUpdate)
731 Traces->invalidate(MBB);
732 return Changed;
733}
734
735bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
736 STI = &MF.getSubtarget();
737 TII = STI->getInstrInfo();
738 TRI = STI->getRegisterInfo();
739 SchedModel = STI->getSchedModel();
740 TSchedModel.init(STI);
741 MRI = &MF.getRegInfo();
742 MLI = &getAnalysis<MachineLoopInfo>();
743 Traces = &getAnalysis<MachineTraceMetrics>();
744 PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
745 MBFI = (PSI && PSI->hasProfileSummary()) ?
746 &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
747 nullptr;
748 TraceEnsemble = nullptr;
749 OptSize = MF.getFunction().hasOptSize();
750 RegClassInfo.runOnMachineFunction(MF);
751
752 LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
753 if (!TII->useMachineCombiner()) {
755 dbgs()
756 << " Skipping pass: Target does not support machine combiner\n");
757 return false;
758 }
759
760 bool Changed = false;
761
762 // Try to combine instructions.
763 for (auto &MBB : MF)
764 Changed |= combineInstructions(&MBB);
765
766 return Changed;
767}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst)
Gets latency information for Inst, based on DC information.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*–===//
#define I(x, y, z)
Definition: MD5.cpp:58
static CombinerObjective getCombinerObjective(MachineCombinerPattern P)
static cl::opt< bool > VerifyPatternOrder("machine-combiner-verify-pattern-order", cl::Hidden, cl::desc("Verify that the generated patterns are ordered by increasing latency"), cl::init(false))
static cl::opt< unsigned > inc_threshold("machine-combiner-inc-threshold", cl::Hidden, cl::desc("Incremental depth computation will be used for basic " "blocks with more instructions."), cl::init(500))
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVector< MachineInstr *, 16 > InsInstrs, SmallVector< MachineInstr *, 16 > DelInstrs, MachineTraceMetrics::Ensemble *TraceEnsemble, SparseSet< LiveRegUnit > &RegUnits, const TargetInstrInfo *TII, MachineCombinerPattern Pattern, bool IncrementalUpdate)
Inserts InsInstrs and deletes DelInstrs.
static cl::opt< bool > dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden, cl::desc("Dump all substituted intrs"), cl::init(false))
CombinerObjective
The combiner's goal may differ based on which pattern it is attempting to optimize.
#define DEBUG_TYPE
unsigned const TargetRegisterInfo * TRI
#define P(N)
if(VerifyEach)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
Represent the analysis usage information of a pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
iterator end()
Definition: DenseMap.h:84
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:649
The core instruction combiner logic.
Definition: InstCombiner.h:45
This is an alternative analysis pass to MachineBlockFrequencyInfo.
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void deleteMachineInstr(MachineInstr *MI)
DeleteMachineInstr - Delete the given MachineInstr.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Representation of each machine instruction.
Definition: MachineInstr.h:68
int findRegisterUseOperandIdx(Register Reg, bool isKill=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:313
int findRegisterDefOperandIdx(Register Reg, bool isDead=false, bool Overlap=false, const TargetRegisterInfo *TRI=nullptr) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
iterator_range< mop_iterator > operands()
Definition: MachineInstr.h:641
bool isPHI() const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A trace ensemble is a collection of traces selected using the same strategy, for example 'minimum res...
void invalidate(const MachineBasicBlock *MBB)
Invalidate traces through BadMBB.
void updateDepth(TraceBlockInfo &TBI, const MachineInstr &, SparseSet< LiveRegUnit > &RegUnits)
Updates the depth of an machine instruction, given RegUnits.
A trace represents a plausible sequence of executed basic blocks that passes through the current basi...
InstrCycles getInstrCycles(const MachineInstr &MI) const
Return the depth and height of MI.
unsigned getInstrSlack(const MachineInstr &MI) const
Return the slack of MI.
bool isDepInTrace(const MachineInstr &DefMI, const MachineInstr &UseMI) const
A dependence is useful if the basic block of the defining instruction is part of the trace of the use...
unsigned getResourceLength(ArrayRef< const MachineBasicBlock * > Extrablocks=std::nullopt, ArrayRef< const MCSchedClassDesc * > ExtraInstrs=std::nullopt, ArrayRef< const MCSchedClassDesc * > RemoveInstrs=std::nullopt) const
Return the resource length of the trace.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
SparseSet - Fast set implementation for objects that can be identified by small unsigned keys.
Definition: SparseSet.h:124
iterator erase(iterator I)
erase - Erases an existing element identified by a valid iterator.
Definition: SparseSet.h:288
const_iterator begin() const
Definition: SparseSet.h:174
const_iterator end() const
Definition: SparseSet.h:175
void setUniverse(unsigned U)
setUniverse - Set the universe size which determines the largest key the set can hold.
Definition: SparseSet.h:155
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
char & MachineCombinerID
This pass performs instruction combining using trace metrics to estimate critical-path and resource d...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineCombinerPattern
These are instruction patterns matched by the machine combiner pass.
void initializeMachineCombinerPass(PassRegistry &)
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:109
Machine model for scheduling, bundling, and heuristics.
Definition: MCSchedule.h:244
unsigned Depth
Earliest issue cycle as determined by data dependencies and instruction latencies from the beginning ...