//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                               MachineTraceMetrics::Trace BlockTrace,
                               SmallVectorImpl<MachineInstr *> &InsInstrs,
                               SmallVectorImpl<MachineInstr *> &DelInstrs,
                               DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                               unsigned Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              unsigned Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<unsigned, 16> &Patterns);
  CombinerObjective getCombinerObjective(unsigned Pattern);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false,
                    false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addPreserved<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineTraceMetricsWrapperPass>();
  AU.addPreserved<MachineTraceMetricsWrapperPass>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}
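
// Illustrative note (not from the original source): with coalescing in mind,
// a full COPY between virtual registers of compatible classes is expected to
// be free, while a copy crossing register banks is not, e.g.:
//
//   %1:gpr64 = COPY %0:gpr64    ; same class, likely coalesced, "transient"
//   %2:fpr64 = COPY %3:gpr64    ; cross-bank move, a real instruction
//
// The AArch64-style register class names above are hypothetical and only
// illustrate the distinction this predicate draws.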

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands that
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx =
            DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        int UseIdx =
            InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr,
                DefInstr->findRegisterDefOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr),
                InstrPtr,
                InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}
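
// A hypothetical worked instance of the recurrence above: if NewRoot uses %a
// (depth 4 in the trace, operand latency 3 to NewRoot) and %b (depth 6,
// operand latency 1), then Depth(NewRoot) = max(4 + 3, 6 + 1) = 7. The
// register names and cycle counts are invented for illustration only.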

/// Computes instruction latency as the maximum latency of the defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}
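
// Illustrative sketch of the policy above, with invented numbers: if the
// first user of a value defined by NewRoot lies on the trace, the def-to-use
// operand latency (say 3 cycles) is taken; otherwise the conservative full
// instruction latency of NewRoot (say 4 cycles) is used instead.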

CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (Pattern) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return TII->getCombinerObjective(Pattern);
  }
}
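
// For instance, the REASSOC_* patterns rewrite a two-deep dependence chain
//   B = A op X ; C = B op Y   =>   B' = X op Y ; C = A op B'
// (which operands commute varies by pattern), a transform that only pays off
// by shortening the dependence chain, hence MustReduceDepth.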

/// Estimate the latency of the new and original instruction sequence by
/// summing up the latencies of the inserted and deleted instructions. This
/// assumes that the inserted and deleted instructions are dependent
/// instruction chains, which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}
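
// A made-up numeric example for the canonical MUL + ADD -> MADD rewrite:
// with DelInstrs = {MUL (3 cycles), ADD (1 cycle)} and InsInstrs = {MADD
// (4 cycles)}, this returns {4, 3 + 1} = {4, 4}. Real values come from the
// target's scheduling model, not from these invented latencies.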

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
  // FIXME: For now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute. Then we can get an
  // accurate comparison of register pressure with DelInstrs or InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t  and it does it\n"
                   : dbgs() << "\t  but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions by
  // summing up the latencies of the replacement and the replaced sequences.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}
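
// A hypothetical worked instance of the final comparison: with
// NewRootDepth = 5, NewRootLatency = 4, RootDepth = 4, RootLatency = 3 and
// RootSlack = 2, we get NewCycleCount = 9 <= OldCycleCount = 9, so the
// transform is accepted even though the raw depth regressed. All cycle
// counts here are invented for illustration.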

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n");
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}
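
// Resource length is the cycle count of the trace's most contended
// functional unit. A made-up illustration: if the multiply pipe bottlenecks
// the trace at 10 cycles and the rewrite trades one MUL-class op for one
// MADD-class op on that same pipe, the length stays 10 and the check passes;
// targets may additionally allow a small regression via
// TargetInstrInfo::getExtendResourceLenLimit().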

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
/// otherwise invalidate the trace
static void
insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                         SmallVectorImpl<MachineInstr *> &InsInstrs,
                         SmallVectorImpl<MachineInstr *> &DelInstrs,
                         MachineTraceMetrics::Ensemble *TraceEnsemble,
                         SparseSet<LiveRegUnit> &RegUnits,
                         const TargetInstrInfo *TII, unsigned Pattern,
                         bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because in genAlternativeCodeSequence, we have not yet
  // decided whether InsInstrs or DelInstrs is the better pattern, so we do
  // not want to generate side effects for the function. For example, we need
  // to delay the constant pool entry creation here until after InsInstrs is
  // selected as the better pattern. Otherwise the constant pool entry created
  // for InsInstrs would not be deleted even if InsInstrs is not the better
  // pattern.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(MachineBasicBlock *MBB,
                                         MachineInstr &Root,
                                         SmallVector<unsigned, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate an alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<unsigned, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate an alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have an ILP
          // reassociation opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary())
             ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
             : nullptr;
  TraceEnsemble = nullptr;
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}
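
// Usage sketch (not part of the original file): the pass can be exercised in
// isolation on MIR with, e.g.,
//   llc -run-pass=machine-combiner -debug-only=machine-combiner input.mir
// and tuned with the flags defined above, such as
//   -machine-combiner-inc-threshold=500 -machine-combiner-verify-pattern-order
// "input.mir" is a placeholder file name, and -debug-only requires an
// assertions-enabled build.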