//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instrs combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI = nullptr;
  MachineLoopInfo *MLI = nullptr; // Current MachineLoopInfo
  MachineTraceMetrics *Traces = nullptr;
  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
  MachineBlockFrequencyInfo *MBFI = nullptr;
  ProfileSummaryInfo *PSI = nullptr;
  RegisterClassInfo RegClassInfo;

  TargetSchedModel TSchedModel;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  bool isTransientMI(const MachineInstr *MI);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace,
                    const MachineBasicBlock &MBB);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                               MachineTraceMetrics::Trace BlockTrace,
                               SmallVectorImpl<MachineInstr *> &InsInstrs,
                               SmallVectorImpl<MachineInstr *> &DelInstrs,
                               DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                               unsigned Pattern, bool SlackIsAccurate);
  bool reduceRegisterPressure(MachineInstr &Root, MachineBasicBlock *MBB,
                              SmallVectorImpl<MachineInstr *> &InsInstrs,
                              SmallVectorImpl<MachineInstr *> &DelInstrs,
                              unsigned Pattern);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);

  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<unsigned, 16> &Patterns);
  CombinerObjective getCombinerObjective(unsigned Pattern);
};
} // end anonymous namespace

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTreeWrapperPass>();
  AU.addRequired<MachineLoopInfoWrapperPass>();
  AU.addPreserved<MachineLoopInfoWrapperPass>();
  AU.addRequired<MachineTraceMetricsWrapperPass>();
  AU.addPreserved<MachineTraceMetricsWrapperPass>();
  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
  AU.addRequired<ProfileSummaryInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *
MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && MO.getReg().isVirtual())
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  return DefInstr;
}

/// Return true if MI is unlikely to generate an actual target instruction.
bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
  if (!MI->isCopy())
    return MI->isTransient();

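  // A COPY that the register coalescer can later fold away is expected to
  // cost no cycles, so it is treated as transient; the checks below
  // approximate whether the copy is coalescable.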
  // If MI is a COPY, check if its src and dst registers can be coalesced.
  Register Dst = MI->getOperand(0).getReg();
  Register Src = MI->getOperand(1).getReg();

  if (!MI->isFullCopy()) {
    // If src RC contains super registers of dst RC, it can also be coalesced.
    if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
      return false;

    auto SrcSub = MI->getOperand(1).getSubReg();
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
  }

  if (Src.isPhysical() && Dst.isPhysical())
    return Src == Dst;

  if (Src.isVirtual() && Dst.isVirtual()) {
    auto SrcRC = MRI->getRegClass(Src);
    auto DstRC = MRI->getRegClass(Dst);
    return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
  }

  if (Src.isVirtual())
    std::swap(Src, Dst);

  // Now Src is a physical register and Dst is a virtual register.
  auto DstRC = MRI->getRegClass(Dst);
  return DstRC->contains(Src);
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
/// \param MBB is the basic block the new instructions will be inserted into
///
/// \returns Depth of the last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<Register, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace,
                          const MachineBasicBlock &MBB) {
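  // In effect this evaluates, for each inserted instruction I, the recurrence
  //   Depth(I) = max over virtual-register uses U of
  //              Depth(Def(U)) + Latency(Def(U) -> I)
  // where Def(U) is either an earlier entry of InsInstrs (found through
  // InstrIdxForVirtReg) or an instruction already accounted in BlockTrace.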
  SmallVector<unsigned, 16> InstrDepth;
  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map, depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->all_uses()) {
      // Check for virtual register operand.
      if (!MO.getReg().isVirtual())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<Register, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is a new virtual register not in the trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx =
            DefInstr->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        int UseIdx =
            InstrPtr->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr);
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
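        // Defs from other blocks are only trusted when the trace strategy is
        // non-local; the local strategy restricts depth information to the
        // current basic block.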
        if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                             MachineTraceStrategy::TS_Local ||
                         DefInstr->getParent() == &MBB)) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          if (!isTransientMI(DefInstr))
            LatencyOp = TSchedModel.computeOperandLatency(
                DefInstr,
                DefInstr->findRegisterDefOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr),
                InstrPtr,
                InstrPtr->findRegisterUseOperandIdx(MO.getReg(),
                                                    /*TRI=*/nullptr));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as the maximum latency of the defined
/// operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;

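  // For every register defined by NewRoot, use the operand latency to its
  // first user when that dependence lies on the trace; otherwise fall back to
  // the plain instruction latency. The result is the maximum over all defs.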
  for (const MachineOperand &MO : NewRoot->all_defs()) {
    // Check for virtual register operand.
    if (!MO.getReg().isVirtual())
      continue;
    // Get the first instruction that uses MO.
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot,
          NewRoot->findRegisterDefOperandIdx(MO.getReg(), /*TRI=*/nullptr),
          UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg(), /*TRI=*/nullptr));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

CombinerObjective MachineCombiner::getCombinerObjective(unsigned Pattern) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
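  // Reassociation does not change the number of instructions, so it is only
  // worthwhile when it strictly shortens the data dependency chain; hence
  // these patterns map to MustReduceDepth.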
  switch (Pattern) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return TII->getCombinerObjective(Pattern);
  }
}

/// Estimate the latency of the new and original instruction sequence by summing
/// up the latencies of the inserted and deleted instructions. This assumes
/// that the inserted and deleted instructions are dependent instruction chains,
/// which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto *I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

bool MachineCombiner::reduceRegisterPressure(
    MachineInstr &Root, MachineBasicBlock *MBB,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs, unsigned Pattern) {
  // FIXME: For now, we don't do any check for the register pressure patterns.
  // We treat them as always profitable. But we can do better if we make the
  // RegPressureTracker class aware of the TIE attribute. Then we can get an
  // accurate comparison of register pressure with DelInstrs or InsInstrs.
  return true;
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<Register, unsigned> &InstrIdxForVirtReg, unsigned Pattern,
    bool SlackIsAccurate) {
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth =
      getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace, *MBB);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << "  Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t  and it does it\n"
                   : dbgs() << "\t  but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.

  // Account for the latency of the inserted and deleted instructions.
  unsigned NewRootLatency, RootLatency;
  if (TII->accumulateInstrSeqToRootLatency(*Root)) {
    std::tie(NewRootLatency, RootLatency) =
        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
  } else {
    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
    RootLatency = TSchedModel.computeInstrLatency(Root);
  }

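  // The transform is accepted when the new critical path through NewRoot is
  // no longer than the old one, crediting the Root's slack to the old
  // sequence when the trace data is accurate:
  //   NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack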
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t  It IMPROVES PathLen because"
                 : dbgs() << "\n\t  It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute the current resource length.
  //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with scheduling classes (SC) rather than instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr{InsInstrsSC};
  ArrayRef<const MCSchedClassDesc *> MSCDelArr{DelInstrsSC};

  // Compute the new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n");
  LLVM_DEBUG(
      ResLenAfterCombine <=
              ResLenBeforeCombine + TII->getExtendResourceLenLimit()
          ? dbgs() << "\t\t  As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t  As result it DOES NOT improve/preserve Resource "
                      "Length\n");

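  // Targets may allow the resource length to grow by a bounded amount via
  // getExtendResourceLenLimit(); with the default of 0 the length must not
  // increase at all.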
  return ResLenAfterCombine <=
         ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param TraceEnsemble is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param TII is target instruction info, used to call target hook
/// \param Pattern is used to call target hook finalizeInsInstrs
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void
insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                         SmallVectorImpl<MachineInstr *> &InsInstrs,
                         SmallVectorImpl<MachineInstr *> &DelInstrs,
                         MachineTraceMetrics::Ensemble *TraceEnsemble,
                         LiveRegUnitSet &RegUnits, const TargetInstrInfo *TII,
                         unsigned Pattern, bool IncrementalUpdate) {
  // If we want to fix up some placeholder for some target, do it now.
  // We need this because genAlternativeCodeSequence runs before we have
  // decided whether the InsInstrs pattern is the better one, so it must not
  // introduce side effects into the function. For example, constant pool
  // entry creation is delayed until here, after InsInstrs has been selected
  // as the better pattern; otherwise a constant pool entry created for
  // InsInstrs would not be deleted even when InsInstrs is rejected.
  TII->finalizeInsInstrs(MI, Pattern, InsInstrs);

  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParent();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      TraceEnsemble->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    TraceEnsemble->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is decreasing for
// later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(MachineBasicBlock *MBB,
                                         MachineInstr &Root,
                                         SmallVector<unsigned, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.
  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<Register, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, TraceEnsemble->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size, always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
  decltype(BlockIter) LastUpdate;
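  // LastUpdate marks how far incremental depth information is valid; it is
  // advanced by the updateDepths call inside the pattern loop below.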
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!TraceEnsemble)
    TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());

  LiveRegUnitSet RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);

  bool DoRegPressureReduce =
      TII->shouldReduceRegisterPressure(MBB, &RegClassInfo);

  while (BlockIter != MBB->end()) {
    auto &MI = *BlockIter++;
    SmallVector<unsigned, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)

    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) is met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.

    if (!TII->getMachineCombinerPatterns(MI, Patterns, DoRegPressureReduce))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (const auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<Register, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (InsInstrs.empty())
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });

      if (IncrementalUpdate && LastUpdate != BlockIter) {
        // Update depths since the last incremental update.
        TraceEnsemble->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      if (DoRegPressureReduce &&
          getCombinerObjective(P) ==
              CombinerObjective::MustReduceRegisterPressure) {
        if (MBB->size() > inc_threshold) {
          // Use incremental depth updates for basic blocks above threshold.
          IncrementalUpdate = true;
          LastUpdate = BlockIter;
        }
        if (reduceRegisterPressure(MI, MBB, InsInstrs, DelInstrs, P)) {
          // Replace DelInstrs with InsInstrs.
          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);
          Changed |= true;

          // Go back to previous instruction as it may have ILP reassociation
          // opportunity.
          BlockIter--;
          break;
        }
      }

      if (ML && TII->isThroughputPattern(P)) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to throughput pattern in loop\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else if (OptForSize && InsInstrs.size() < DelInstrs.size()) {
        LLVM_DEBUG(dbgs() << "\t Replacing due to OptForSize ("
                          << InsInstrs.size() << " < "
                          << DelInstrs.size() << ")\n");
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                 RegUnits, TII, P, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, TraceEnsemble,
                                   RegUnits, TII, P, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->deleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  Traces = &getAnalysis<MachineTraceMetricsWrapperPass>().getMTM();
  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
  MBFI = (PSI && PSI->hasProfileSummary()) ?
         &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
         nullptr;
  TraceEnsemble = nullptr;
  RegClassInfo.runOnMachineFunction(MF);

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << "  Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}