//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty. This ensures no
///         degradation even with 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
// External interfaces:
//   FunctionPass *llvm::createX86CmovConverterPass();
//   bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//
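//
// A minimal registration sketch (assumption: wiring similar to the X86
// target's TargetPassConfig; the exact hook used by the in-tree target may
// differ, and "MyX86PassConfig" is a hypothetical name):
//
//   bool MyX86PassConfig::addILPOpts() {
//     // Run while the machine code is still in SSA form, before regalloc.
//     addPass(createX86CmovConverterPass());
//     return true;
//   }
//
//===----------------------------------------------------------------------===//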

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceAll(
    "x86-cmov-converter-force-all",
    cl::desc("Convert all cmovs to branches."),
    cl::init(false), cl::Hidden);

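// Example (assumption: these hidden flags are reachable through the usual
// LLVM cl::opt plumbing, e.g. via llc):
//   llc -x86-cmov-converter=false foo.ll            - disable the pass entirely
//   llc -x86-cmov-converter-force-all=true foo.ll   - convert every cmov group
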
namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfo>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MLI = &getAnalysis<MachineLoopInfo>();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
  // the ones with a memory operand (ForceMemOperand option). The latter kind of
  // CMOV risks a stall waiting for the load to complete, something that
  // speculative execution behind a branch is better suited to handle on modern
  // x86 chips.
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks;
    for (auto &MBB : MF)
      Blocks.push_back(&MBB);
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory load,
        // always rewrite them. On x86, a CMOV will dramatically amplify any
        // memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  // For each innermost loop
  //   collectCmovCandidates() {
  //     Find all CMOV-group-candidates.
  //   }
  //
  //   checkForProfitableCmovCandidates() {
  //     * Calculate both loop-depth and optimized-loop-depth.
  //     * Use these depths to check for loop transformation profitability.
  //     * Check for CMOV-group-candidate transformation profitability.
  //   }
  //
  //   For each profitable CMOV-group-candidate
  //     convertCmovInstsToBranches() {
  //       * Create FalseBB, SinkBB, conditional branch to SinkBB.
  //       * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //     }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODO's):
  // ----------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//

  // Currently processed CMOV-Group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator for whether the currently processed CMOV-group should be
    // skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;
      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found an X86::CMOVrr instruction.
      if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range, reset flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-Group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS, to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range; check if
    // the currently processed CMOV-group should not be skipped and add it as a
    // CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into branch.
/// \param TrueOpDepth depth cost of CMOV true value operand.
/// \param FalseOpDepth depth cost of CMOV false value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weight, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
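  //
  // Worked example (hypothetical operand depths, not taken from any real
  // schedule): with TrueOpDepth = 2 and FalseOpDepth = 6 the two weighted
  // estimates are divideCeil(2*3 + 6, 4) = 3 and divideCeil(6*3 + 2, 4) = 5,
  // so the function conservatively returns 5.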
  return std::max(
      divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
      divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which the depth is calculated.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps register operand to its def instruction, which can be nullptr if it
  /// is unknown (e.g., operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set depth of unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated
  //     as: CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical-path in the
  //       loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
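  //
  // Illustrative example with assumed latencies (add = 1, imul = 3, cmov = 1;
  // not taken from any real schedule model), taking the condition's depth as 0:
  //   %a = ADD  %x, %y         ; depth = 1
  //   %f = IMUL %a, %a         ; depth = 1 + 3 = 4
  //   %r = CMOV %t, %f, cond   ; true-op depth 0, false-op depth 4
  //     original loop:  depth = 1 + max(0, 4, 0) = 5
  //     optimized loop: depth = 1 + getDepthOfOptCmov(0, 4) = 1 + 3 = 4
  // The branch form no longer waits unconditionally for the deep false
  // operand, which is where the potential gain comes from.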
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" as "uses()" also returns implicit definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical-path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth of 12.5% fewer cycles than the original loop, per
  //           iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical-path is iteration dependent - there is a dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
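  //
  // Worked example with made-up depths (case 2): LoopDepth[0] = {20, 16} and
  // LoopDepth[1] = {28, 18} give Diff[0] = 4 and Diff[1] = 10. Then
  //   threshold: Diff[1] = 10 >= GainCycleThreshold (default 4),
  //   gradient:  (10 - 4) * 2 = 12 >= 28 - 20 = 8, and
  //   gain:      10 * 8 = 80 >= 28,
  // so the loop would be considered worth optimizing under these assumptions.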
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than a CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
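  //
  // Worked example with made-up costs: with MispredictPenalty = 20, a CMOV
  // whose condition depth (CondCost) is 9 and whose value depths are 2 and 6
  // has ValCost = getDepthOfOptCmov(2, 6) = 5; four times the gain is
  // (9 - 5) * 4 = 16 < 20, so the group is rejected. With CondCost = 11 it is
  // (11 - 5) * 4 = 24 >= 20, and the group is kept.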
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting a CMOV
      // instruction used with a tree-search-like algorithm, where the branch
      // is unpredictable.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether the EFLAGS operand should be live after the MI
  // instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS))
      return true;
    if (I->definesRegister(X86::EFLAGS))
      return false;
  }

  // We hit the end of the block, check whether EFLAGS is live into a successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in between,
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions between
  // the first CMOV and the last CMOV, then pack the group and make the CMOV
  // instructions consecutive by moving the debug instructions to after the
  // last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Add this PHI to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers to the registers that went into the PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    Register DestReg = MIIt->getOperand(0).getReg();
    Register Op1Reg = MIIt->getOperand(1).getReg();
    Register Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are processing is the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //   ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add the new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, MLI->getBase());
    L->addBasicBlockToLoop(SinkMBB, MLI->getBase());
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}