X86CmovConversion.cpp
1//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file implements a pass that converts X86 cmov instructions into
11/// branches when profitable. This pass is conservative. It transforms if and
12/// only if it can guarantee a gain with high confidence.
13///
14/// Thus, the optimization applies under the following conditions:
15/// 1. Consider as candidates only CMOVs in innermost loops (assume that
16/// most hotspots are represented by these loops).
17/// 2. Given a group of CMOV instructions that are using the same EFLAGS def
18/// instruction:
19/// a. Consider them as candidates only if all have the same condition code
20/// or the opposite one to prevent generating more than one conditional
21/// jump per EFLAGS def instruction.
22/// b. Consider them as candidates only if all are profitable to be
23/// converted (assume that one bad conversion may cause a degradation).
24/// 3. Apply conversion only for loops that are found profitable and only for
25/// CMOV candidates that were found profitable.
26/// a. A loop is considered profitable only if conversion will reduce its
27/// depth cost by some threshold.
28/// b. CMOV is considered profitable if the cost of its condition is higher
29/// than the average cost of its true-value and false-value by 25% of
30/// branch-misprediction-penalty. This assures no degradation even with
31/// 25% branch misprediction.
32///
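/// For illustration (hypothetical source, not from the LLVM tree): in a hot
/// innermost loop such as
///   for (...) { acc = slow_condition(i) ? acc + a[i] : acc; }
/// the select lowers to a CMOV whose result waits on the slow condition every
/// iteration; converting it into a predicted branch lets the loop-carried
/// value proceed speculatively instead of stalling on the condition's latency.
///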
33/// Note: This pass is assumed to run on SSA machine code.
34//
35//===----------------------------------------------------------------------===//
36//
37// External interfaces:
38// FunctionPass *llvm::createX86CmovConverterPass();
39// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
40//
41//===----------------------------------------------------------------------===//
42
43#include "X86.h"
44#include "X86InstrInfo.h"
45#include "llvm/ADT/ArrayRef.h"
46#include "llvm/ADT/DenseMap.h"
47#include "llvm/ADT/STLExtras.h"
48#include "llvm/ADT/SmallPtrSet.h"
49#include "llvm/ADT/SmallVector.h"
50#include "llvm/ADT/Statistic.h"
51#include "llvm/CodeGen/MachineBasicBlock.h"
52#include "llvm/CodeGen/MachineFunction.h"
53#include "llvm/CodeGen/MachineFunctionPass.h"
54#include "llvm/CodeGen/MachineInstr.h"
55#include "llvm/CodeGen/MachineInstrBuilder.h"
56#include "llvm/CodeGen/MachineLoopInfo.h"
57#include "llvm/CodeGen/MachineOperand.h"
58#include "llvm/CodeGen/MachineRegisterInfo.h"
59#include "llvm/CodeGen/TargetInstrInfo.h"
60#include "llvm/CodeGen/TargetRegisterInfo.h"
61#include "llvm/CodeGen/TargetSchedule.h"
62#include "llvm/CodeGen/TargetSubtargetInfo.h"
63#include "llvm/IR/DebugLoc.h"
64#include "llvm/InitializePasses.h"
65#include "llvm/MC/MCSchedule.h"
66#include "llvm/Pass.h"
67#include "llvm/Support/CommandLine.h"
68#include "llvm/Support/Debug.h"
69#include "llvm/Support/raw_ostream.h"
70#include "llvm/Target/CGPassBuilderOption.h"
71#include <algorithm>
72#include <cassert>
73#include <iterator>
74#include <utility>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "x86-cmov-conversion"
79
80STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
81STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
82STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
83STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
84
85// This internal switch can be used to turn off the cmov/branch optimization.
86static cl::opt<bool>
87 EnableCmovConverter("x86-cmov-converter",
88 cl::desc("Enable the X86 cmov-to-branch optimization."),
89 cl::init(true), cl::Hidden);
90
91static cl::opt<unsigned>
92 GainCycleThreshold("x86-cmov-converter-threshold",
93 cl::desc("Minimum gain per loop (in cycles) threshold."),
94 cl::init(4), cl::Hidden);
95
96static cl::opt<bool> ForceMemOperand(
97 "x86-cmov-converter-force-mem-operand",
98 cl::desc("Convert cmovs to branches whenever they have memory operands."),
99 cl::init(true), cl::Hidden);
100
101static cl::opt<bool> ForceAll(
102 "x86-cmov-converter-force-all",
103 cl::desc("Convert all cmovs to branches."),
104 cl::init(false), cl::Hidden);
105
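// Illustrative usage (assumption: invoking the backend via llc on some input
// foo.ll; these are the hidden flags declared above):
//   llc -x86-cmov-converter=false foo.ll        disables the pass entirely
//   llc -x86-cmov-converter-force-all foo.ll    converts every CMOV group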
106namespace {
107
108/// Converts X86 cmov instructions into branches when profitable.
109class X86CmovConverterPass : public MachineFunctionPass {
110public:
111 X86CmovConverterPass() : MachineFunctionPass(ID) { }
112
113 StringRef getPassName() const override { return "X86 cmov Conversion"; }
114 bool runOnMachineFunction(MachineFunction &MF) override;
115 void getAnalysisUsage(AnalysisUsage &AU) const override;
116
117 /// Pass identification, replacement for typeid.
118 static char ID;
119
120private:
121 MachineRegisterInfo *MRI = nullptr;
122 const TargetInstrInfo *TII = nullptr;
123 const TargetRegisterInfo *TRI = nullptr;
124 MachineLoopInfo *MLI = nullptr;
125 TargetSchedModel TSchedModel;
126
127 /// List of consecutive CMOV instructions.
128 using CmovGroup = SmallVector<MachineInstr *, 2>;
129 using CmovGroups = SmallVector<CmovGroup, 2>;
130
131 /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
132 /// CmovInstGroups accordingly.
133 ///
134 /// \param Blocks List of blocks to process.
135 /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
136 /// \returns true iff it found any CMOV-group-candidate.
137 bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
138 CmovGroups &CmovInstGroups,
139 bool IncludeLoads = false);
140
141 /// Check if it is profitable to transform each CMOV-group-candidate into a
142 /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
143 ///
144 /// \param Blocks List of blocks to process.
145 /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
146 /// \returns true iff any CMOV-group-candidate remains.
147 bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
148 CmovGroups &CmovInstGroups);
149
150 /// Convert the given list of consecutive CMOV instructions into a branch.
151 ///
152 /// \param Group Consecutive CMOV instructions to be converted into branch.
153 void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
154};
155
156} // end anonymous namespace
157
158char X86CmovConverterPass::ID = 0;
159
160void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
161 MachineFunctionPass::getAnalysisUsage(AU);
162 AU.addRequired<MachineLoopInfoWrapperPass>();
163}
164
165bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
166 if (skipFunction(MF.getFunction()))
167 return false;
168 if (!EnableCmovConverter)
169 return false;
170
171 // If the SelectOptimize pass is enabled, cmovs have already been optimized.
172 if (!getCGPassBuilderOption().DisableSelectOptimize)
173 return false;
174
175 LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
176 << "**********\n");
177
178 bool Changed = false;
179 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
180 const TargetSubtargetInfo &STI = MF.getSubtarget();
181 MRI = &MF.getRegInfo();
182 TII = STI.getInstrInfo();
183 TRI = STI.getRegisterInfo();
184 TSchedModel.init(&STI);
185
186 // Before we handle the more subtle cases of register-register CMOVs inside
187 // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
188 // the ones with a memory operand (ForceMemOperand option). The latter kind of
189 // CMOV risks a stall waiting for the load to complete; speculative execution
190 // behind a branch is better suited to hide that latency on modern x86 chips.
191 if (ForceMemOperand || ForceAll) {
192 CmovGroups AllCmovGroups;
193 SmallVector<MachineBasicBlock *, 4> Blocks;
194 for (auto &MBB : MF)
195 Blocks.push_back(&MBB);
196 if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
197 for (auto &Group : AllCmovGroups) {
198 // Skip any group that doesn't contain at least one memory-operand cmov.
199 if (ForceMemOperand && !ForceAll &&
200 llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
201 continue;
202
203 // For CMOV groups which we can rewrite and which contain a memory load,
204 // always rewrite them. On x86, a CMOV will dramatically amplify any
205 // memory latency by blocking speculative execution.
206 Changed = true;
207 convertCmovInstsToBranches(Group);
208 }
209 }
210 // Early return as ForceAll converts all CmovGroups.
211 if (ForceAll)
212 return Changed;
213 }
214
215 //===--------------------------------------------------------------------===//
216 // Register-operand Conversion Algorithm
217 // ---------
218 // For each innermost loop
219 // collectCmovCandidates() {
220 // Find all CMOV-group-candidates.
221 // }
222 //
223 // checkForProfitableCmovCandidates() {
224 // * Calculate both loop-depth and optimized-loop-depth.
225 // * Use these depths to check for loop transformation profitability.
226 // * Check for CMOV-group-candidate transformation profitability.
227 // }
228 //
229 // For each profitable CMOV-group-candidate
230 // convertCmovInstsToBranches() {
231 // * Create FalseBB, SinkBB, Conditional branch to SinkBB.
232 // * Replace each CMOV instruction with a PHI instruction in SinkBB.
233 // }
234 //
235 // Note: For more details, see each function description.
236 //===--------------------------------------------------------------------===//
237
238 // Build up the loops in pre-order.
239 SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
240 // Note that we need to check size on each iteration as we accumulate child
241 // loops.
242 for (int i = 0; i < (int)Loops.size(); ++i)
243 for (MachineLoop *Child : Loops[i]->getSubLoops())
244 Loops.push_back(Child);
245
246 for (MachineLoop *CurrLoop : Loops) {
247 // Optimize only innermost loops.
248 if (!CurrLoop->getSubLoops().empty())
249 continue;
250
251 // List of consecutive CMOV instructions to be processed.
252 CmovGroups CmovInstGroups;
253
254 if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
255 continue;
256
257 if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
258 CmovInstGroups))
259 continue;
260
261 Changed = true;
262 for (auto &Group : CmovInstGroups)
263 convertCmovInstsToBranches(Group);
264 }
265
266 return Changed;
267}
268
269bool X86CmovConverterPass::collectCmovCandidates(
270 ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
271 bool IncludeLoads) {
272 //===--------------------------------------------------------------------===//
273 // Collect all CMOV-group-candidates and add them into CmovInstGroups.
274 //
275 // CMOV-group:
276 // CMOV instructions, in the same MBB, that use the same EFLAGS def instruction.
277 //
278 // CMOV-group-candidate:
279 // CMOV-group where all the CMOV instructions are
280 // 1. consecutive.
281 // 2. have the same condition code or the opposite one.
282 // 3. have only register operands (X86::CMOVrr).
283 //===--------------------------------------------------------------------===//
284 // List of possible improvement (TODO's):
285 // --------------------------------------
286 // TODO: Add support for X86::CMOVrm instructions.
287 // TODO: Add support for X86::SETcc instructions.
288 // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
289 //===--------------------------------------------------------------------===//
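// Illustrative example (schematic, not real MIR): given a block containing
//   CMP32rr %a, %b, implicit-def $eflags
//   %v1 = CMOV32rr %t1, %f1, COND_GE, implicit $eflags
//   %v2 = CMOV32rr %t2, %f2, COND_L,  implicit $eflags
// the two CMOVs form one CMOV-group-candidate: they are consecutive, use the
// same EFLAGS def, and their condition codes (GE and L) are opposite.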
290
291 // Current processed CMOV-Group.
292 CmovGroup Group;
293 for (auto *MBB : Blocks) {
294 Group.clear();
295 // Condition code of the first CMOV instruction in the currently processed
296 // range, and its opposite condition code.
297 X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
298 MemOpCC = X86::COND_INVALID;
299 // Indicator of a non-CMOVrr instruction in the currently processed range.
300 bool FoundNonCMOVInst = false;
301 // Indicator of whether the currently processed CMOV-group should be skipped.
302 bool SkipGroup = false;
303
304 for (auto &I : *MBB) {
305 // Skip debug instructions.
306 if (I.isDebugInstr())
307 continue;
308
309 X86::CondCode CC = X86::getCondFromCMov(I);
310 // Check if we found an X86::CMOVrr instruction. If it is marked as
311 // unpredictable, skip it and do not convert it to a branch.
312 if (CC != X86::COND_INVALID &&
313 !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
314 (IncludeLoads || !I.mayLoad())) {
315 if (Group.empty()) {
316 // We found the first CMOV in the range, reset flags.
317 FirstCC = CC;
318 FirstOppCC = X86::GetOppositeBranchCondition(CC);
319 // Clear out the prior group's memory operand CC.
320 MemOpCC = X86::COND_INVALID;
321 FoundNonCMOVInst = false;
322 SkipGroup = false;
323 }
324 Group.push_back(&I);
325 // Check if it is a non-consecutive CMOV instruction or it has a different
326 // condition code than FirstCC or FirstOppCC.
327 if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
328 // Mark the SkipGroup indicator to skip the currently processed CMOV-group.
329 SkipGroup = true;
330 if (I.mayLoad()) {
331 if (MemOpCC == X86::COND_INVALID)
332 // The first memory operand CMOV.
333 MemOpCC = CC;
334 else if (CC != MemOpCC)
335 // Can't handle mixed conditions with memory operands.
336 SkipGroup = true;
337 }
338 // Check if we were relying on zero-extending behavior of the CMOV.
339 if (!SkipGroup &&
340 llvm::any_of(
341 MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
342 [&](MachineInstr &UseI) {
343 return UseI.getOpcode() == X86::SUBREG_TO_REG;
344 }))
345 // FIXME: We should model the cost of using an explicit MOV to handle
346 // the zero-extension rather than just refusing to handle this.
347 SkipGroup = true;
348 continue;
349 }
350 // If Group is empty, keep looking for first CMOV in the range.
351 if (Group.empty())
352 continue;
353
354 // We found a non-X86::CMOVrr instruction.
355 FoundNonCMOVInst = true;
356 // Check if this instruction defines EFLAGS, to determine end of processed
357 // range, as there would be no more instructions using the current EFLAGS def.
358 if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
359 // Check if current processed CMOV-group should not be skipped and add
360 // it as a CMOV-group-candidate.
361 if (!SkipGroup)
362 CmovInstGroups.push_back(Group);
363 else
364 ++NumOfSkippedCmovGroups;
365 Group.clear();
366 }
367 }
368 // The end of the basic block is considered the end of the range; check if the
369 // current CMOV-group should not be skipped and add it as a CMOV-group-candidate.
370 if (Group.empty())
371 continue;
372 if (!SkipGroup)
373 CmovInstGroups.push_back(Group);
374 else
375 ++NumOfSkippedCmovGroups;
376 }
377
378 NumOfCmovGroupCandidate += CmovInstGroups.size();
379 return !CmovInstGroups.empty();
380}
381
382/// \returns Depth of the CMOV instruction as if it were converted into a branch.
383/// \param TrueOpDepth depth cost of CMOV true value operand.
384/// \param FalseOpDepth depth cost of CMOV false value operand.
385static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
386 // The depth of the result after branch conversion is
387 // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
388 // As we have no info about branch weight, we assume 75% for one and 25% for
389 // the other, and pick the result with the largest resulting depth.
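  // Worked example (illustrative numbers): TrueOpDepth = 2 and FalseOpDepth = 6
  // give max(divideCeil(2*3 + 6, 4), divideCeil(6*3 + 2, 4)) = max(3, 5) = 5.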
390 return std::max(
391 divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
392 divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
393}
394
395bool X86CmovConverterPass::checkForProfitableCmovCandidates(
396 ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
397 struct DepthInfo {
398 /// Depth of original loop.
399 unsigned Depth;
400 /// Depth of optimized loop.
401 unsigned OptDepth;
402 };
403 /// Number of loop iterations to calculate depth for.
404 static const unsigned LoopIterations = 2;
405 DenseMap<MachineInstr *, DepthInfo> DepthMap;
406 DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
407 enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
408 /// For each register type maps the register to its last def instruction.
409 DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
410 /// Maps register operand to its def instruction, which can be nullptr if it
411 /// is unknown (e.g., operand is defined outside the loop).
412 DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;
413
414 // Set depth of unknown instruction (i.e., nullptr) to zero.
415 DepthMap[nullptr] = {0, 0};
416
417 SmallPtrSet<MachineInstr *, 4> CmovInstructions;
418 for (auto &Group : CmovInstGroups)
419 CmovInstructions.insert(Group.begin(), Group.end());
420
421 //===--------------------------------------------------------------------===//
422 // Step 1: Calculate instruction depth and loop depth.
423 // Optimized-Loop:
424 // loop with CMOV-group-candidates converted into branches.
425 //
426 // Instruction-Depth:
427 // instruction latency + max operand depth.
428 // * For CMOV instruction in optimized loop the depth is calculated as:
429 // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
430 // TODO: Find a better way to estimate the latency of the branch instruction
431 // rather than using the CMOV latency.
432 //
433 // Loop-Depth:
434 // max instruction depth of all instructions in the loop.
435 // Note: instruction with max depth represents the critical-path in the loop.
436 //
437 // Loop-Depth[i]:
438 // Loop-Depth calculated for first `i` iterations.
439 // Note: it is enough to calculate depth for up to two iterations.
440 //
441 // Depth-Diff[i]:
442 // Number of cycles saved in the first `i` iterations by optimizing the loop.
443 //===--------------------------------------------------------------------===//
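  // Worked example (illustrative, assumed latencies): a CMOV whose EFLAGS
  // condition has depth 5 and whose value operands have depths 1 and 3 gets
  // Depth = max(5, 1, 3) + CMOV latency in the original loop, but only
  // OptDepth = getDepthOfOptCmov(1, 3) + latency = 3 + latency in the optimized
  // loop, because the converted branch removes the condition from the
  // data-dependence chain of the result.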
444 for (DepthInfo &MaxDepth : LoopDepth) {
445 for (auto *MBB : Blocks) {
446 // Clear physical registers Def map.
447 RegDefMaps[PhyRegType].clear();
448 for (MachineInstr &MI : *MBB) {
449 // Skip debug instructions.
450 if (MI.isDebugInstr())
451 continue;
452 unsigned MIDepth = 0;
453 unsigned MIDepthOpt = 0;
454 bool IsCMOV = CmovInstructions.count(&MI);
455 for (auto &MO : MI.uses()) {
456 // Check for "isUse()" because "uses()" also returns implicit definitions.
457 if (!MO.isReg() || !MO.isUse())
458 continue;
459 Register Reg = MO.getReg();
460 auto &RDM = RegDefMaps[Reg.isVirtual()];
461 if (MachineInstr *DefMI = RDM.lookup(Reg)) {
462 OperandToDefMap[&MO] = DefMI;
463 DepthInfo Info = DepthMap.lookup(DefMI);
464 MIDepth = std::max(MIDepth, Info.Depth);
465 if (!IsCMOV)
466 MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
467 }
468 }
469
470 if (IsCMOV)
471 MIDepthOpt = getDepthOfOptCmov(
472 DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
473 DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);
474
475 // Iterates over all operands to handle implicit definitions as well.
476 for (auto &MO : MI.operands()) {
477 if (!MO.isReg() || !MO.isDef())
478 continue;
479 Register Reg = MO.getReg();
480 RegDefMaps[Reg.isVirtual()][Reg] = &MI;
481 }
482
483 unsigned Latency = TSchedModel.computeInstrLatency(&MI);
484 DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
485 MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
486 MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
487 }
488 }
489 }
490
491 unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
492 LoopDepth[1].Depth - LoopDepth[1].OptDepth};
493
494 //===--------------------------------------------------------------------===//
495 // Step 2: Check if the loop is worth optimizing.
496 // Worth-Optimize-Loop:
497 // case 1: Diff[1] == Diff[0]
498 // Critical-path is iteration independent - there is no dependency
499 // of critical-path instructions on critical-path instructions of
500 // previous iteration.
501 // Thus, it is enough to check the gain percent of the 1st iteration -
502 // to be conservative, the optimized loop needs to have a depth that is
503 // 12.5% fewer cycles than the original loop, per iteration.
504 //
505 // case 2: Diff[1] > Diff[0]
506 // Critical-path is iteration dependent - there is dependency of
507 // critical-path instructions on critical-path instructions of
508 // previous iteration.
509 // Thus, check the gain percent of the 2nd iteration (similar to the
510 // previous case), but it is also required to check the gradient of
511 // the gain - the change in Depth-Diff compared to the change in
512 // Loop-Depth between 1st and 2nd iterations.
513 // To be conservative, the gradient needs to be at least 50%.
514 //
515 // In addition, in order not to optimize loops with a very small gain, the
516 // gain (in cycles) after the 2nd iteration should not be less than a given
517 // threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
518 //
519 // If loop is not worth optimizing, remove all CMOV-group-candidates.
520 //===--------------------------------------------------------------------===//
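  // Worked example (illustrative numbers, not measured data): suppose
  // LoopDepth[0] = {Depth: 24, OptDepth: 20} and LoopDepth[1] = {Depth: 30,
  // OptDepth: 22}, so Diff = {4, 8}. Diff[1] (8) meets the default
  // GainCycleThreshold (4); since Diff[1] > Diff[0], case 2 applies: the
  // gradient check (8 - 4) * 2 >= (30 - 24) holds and 8 * 8 >= 30 holds, so
  // the loop is considered worth optimizing.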
521 if (Diff[1] < GainCycleThreshold)
522 return false;
523
524 bool WorthOptLoop = false;
525 if (Diff[1] == Diff[0])
526 WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
527 else if (Diff[1] > Diff[0])
528 WorthOptLoop =
529 (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
530 (Diff[1] * 8 >= LoopDepth[1].Depth);
531
532 if (!WorthOptLoop)
533 return false;
534
535 ++NumOfLoopCandidate;
536
537 //===--------------------------------------------------------------------===//
538 // Step 3: Check for each CMOV-group-candidate if it is worth optimizing.
539 // Worth-Optimize-Group:
540 // Iff it is worth optimizing all CMOV instructions in the group.
541 //
542 // Worth-Optimize-CMOV:
543 // Predicted branch is faster than CMOV by the difference between depth of
544 // condition operand and depth of taken (predicted) value operand.
545 // To be conservative, the gain of such a CMOV transformation should cover
546 // at least 25% of the branch-misprediction-penalty.
547 //===--------------------------------------------------------------------===//
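  // Worked example (illustrative numbers): with MispredictPenalty = 20 cycles,
  // a CMOV whose condition depth (CondCost) is 8 and whose value-operand depths
  // are both 2 (ValCost = 2) passes, since (8 - 2) * 4 = 24 >= 20; with
  // CondCost = 6 it fails, since (6 - 2) * 4 = 16 < 20, and the whole group is
  // dropped.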
548 unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
549 CmovGroups TempGroups;
550 std::swap(TempGroups, CmovInstGroups);
551 for (auto &Group : TempGroups) {
552 bool WorthOpGroup = true;
553 for (auto *MI : Group) {
554 // Avoid CMOV instructions whose value is used as a pointer to load from.
555 // This is another conservative check to avoid converting CMOV instructions
556 // used with tree-search-like algorithms, where the branch is unpredictable.
557 auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
558 if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
559 unsigned Op = UIs.begin()->getOpcode();
560 if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
561 WorthOpGroup = false;
562 break;
563 }
564 }
565
566 unsigned CondCost =
567 DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
568 unsigned ValCost = getDepthOfOptCmov(
569 DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
570 DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
571 if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
572 WorthOpGroup = false;
573 break;
574 }
575 }
576
577 if (WorthOpGroup)
578 CmovInstGroups.push_back(Group);
579 }
580
581 return !CmovInstGroups.empty();
582}
583
584static bool checkEFLAGSLive(MachineInstr *MI) {
585 if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
586 return false;
587
588 // The EFLAGS operand of MI might be missing a kill marker.
589 // Figure out whether the EFLAGS operand should be live after MI.
590 MachineBasicBlock *BB = MI->getParent();
591 MachineBasicBlock::iterator ItrMI = MI;
592
593 // Scan forward through BB for a use/def of EFLAGS.
594 for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
595 if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
596 return true;
597 if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
598 return false;
599 }
600
601 // We hit the end of the block, check whether EFLAGS is live into a successor.
602 for (MachineBasicBlock *Succ : BB->successors())
603 if (Succ->isLiveIn(X86::EFLAGS))
604 return true;
605
606 return false;
607}
608
609/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
610/// group of CMOV instructions, which may contain debug instructions in between,
611/// move all debug instructions to after the last CMOV instruction, making the
612/// CMOV group consecutive.
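///
/// For illustration (hypothetical layout): a group laid out as
///   %v1 = CMOV ..., DBG_VALUE for %v1, %v2 = CMOV ...
/// is rearranged into
///   %v1 = CMOV ..., %v2 = CMOV ..., DBG_VALUE for %v1
/// so callers can treat [First, Last] as a contiguous range of CMOVs.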
613static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
614 assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
615 "Last instruction in a CMOV group must be a CMOV instruction");
616
617 SmallVector<MachineInstr *, 2> DBGInstructions;
618 for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
619 if (I->isDebugInstr())
620 DBGInstructions.push_back(&*I);
621 }
622
623 // Splice the debug instructions after the CMOV group.
624 MachineBasicBlock *MBB = First->getParent();
625 for (auto *MI : DBGInstructions)
626 MBB->insertAfter(Last, MI->removeFromParent());
627}
628
629void X86CmovConverterPass::convertCmovInstsToBranches(
630 SmallVectorImpl<MachineInstr *> &Group) const {
631 assert(!Group.empty() && "No CMOV instructions to convert");
632 ++NumOfOptimizedCmovGroups;
633
634 // If the CMOV group is not packed, e.g., there are debug instructions between
635 // the first CMOV and the last CMOV, then pack the group and make the CMOV
636 // instructions consecutive by moving the debug instructions to after the last CMOV.
637 packCmovGroup(Group.front(), Group.back());
638
639 // To convert a CMOVcc instruction, we actually have to insert the diamond
640 // control-flow pattern. The incoming instruction knows the destination vreg
641 // to set, the condition code register to branch on, the true/false values to
642 // select between, and a branch opcode to use.
643
644 // Before
645 // -----
646 // MBB:
647 // cond = cmp ...
648 // v1 = CMOVge t1, f1, cond
649 // v2 = CMOVlt t2, f2, cond
650 // v3 = CMOVge v1, f3, cond
651 //
652 // After
653 // -----
654 // MBB:
655 // cond = cmp ...
656 // jge %SinkMBB
657 //
658 // FalseMBB:
659 // jmp %SinkMBB
660 //
661 // SinkMBB:
662 // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
663 // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
664 // ; true-value with false-value
665 // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
666 // ; previous Phi instruction result
667
668 MachineInstr &MI = *Group.front();
669 MachineInstr *LastCMOV = Group.back();
670 DebugLoc DL = MI.getDebugLoc();
671
672 X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
673 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
674 // Potentially swap the condition codes so that any memory operand to a CMOV
675 // is in the *false* position instead of the *true* position. We can invert
676 // any non-memory operand CMOV instructions to cope with this and we ensure
677 // memory operand CMOVs are only included with a single condition code.
678 if (llvm::any_of(Group, [&](MachineInstr *I) {
679 return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
680 }))
681 std::swap(CC, OppCC);
682
683 MachineBasicBlock *MBB = MI.getParent();
684 MachineFunction::iterator It = ++MBB->getIterator();
685 MachineFunction *F = MBB->getParent();
686 const BasicBlock *BB = MBB->getBasicBlock();
687
688 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
689 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
690 F->insert(It, FalseMBB);
691 F->insert(It, SinkMBB);
692
693 // If the EFLAGS register isn't dead in the terminator, then claim that it's
694 // live into the sink and copy blocks.
695 if (checkEFLAGSLive(LastCMOV)) {
696 FalseMBB->addLiveIn(X86::EFLAGS);
697 SinkMBB->addLiveIn(X86::EFLAGS);
698 }
699
700 // Transfer the remainder of BB and its successor edges to SinkMBB.
701 SinkMBB->splice(SinkMBB->begin(), MBB,
702 std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
703 SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
704
705 // Add the false and sink blocks as its successors.
706 MBB->addSuccessor(FalseMBB);
707 MBB->addSuccessor(SinkMBB);
708
709 // Create the conditional branch instruction.
710 BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
711
712 // Add the sink block to the false block successors.
713 FalseMBB->addSuccessor(SinkMBB);
714
718 std::next(MachineBasicBlock::iterator(LastCMOV));
719 MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
720 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
721
722 // First we need to insert an explicit load on the false path for any memory
723 // operand. We also need to potentially do register rewriting here, but it is
724 // simpler as the memory operands are always on the false path so we can
725 // simply take that input, whatever it is.
726 DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
727 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
728 auto &MI = *MIIt++;
729 // Skip any CMOVs in this group which don't load from memory.
730 if (!MI.mayLoad()) {
731 // Remember the false-side register input.
732 Register FalseReg =
733 MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
734 // Walk back through any intermediate cmovs referenced.
735 while (true) {
736 auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
737 if (FRIt == FalseBBRegRewriteTable.end())
738 break;
739 FalseReg = FRIt->second;
740 }
741 FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
742 continue;
743 }
744
745 // The condition must be the *opposite* of the one we've decided to branch
746 // on as the branch will go *around* the load and the load should happen
747 // when the CMOV condition is false.
748 assert(X86::getCondFromCMov(MI) == OppCC &&
749 "Can only handle memory-operand cmov instructions with a condition "
750 "opposite to the selected branch direction.");
751
752 // The goal is to rewrite the cmov from:
753 //
754 // MBB:
755 // %A = CMOVcc %B (tied), (mem)
756 //
757 // to
758 //
759 // MBB:
760 // %A = CMOVcc %B (tied), %C
761 // FalseMBB:
762 // %C = MOV (mem)
763 //
764 // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
765 //
766 // MBB:
767 // JMP!cc SinkMBB
768 // FalseMBB:
769 // %C = MOV (mem)
770 // SinkMBB:
771 // %A = PHI [ %C, FalseMBB ], [ %B, MBB]
772
773 // Get a fresh register to use as the destination of the MOV.
774 const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
775 Register TmpReg = MRI->createVirtualRegister(RC);
776
777 // Retain debug instr number when unfolded.
778 unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
779 SmallVector<MachineInstr *, 4> NewMIs;
780 bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
781 /*UnfoldLoad*/ true,
782 /*UnfoldStore*/ false, NewMIs);
783 (void)Unfolded;
784 assert(Unfolded && "Should never fail to unfold a loading cmov!");
785
786 // Move the new CMOV to just before the old one and reset any impacted
787 // iterator.
788 auto *NewCMOV = NewMIs.pop_back_val();
789 assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
790 "Last new instruction isn't the expected CMOV!");
791 LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
792 MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
793 if (&*MIItBegin == &MI)
794 MIItBegin = MachineBasicBlock::iterator(NewCMOV);
795
796 if (OldDebugInstrNum)
797 NewCMOV->setDebugInstrNum(OldDebugInstrNum);
798
799 // Sink whatever instructions were needed to produce the unfolded operand
800 // into the false block.
801 for (auto *NewMI : NewMIs) {
802 LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
803 FalseMBB->insert(FalseInsertionPoint, NewMI);
804 // Re-map any operands that are from other cmovs to the inputs for this block.
805 for (auto &MOp : NewMI->uses()) {
806 if (!MOp.isReg())
807 continue;
808 auto It = FalseBBRegRewriteTable.find(MOp.getReg());
809 if (It == FalseBBRegRewriteTable.end())
810 continue;
811
812 MOp.setReg(It->second);
813 // This might have been a kill when it referenced the cmov result, but
814 // it won't necessarily be once rewritten.
815 // FIXME: We could potentially improve this by tracking whether the
816 // operand to the cmov was also a kill, and then skipping the PHI node
817 // construction below.
818 MOp.setIsKill(false);
819 }
820 }
821 MBB->erase(&MI);
822
823 // Add this PHI to the rewrite table.
824 FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
825 }
826
827 // As we are creating the PHIs, we have to be careful if there is more than
828 // one. Later CMOVs may reference the results of earlier CMOVs, but later
829 // PHIs have to reference the individual true/false inputs from earlier PHIs.
830 // That also means that PHI construction must work forward from earlier to
831 // later, and that the code must maintain a mapping from each earlier PHI's
832 // destination register to the registers that went into that PHI.
833 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
834
835 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
836 Register DestReg = MIIt->getOperand(0).getReg();
837 Register Op1Reg = MIIt->getOperand(1).getReg();
838 Register Op2Reg = MIIt->getOperand(2).getReg();
839
840 // If the CMOV we are processing uses the opposite condition from the jump we
841 // generated, then we have to swap the operands for the PHI that is going to
842 // be generated.
843 if (X86::getCondFromCMov(*MIIt) == OppCC)
844 std::swap(Op1Reg, Op2Reg);
845
846 auto Op1Itr = RegRewriteTable.find(Op1Reg);
847 if (Op1Itr != RegRewriteTable.end())
848 Op1Reg = Op1Itr->second.first;
849
850 auto Op2Itr = RegRewriteTable.find(Op2Reg);
851 if (Op2Itr != RegRewriteTable.end())
852 Op2Reg = Op2Itr->second.second;
853
854 // SinkMBB:
855 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
856 // ...
857 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
858 .addReg(Op1Reg)
859 .addMBB(FalseMBB)
860 .addReg(Op2Reg)
861 .addMBB(MBB);
862 (void)MIB;
863 LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
864 LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());
865
866 // debug-info: we can just copy the instr-ref number from one instruction
867 // to the other, seeing how it's a one-for-one substitution.
868 if (unsigned InstrNum = MIIt->peekDebugInstrNum())
869 MIB->setDebugInstrNum(InstrNum);
870
871 // Add this PHI to the rewrite table.
872 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
873 }
874
875 // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
876 // the MachineVerifier during testing.
877 if (MIItBegin != MIItEnd)
878 F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
879
880 // Now remove the CMOV(s).
881 MBB->erase(MIItBegin, MIItEnd);
882
883 // Add new basic blocks to MachineLoopInfo.
884 if (MachineLoop *L = MLI->getLoopFor(MBB)) {
885 L->addBasicBlockToLoop(FalseMBB, *MLI);
886 L->addBasicBlockToLoop(SinkMBB, *MLI);
887 }
888}
889
890INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
891 false, false)
892INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
893INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
894 false, false)
895
896FunctionPass *llvm::createX86CmovConverterPass() {
897 return new X86CmovConverterPass();
898}