//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative: it transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty. This assures no
///         degradation even with 25% branch misprediction.
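///         For example (illustrative numbers only): with a 20-cycle
///         misprediction penalty, a CMOV whose condition depth exceeds the
///         average depth of its true/false values by at least 5 cycles is
///         considered profitable to convert.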
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
// External interfaces:
//   FunctionPass *llvm::createX86CmovConversionLegacyPass();
//   bool X86CmovConversionLegacy::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>
using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceAll(
    "x86-cmov-converter-force-all",
    cl::desc("Convert all cmovs to branches."),
    cl::init(false), cl::Hidden);

namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConversionImpl {
public:
  X86CmovConversionImpl(MachineLoopInfo *MLI) : MLI(MLI) {}

  bool runOnMachineFunction(MachineFunction &MF);

private:
  MachineRegisterInfo *MRI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into a branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

class X86CmovConversionLegacy : public MachineFunctionPass {
public:
  X86CmovConversionLegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;
};

} // end anonymous namespace

char X86CmovConversionLegacy::ID = 0;

void X86CmovConversionLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfoWrapperPass>();
}

bool X86CmovConversionImpl::runOnMachineFunction(MachineFunction &MF) {
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << DEBUG_TYPE << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
  // the ones with a memory operand (ForceMemOperand option). Such a CMOV risks
  // a stall waiting for the load to complete, a case that speculative execution
  // behind a branch is better suited to handle on modern x86 chips.
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks(llvm::make_pointer_range(MF));
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory load,
        // always rewrite them. On x86, a CMOV will dramatically amplify any
        // memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    llvm::append_range(Loops, Loops[i]->getSubLoops());

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConversionImpl::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions in the same MBB that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions
  //     1. are consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // --------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//
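  // For illustration (simplified, not literal MIR), a typical
  // CMOV-group-candidate looks like:
  //   %eflags = CMP ...              ; single EFLAGS def
  //   %v1 = CMOV32rr %t1, %f1, cc    ; first CMOV with condition code cc
  //   %v2 = CMOV32rr %t2, %f2, !cc   ; the opposite condition code is also OK
  // Both CMOVs are consecutive, register-only, and read the same EFLAGS def.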

  // Currently processed CMOV-Group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator that the currently processed CMOV-group should be skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;

      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found an X86::CMOVrr instruction. If it is marked as
      // unpredictable, skip it and do not convert it to a branch.
      if (CC != X86::COND_INVALID &&
          !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
          (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range; reset the flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or if it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-Group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range. Check if
    // the currently processed CMOV-group should not be skipped and add it as a
    // CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into a branch.
/// \param TrueOpDepth depth cost of CMOV true-value operand.
/// \param FalseOpDepth depth cost of CMOV false-value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  //   TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weights, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
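  // For example (illustrative numbers): TrueOpDepth = 2 and FalseOpDepth = 8
  // give max(ceil((2 * 3 + 8) / 4), ceil((8 * 3 + 2) / 4)) = max(4, 7) = 7.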
  return std::max(
      divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
      divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}

bool X86CmovConversionImpl::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations to calculate depth for.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type, maps the register to its last def instruction.
  DenseMap<Register, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps a register operand to its def instruction, which can be nullptr if
  /// it is unknown (e.g., the operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set the depth of the unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert_range(Group);

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop the depth is calculated as
  //       CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical-path in the
  //       loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the loop.
  //===--------------------------------------------------------------------===//
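  // For example (illustrative): for `%c = ADD %a, %b` with a 1-cycle latency,
  // Depth(%c) = 1 + max(Depth(%a), Depth(%b)); the loop depth is the maximum
  // such value over all instructions in the loop body.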
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear the physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" because "uses()" also returns implicit
          // definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical-path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth 12.5% fewer cycles than the original loop, per iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical-path is iteration dependent - there is a dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
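  // Worked example (illustrative numbers): if Diff[0] == Diff[1] == 4 and
  // LoopDepth[0].Depth == 24, then 4 * 8 = 32 >= 24, so the loop is worth
  // optimizing (a 4-cycle gain on a 24-cycle critical path exceeds 12.5%),
  // provided Diff[1] also meets GainCycleThreshold.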
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate if it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than a CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
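  // Worked example (illustrative numbers): with MispredictPenalty == 16, a
  // CMOV whose condition depth (CondCost) is 9 and whose predicted-value depth
  // (ValCost) is 4 passes the check below, since (9 - 4) * 4 = 20 >= 16.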
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting CMOV
      // instructions used in tree-search-like algorithms, where the branch is
      // unpredictable.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (hasSingleElement(UIs)) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether the EFLAGS operand should be LIVE after the MI
  // instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return true;
    if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return false;
  }

  // We hit the end of the block; check whether EFLAGS is live into a successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in between,
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConversionImpl::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions between
  // the first CMOV and the last CMOV, then pack the group and make the CMOV
  // instructions consecutive by moving the debug instructions to after the last
  // CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::getCondFromCMov(MI);
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<Register, Register> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    // Retain the debug instr number when unfolded.
    unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    if (OldDebugInstrNum)
      NewCMOV->setDebugInstrNum(OldDebugInstrNum);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Add this PHI to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the registers that went into that PHI.
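  // For example (matching the earlier Before/After sketch): if %v1 was
  // produced by `%v1 = CMOVge %t1, %f1` and a later CMOV selects %v1 on the
  // MBB edge, its PHI must use %t1 (the value %v1 takes on that edge) rather
  // than %v1 itself.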
834 DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;
835
836 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
837 Register DestReg = MIIt->getOperand(0).getReg();
838 Register Op1Reg = MIIt->getOperand(1).getReg();
839 Register Op2Reg = MIIt->getOperand(2).getReg();
840
841 // If this CMOV we are processing is the opposite condition from the jump we
842 // generated, then we have to swap the operands for the PHI that is going to
843 // be generated.
844 if (X86::getCondFromCMov(*MIIt) == OppCC)
845 std::swap(Op1Reg, Op2Reg);
846
847 auto Op1Itr = RegRewriteTable.find(Op1Reg);
848 if (Op1Itr != RegRewriteTable.end())
849 Op1Reg = Op1Itr->second.first;
850
851 auto Op2Itr = RegRewriteTable.find(Op2Reg);
852 if (Op2Itr != RegRewriteTable.end())
853 Op2Reg = Op2Itr->second.second;
854
855 // SinkMBB:
856 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
857 // ...
858 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
859 .addReg(Op1Reg)
860 .addMBB(FalseMBB)
861 .addReg(Op2Reg)
862 .addMBB(MBB);
863 (void)MIB;
864 LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
865 LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());
866
867 // debug-info: we can just copy the instr-ref number from one instruction
868 // to the other, seeing how it's a one-for-one substitution.
869 if (unsigned InstrNum = MIIt->peekDebugInstrNum())
870 MIB->setDebugInstrNum(InstrNum);
871
872 // Add this PHI to the rewrite table.
873 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
874 }
875
  // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
  // the MachineVerifier during testing.
  if (MIItBegin != MIItEnd)
    F->getProperties().resetNoPHIs();

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add the new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, *MLI);
    L->addBasicBlockToLoop(SinkMBB, *MLI);
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConversionLegacy, DEBUG_TYPE,
                      "X86 cmov Conversion", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86CmovConversionLegacy, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConversionLegacyPass() {
  return new X86CmovConversionLegacy();
}

bool X86CmovConversionLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  X86CmovConversionImpl Impl(MLI);
  return Impl.runOnMachineFunction(MF);
}

PreservedAnalyses
X86CmovConversionPass::run(MachineFunction &MF,
                           MachineFunctionAnalysisManager &MFAM) {
  MachineLoopInfo *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
  X86CmovConversionImpl Impl(MLI);
  bool Changed = Impl.runOnMachineFunction(MF);
  return Changed ? getMachineFunctionPassPreservedAnalyses()
                 : PreservedAnalyses::all();
}