LLVM 19.0.0git
SelectOptimize.cpp
Go to the documentation of this file.
1//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass converts selects to conditional jumps when profitable.
10//
11//===----------------------------------------------------------------------===//
12
15#include "llvm/ADT/Statistic.h"
22#include "llvm/CodeGen/Passes.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/Instruction.h"
35#include "llvm/Pass.h"
39#include <algorithm>
40#include <memory>
41#include <queue>
42#include <stack>
43
44using namespace llvm;
45using namespace llvm::PatternMatch;
46
47#define DEBUG_TYPE "select-optimize"
48
49STATISTIC(NumSelectOptAnalyzed,
50 "Number of select groups considered for conversion to branch");
51STATISTIC(NumSelectConvertedExpColdOperand,
52 "Number of select groups converted due to expensive cold operand");
53STATISTIC(NumSelectConvertedHighPred,
54 "Number of select groups converted due to high-predictability");
55STATISTIC(NumSelectUnPred,
56 "Number of select groups not converted due to unpredictability");
57STATISTIC(NumSelectColdBB,
58 "Number of select groups not converted due to cold basic block");
59STATISTIC(NumSelectConvertedLoop,
60 "Number of select groups converted due to loop-level analysis");
61STATISTIC(NumSelectsConverted, "Number of selects converted");
62
64 "cold-operand-threshold",
65 cl::desc("Maximum frequency of path for an operand to be considered cold."),
66 cl::init(20), cl::Hidden);
67
69 "cold-operand-max-cost-multiplier",
70 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
71 "slice of a cold operand to be considered inexpensive."),
73
75 GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
76 cl::desc("Gradient gain threshold (%)."),
77 cl::init(25), cl::Hidden);
78
80 GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
81 cl::desc("Minimum gain per loop (in cycles) threshold."),
83
85 "select-opti-loop-relative-gain-threshold",
87 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
89
91 "mispredict-default-rate", cl::Hidden, cl::init(25),
92 cl::desc("Default mispredict rate (initialized to 25%)."));
93
94static cl::opt<bool>
95 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
96 cl::init(false),
97 cl::desc("Disable loop-level heuristics."));
98
99namespace {
100
101class SelectOptimizeImpl {
102 const TargetMachine *TM = nullptr;
103 const TargetSubtargetInfo *TSI = nullptr;
104 const TargetLowering *TLI = nullptr;
105 const TargetTransformInfo *TTI = nullptr;
106 const LoopInfo *LI = nullptr;
108 ProfileSummaryInfo *PSI = nullptr;
109 OptimizationRemarkEmitter *ORE = nullptr;
110 TargetSchedModel TSchedModel;
111
112public:
113 SelectOptimizeImpl() = default;
114 SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
116 bool runOnFunction(Function &F, Pass &P);
117
119
120 struct CostInfo {
121 /// Predicated cost (with selects as conditional moves).
122 Scaled64 PredCost;
123 /// Non-predicated cost (with selects converted to branches).
124 Scaled64 NonPredCost;
125 };
126
127 /// SelectLike is an abstraction over SelectInst and other operations that can
128 /// act like selects. For example Or(Zext(icmp), X) can be treated like
129 /// select(icmp, X|1, X).
130 class SelectLike {
131 SelectLike(Instruction *I) : I(I) {}
132
133 Instruction *I;
134
135 public:
136 /// Match a select or select-like instruction, returning a SelectLike.
137 static SelectLike match(Instruction *I) {
138 // Select instruction are what we are usually looking for.
139 if (isa<SelectInst>(I))
140 return SelectLike(I);
141
142 // An Or(zext(i1 X), Y) can also be treated like a select, with condition
143 // C and values Y|1 and Y.
144 Value *X;
146 I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
147 X->getType()->isIntegerTy(1))
148 return SelectLike(I);
149
150 return SelectLike(nullptr);
151 }
152
153 bool isValid() { return I; }
154 operator bool() { return isValid(); }
155
156 Instruction *getI() { return I; }
157 const Instruction *getI() const { return I; }
158
159 Type *getType() const { return I->getType(); }
160
161 /// Return the condition for the SelectLike instruction. For example the
162 /// condition of a select or c in `or(zext(c), x)`
163 Value *getCondition() const {
164 if (auto *Sel = dyn_cast<SelectInst>(I))
165 return Sel->getCondition();
166 // Or(zext) case
167 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
168 Value *X;
169 if (PatternMatch::match(BO->getOperand(0),
171 return X;
172 if (PatternMatch::match(BO->getOperand(1),
174 return X;
175 }
176
177 llvm_unreachable("Unhandled case in getCondition");
178 }
179
180 /// Return the true value for the SelectLike instruction. Note this may not
181 /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
182 /// the true value would be `or(x,1)`. As this value does not exist, nullptr
183 /// is returned.
184 Value *getTrueValue() const {
185 if (auto *Sel = dyn_cast<SelectInst>(I))
186 return Sel->getTrueValue();
187 // Or(zext) case - The true value is Or(X), so return nullptr as the value
188 // does not yet exist.
189 if (isa<BinaryOperator>(I))
190 return nullptr;
191
192 llvm_unreachable("Unhandled case in getTrueValue");
193 }
194
195 /// Return the false value for the SelectLike instruction. For example the
196 /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
197 /// `select(c, x|1, x)`)
198 Value *getFalseValue() const {
199 if (auto *Sel = dyn_cast<SelectInst>(I))
200 return Sel->getFalseValue();
201 // Or(zext) case - return the operand which is not the zext.
202 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
203 Value *X;
204 if (PatternMatch::match(BO->getOperand(0),
206 return BO->getOperand(1);
207 if (PatternMatch::match(BO->getOperand(1),
209 return BO->getOperand(0);
210 }
211
212 llvm_unreachable("Unhandled case in getFalseValue");
213 }
214
215 /// Return the NonPredCost cost of the true op, given the costs in
216 /// InstCostMap. This may need to be generated for select-like instructions.
217 Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
218 const TargetTransformInfo *TTI) {
219 if (auto *Sel = dyn_cast<SelectInst>(I))
220 if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
221 return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
223
224 // Or case - add the cost of an extra Or to the cost of the False case.
225 if (isa<BinaryOperator>(I))
226 if (auto I = dyn_cast<Instruction>(getFalseValue()))
227 if (InstCostMap.contains(I)) {
229 Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency,
230 {TargetTransformInfo::OK_AnyValue,
231 TargetTransformInfo::OP_None},
232 {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
233 return InstCostMap[I].NonPredCost +
234 Scaled64::get(*OrCost.getValue());
235 }
236
237 return Scaled64::getZero();
238 }
239
240 /// Return the NonPredCost cost of the false op, given the costs in
241 /// InstCostMap. This may need to be generated for select-like instructions.
243 getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
244 const TargetTransformInfo *TTI) {
245 if (auto *Sel = dyn_cast<SelectInst>(I))
246 if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
247 return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
249
250 // Or case - return the cost of the false case
251 if (isa<BinaryOperator>(I))
252 if (auto I = dyn_cast<Instruction>(getFalseValue()))
253 if (InstCostMap.contains(I))
254 return InstCostMap[I].NonPredCost;
255
256 return Scaled64::getZero();
257 }
258 };
259
260private:
261 // Select groups consist of consecutive select instructions with the same
262 // condition.
263 using SelectGroup = SmallVector<SelectLike, 2>;
264 using SelectGroups = SmallVector<SelectGroup, 2>;
265
266 // Converts select instructions of a function to conditional jumps when deemed
267 // profitable. Returns true if at least one select was converted.
268 bool optimizeSelects(Function &F);
269
270 // Heuristics for determining which select instructions can be profitably
271 // conveted to branches. Separate heuristics for selects in inner-most loops
272 // and the rest of code regions (base heuristics for non-inner-most loop
273 // regions).
274 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
275 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
276
277 // Converts to branches the select groups that were deemed
278 // profitable-to-convert.
279 void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
280
281 // Splits selects of a given basic block into select groups.
282 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
283
284 // Determines for which select groups it is profitable converting to branches
285 // (base and inner-most-loop heuristics).
286 void findProfitableSIGroupsBase(SelectGroups &SIGroups,
287 SelectGroups &ProfSIGroups);
288 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
289 SelectGroups &ProfSIGroups);
290
291 // Determines if a select group should be converted to a branch (base
292 // heuristics).
293 bool isConvertToBranchProfitableBase(const SelectGroup &ASI);
294
295 // Returns true if there are expensive instructions in the cold value
296 // operand's (if any) dependence slice of any of the selects of the given
297 // group.
298 bool hasExpensiveColdOperand(const SelectGroup &ASI);
299
300 // For a given source instruction, collect its backwards dependence slice
301 // consisting of instructions exclusively computed for producing the operands
302 // of the source instruction.
303 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
304 Instruction *SI, bool ForSinking = false);
305
306 // Returns true if the condition of the select is highly predictable.
307 bool isSelectHighlyPredictable(const SelectLike SI);
308
309 // Loop-level checks to determine if a non-predicated version (with branches)
310 // of the given loop is more profitable than its predicated version.
311 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
312
313 // Computes instruction and loop-critical-path costs for both the predicated
314 // and non-predicated version of the given loop.
315 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
317 CostInfo *LoopCost);
318
319 // Returns a set of all the select instructions in the given select groups.
321 getSImap(const SelectGroups &SIGroups);
322
323 // Returns the latency cost of a given instruction.
324 std::optional<uint64_t> computeInstCost(const Instruction *I);
325
326 // Returns the misprediction cost of a given select when converted to branch.
327 Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);
328
329 // Returns the cost of a branch when the prediction is correct.
330 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
331 const SelectLike SI);
332
333 // Returns true if the target architecture supports lowering a given select.
334 bool isSelectKindSupported(const SelectLike SI);
335};
336
337class SelectOptimize : public FunctionPass {
338 SelectOptimizeImpl Impl;
339
340public:
341 static char ID;
342
343 SelectOptimize() : FunctionPass(ID) {
345 }
346
347 bool runOnFunction(Function &F) override {
348 return Impl.runOnFunction(F, *this);
349 }
350
351 void getAnalysisUsage(AnalysisUsage &AU) const override {
358 }
359};
360
361} // namespace
362
365 SelectOptimizeImpl Impl(TM);
366 return Impl.run(F, FAM);
367}
368
369char SelectOptimize::ID = 0;
370
371INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
372 false)
379INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
380 false)
381
382FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
383
384PreservedAnalyses SelectOptimizeImpl::run(Function &F,
386 TSI = TM->getSubtargetImpl(F);
387 TLI = TSI->getTargetLowering();
388
389 // If none of the select types are supported then skip this pass.
390 // This is an optimization pass. Legality issues will be handled by
391 // instruction selection.
395 return PreservedAnalyses::all();
396
399 return PreservedAnalyses::all();
400
402 .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
403 assert(PSI && "This pass requires module analysis pass `profile-summary`!");
405
406 // When optimizing for size, selects are preferable over branches.
407 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
408 return PreservedAnalyses::all();
409
410 LI = &FAM.getResult<LoopAnalysis>(F);
412 TSchedModel.init(TSI);
413
414 bool Changed = optimizeSelects(F);
415 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
416}
417
418bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
419 TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
420 TSI = TM->getSubtargetImpl(F);
421 TLI = TSI->getTargetLowering();
422
423 // If none of the select types are supported then skip this pass.
424 // This is an optimization pass. Legality issues will be handled by
425 // instruction selection.
429 return false;
430
431 TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
432
434 return false;
435
436 LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
437 BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
438 PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
439 ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
440 TSchedModel.init(TSI);
441
442 // When optimizing for size, selects are preferable over branches.
443 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
444 return false;
445
446 return optimizeSelects(F);
447}
448
449bool SelectOptimizeImpl::optimizeSelects(Function &F) {
450 // Determine for which select groups it is profitable converting to branches.
451 SelectGroups ProfSIGroups;
452 // Base heuristics apply only to non-loops and outer loops.
453 optimizeSelectsBase(F, ProfSIGroups);
454 // Separate heuristics for inner-most loops.
455 optimizeSelectsInnerLoops(F, ProfSIGroups);
456
457 // Convert to branches the select groups that were deemed
458 // profitable-to-convert.
459 convertProfitableSIGroups(ProfSIGroups);
460
461 // Code modified if at least one select group was converted.
462 return !ProfSIGroups.empty();
463}
464
465void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
466 SelectGroups &ProfSIGroups) {
467 // Collect all the select groups.
468 SelectGroups SIGroups;
469 for (BasicBlock &BB : F) {
470 // Base heuristics apply only to non-loops and outer loops.
471 Loop *L = LI->getLoopFor(&BB);
472 if (L && L->isInnermost())
473 continue;
474 collectSelectGroups(BB, SIGroups);
475 }
476
477 // Determine for which select groups it is profitable converting to branches.
478 findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
479}
480
481void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
482 SelectGroups &ProfSIGroups) {
484 // Need to check size on each iteration as we accumulate child loops.
485 for (unsigned long i = 0; i < Loops.size(); ++i)
486 for (Loop *ChildL : Loops[i]->getSubLoops())
487 Loops.push_back(ChildL);
488
489 for (Loop *L : Loops) {
490 if (!L->isInnermost())
491 continue;
492
493 SelectGroups SIGroups;
494 for (BasicBlock *BB : L->getBlocks())
495 collectSelectGroups(*BB, SIGroups);
496
497 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
498 }
499}
500
501/// If \p isTrue is true, return the true value of \p SI, otherwise return
502/// false value of \p SI. If the true/false value of \p SI is defined by any
503/// select instructions in \p Selects, look through the defining select
504/// instruction until the true/false value is not defined in \p Selects.
505static Value *
506getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
508 IRBuilder<> &IB) {
509 Value *V = nullptr;
510 for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
511 DefSI != nullptr && Selects.count(DefSI);
512 DefSI = dyn_cast<SelectInst>(V)) {
513 assert(DefSI->getCondition() == SI.getCondition() &&
514 "The condition of DefSI does not match with SI");
515 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
516 }
517
518 if (isa<BinaryOperator>(SI.getI())) {
519 assert(SI.getI()->getOpcode() == Instruction::Or &&
520 "Only currently handling Or instructions.");
521 V = SI.getFalseValue();
522 if (isTrue)
523 V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
524 }
525
526 assert(V && "Failed to get select true/false value");
527 return V;
528}
529
530void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
531 for (SelectGroup &ASI : ProfSIGroups) {
532 // The code transformation here is a modified version of the sinking
533 // transformation in CodeGenPrepare::optimizeSelectInst with a more
534 // aggressive strategy of which instructions to sink.
535 //
536 // TODO: eliminate the redundancy of logic transforming selects to branches
537 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
538 // selects for all cases (with and without profile information).
539
540 // Transform a sequence like this:
541 // start:
542 // %cmp = cmp uge i32 %a, %b
543 // %sel = select i1 %cmp, i32 %c, i32 %d
544 //
545 // Into:
546 // start:
547 // %cmp = cmp uge i32 %a, %b
548 // %cmp.frozen = freeze %cmp
549 // br i1 %cmp.frozen, label %select.true, label %select.false
550 // select.true:
551 // br label %select.end
552 // select.false:
553 // br label %select.end
554 // select.end:
555 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
556 //
557 // %cmp should be frozen, otherwise it may introduce undefined behavior.
558 // In addition, we may sink instructions that produce %c or %d into the
559 // destination(s) of the new branch.
560 // If the true or false blocks do not contain a sunken instruction, that
561 // block and its branch may be optimized away. In that case, one side of the
562 // first branch will point directly to select.end, and the corresponding PHI
563 // predecessor block will be the start block.
564
565 // Find all the instructions that can be soundly sunk to the true/false
566 // blocks. These are instructions that are computed solely for producing the
567 // operands of the select instructions in the group and can be sunk without
568 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
569 // with side effects).
570 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
571 typedef std::stack<Instruction *>::size_type StackSizeType;
572 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
573 for (SelectLike SI : ASI) {
574 // For each select, compute the sinkable dependence chains of the true and
575 // false operands.
576 if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
577 std::stack<Instruction *> TrueSlice;
578 getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
579 maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
580 TrueSlices.push_back(TrueSlice);
581 }
582 if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
583 if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
584 std::stack<Instruction *> FalseSlice;
585 getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
586 maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
587 FalseSlices.push_back(FalseSlice);
588 }
589 }
590 }
591 // In the case of multiple select instructions in the same group, the order
592 // of non-dependent instructions (instructions of different dependence
593 // slices) in the true/false blocks appears to affect performance.
594 // Interleaving the slices seems to experimentally be the optimal approach.
595 // This interleaving scheduling allows for more ILP (with a natural downside
596 // of increasing a bit register pressure) compared to a simple ordering of
597 // one whole chain after another. One would expect that this ordering would
598 // not matter since the scheduling in the backend of the compiler would
599 // take care of it, but apparently the scheduler fails to deliver optimal
600 // ILP with a naive ordering here.
601 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
602 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
603 for (auto &S : TrueSlices) {
604 if (!S.empty()) {
605 TrueSlicesInterleaved.push_back(S.top());
606 S.pop();
607 }
608 }
609 }
610 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
611 for (auto &S : FalseSlices) {
612 if (!S.empty()) {
613 FalseSlicesInterleaved.push_back(S.top());
614 S.pop();
615 }
616 }
617 }
618
619 // We split the block containing the select(s) into two blocks.
620 SelectLike SI = ASI.front();
621 SelectLike LastSI = ASI.back();
622 BasicBlock *StartBlock = SI.getI()->getParent();
623 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
624 // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
625 // RemoveDIs turned on, SplitPt would instead point to the next
626 // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
627 // tell splitBasicBlock that we want to include any DPValues attached to
628 // SplitPt in the splice.
629 SplitPt.setHeadBit(true);
630 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
631 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
632 // Delete the unconditional branch that was just created by the split.
633 StartBlock->getTerminator()->eraseFromParent();
634
635 // Move any debug/pseudo instructions that were in-between the select
636 // group to the newly-created end block.
637 SmallVector<Instruction *, 2> DebugPseudoINS;
638 auto DIt = SI.getI()->getIterator();
639 while (&*DIt != LastSI.getI()) {
640 if (DIt->isDebugOrPseudoInst())
641 DebugPseudoINS.push_back(&*DIt);
642 DIt++;
643 }
644 for (auto *DI : DebugPseudoINS) {
645 DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
646 }
647
648 // Duplicate implementation for DPValues, the non-instruction debug-info
649 // record. Helper lambda for moving DPValues to the end block.
650 auto TransferDPValues = [&](Instruction &I) {
651 for (auto &DPValue : llvm::make_early_inc_range(I.getDbgValueRange())) {
653 EndBlock->insertDPValueBefore(&DPValue,
654 EndBlock->getFirstInsertionPt());
655 }
656 };
657
658 // Iterate over all instructions in between SI and LastSI, not including
659 // SI itself. These are all the variable assignments that happen "in the
660 // middle" of the select group.
661 auto R = make_range(std::next(SI.getI()->getIterator()),
662 std::next(LastSI.getI()->getIterator()));
663 llvm::for_each(R, TransferDPValues);
664
665 // These are the new basic blocks for the conditional branch.
666 // At least one will become an actual new basic block.
667 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
668 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
669 if (!TrueSlicesInterleaved.empty()) {
670 TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
671 EndBlock->getParent(), EndBlock);
672 TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
673 TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
674 for (Instruction *TrueInst : TrueSlicesInterleaved)
675 TrueInst->moveBefore(TrueBranch);
676 }
677 if (!FalseSlicesInterleaved.empty()) {
678 FalseBlock =
679 BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
680 EndBlock->getParent(), EndBlock);
681 FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
682 FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
683 for (Instruction *FalseInst : FalseSlicesInterleaved)
684 FalseInst->moveBefore(FalseBranch);
685 }
686 // If there was nothing to sink, then arbitrarily choose the 'false' side
687 // for a new input value to the PHI.
688 if (TrueBlock == FalseBlock) {
689 assert(TrueBlock == nullptr &&
690 "Unexpected basic block transform while optimizing select");
691
692 FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
693 EndBlock->getParent(), EndBlock);
694 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
695 FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
696 }
697
698 // Insert the real conditional branch based on the original condition.
699 // If we did not create a new block for one of the 'true' or 'false' paths
700 // of the condition, it means that side of the branch goes to the end block
701 // directly and the path originates from the start block from the point of
702 // view of the new PHI.
703 BasicBlock *TT, *FT;
704 if (TrueBlock == nullptr) {
705 TT = EndBlock;
706 FT = FalseBlock;
707 TrueBlock = StartBlock;
708 } else if (FalseBlock == nullptr) {
709 TT = TrueBlock;
710 FT = EndBlock;
711 FalseBlock = StartBlock;
712 } else {
713 TT = TrueBlock;
714 FT = FalseBlock;
715 }
716 IRBuilder<> IB(SI.getI());
717 auto *CondFr = IB.CreateFreeze(SI.getCondition(),
718 SI.getCondition()->getName() + ".frozen");
719
721 for (auto SI : ASI)
722 INS.insert(SI.getI());
723
724 // Use reverse iterator because later select may use the value of the
725 // earlier select, and we need to propagate value through earlier select
726 // to get the PHI operand.
727 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
728 SelectLike SI = *It;
729 // The select itself is replaced with a PHI Node.
730 PHINode *PN = PHINode::Create(SI.getType(), 2, "");
731 PN->insertBefore(EndBlock->begin());
732 PN->takeName(SI.getI());
733 PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock);
734 PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock);
735 PN->setDebugLoc(SI.getI()->getDebugLoc());
736 SI.getI()->replaceAllUsesWith(PN);
737 INS.erase(SI.getI());
738 ++NumSelectsConverted;
739 }
740 IB.CreateCondBr(CondFr, TT, FT, SI.getI());
741
742 // Remove the old select instructions, now that they are not longer used.
743 for (auto SI : ASI)
744 SI.getI()->eraseFromParent();
745 }
746}
747
748void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
749 SelectGroups &SIGroups) {
750 BasicBlock::iterator BBIt = BB.begin();
751 while (BBIt != BB.end()) {
752 Instruction *I = &*BBIt++;
753 if (SelectLike SI = SelectLike::match(I)) {
755 continue;
756
757 SelectGroup SIGroup;
758 SIGroup.push_back(SI);
759 while (BBIt != BB.end()) {
760 Instruction *NI = &*BBIt;
761 // Debug/pseudo instructions should be skipped and not prevent the
762 // formation of a select group.
763 if (NI->isDebugOrPseudoInst()) {
764 ++BBIt;
765 continue;
766 }
767 // We only allow selects in the same group, not other select-like
768 // instructions.
769 if (!isa<SelectInst>(NI))
770 break;
771
772 SelectLike NSI = SelectLike::match(NI);
773 if (NSI && SI.getCondition() == NSI.getCondition()) {
774 SIGroup.push_back(NSI);
775 } else
776 break;
777 ++BBIt;
778 }
779
780 // If the select type is not supported, no point optimizing it.
781 // Instruction selection will take care of it.
782 if (!isSelectKindSupported(SI))
783 continue;
784
785 SIGroups.push_back(SIGroup);
786 }
787 }
788}
789
790void SelectOptimizeImpl::findProfitableSIGroupsBase(
791 SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
792 for (SelectGroup &ASI : SIGroups) {
793 ++NumSelectOptAnalyzed;
794 if (isConvertToBranchProfitableBase(ASI))
795 ProfSIGroups.push_back(ASI);
796 }
797}
798
801 LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
802 ORE->emit(Rem);
803}
804
805void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
806 const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
807 NumSelectOptAnalyzed += SIGroups.size();
808 // For each select group in an inner-most loop,
809 // a branch is more preferable than a select/conditional-move if:
810 // i) conversion to branches for all the select groups of the loop satisfies
811 // loop-level heuristics including reducing the loop's critical path by
812 // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
813 // ii) the total cost of the select group is cheaper with a branch compared
814 // to its predicated version. The cost is in terms of latency and the cost
815 // of a select group is the cost of its most expensive select instruction
816 // (assuming infinite resources and thus fully leveraging available ILP).
817
819 CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
821 if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
822 !checkLoopHeuristics(L, LoopCost)) {
823 return;
824 }
825
826 for (SelectGroup &ASI : SIGroups) {
827 // Assuming infinite resources, the cost of a group of instructions is the
828 // cost of the most expensive instruction of the group.
829 Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
830 for (SelectLike SI : ASI) {
831 SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
832 BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
833 }
834 if (BranchCost < SelectCost) {
835 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI());
836 OR << "Profitable to convert to branch (loop analysis). BranchCost="
837 << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
838 << ". ";
839 EmitAndPrintRemark(ORE, OR);
840 ++NumSelectConvertedLoop;
841 ProfSIGroups.push_back(ASI);
842 } else {
843 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
844 ASI.front().getI());
845 ORmiss << "Select is more profitable (loop analysis). BranchCost="
846 << BranchCost.toString()
847 << ", SelectCost=" << SelectCost.toString() << ". ";
848 EmitAndPrintRemark(ORE, ORmiss);
849 }
850 }
851}
852
853bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
854 const SelectGroup &ASI) {
855 SelectLike SI = ASI.front();
856 LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
857 << "\n");
858 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
859 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());
860
861 // Skip cold basic blocks. Better to optimize for size for cold blocks.
862 if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
863 ++NumSelectColdBB;
864 ORmiss << "Not converted to branch because of cold basic block. ";
865 EmitAndPrintRemark(ORE, ORmiss);
866 return false;
867 }
868
869 // If unpredictable, branch form is less profitable.
870 if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
871 ++NumSelectUnPred;
872 ORmiss << "Not converted to branch because of unpredictable branch. ";
873 EmitAndPrintRemark(ORE, ORmiss);
874 return false;
875 }
876
877 // If highly predictable, branch form is more profitable, unless a
878 // predictable select is inexpensive in the target architecture.
879 if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
880 ++NumSelectConvertedHighPred;
881 OR << "Converted to branch because of highly predictable branch. ";
882 EmitAndPrintRemark(ORE, OR);
883 return true;
884 }
885
886 // Look for expensive instructions in the cold operand's (if any) dependence
887 // slice of any of the selects in the group.
888 if (hasExpensiveColdOperand(ASI)) {
889 ++NumSelectConvertedExpColdOperand;
890 OR << "Converted to branch because of expensive cold operand.";
891 EmitAndPrintRemark(ORE, OR);
892 return true;
893 }
894
895 ORmiss << "Not profitable to convert to branch (base heuristic).";
896 EmitAndPrintRemark(ORE, ORmiss);
897 return false;
898}
899
901 uint64_t Denominator) {
902 return (Numerator + (Denominator / 2)) / Denominator;
903}
904
905static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
906 uint64_t &TrueVal, uint64_t &FalseVal) {
907 if (isa<SelectInst>(SI.getI()))
908 return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
909 return false;
910}
911
912bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
913 bool ColdOperand = false;
914 uint64_t TrueWeight, FalseWeight, TotalWeight;
915 if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
916 uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
917 TotalWeight = TrueWeight + FalseWeight;
918 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
919 ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
920 } else if (PSI->hasProfileSummary()) {
921 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
922 ASI.front().getI());
923 ORmiss << "Profile data available but missing branch-weights metadata for "
924 "select instruction. ";
925 EmitAndPrintRemark(ORE, ORmiss);
926 }
927 if (!ColdOperand)
928 return false;
929 // Check if the cold path's dependence slice is expensive for any of the
930 // selects of the group.
931 for (SelectLike SI : ASI) {
932 Instruction *ColdI = nullptr;
933 uint64_t HotWeight;
934 if (TrueWeight < FalseWeight) {
935 ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
936 HotWeight = FalseWeight;
937 } else {
938 ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
939 HotWeight = TrueWeight;
940 }
941 if (ColdI) {
942 std::stack<Instruction *> ColdSlice;
943 getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
944 InstructionCost SliceCost = 0;
945 while (!ColdSlice.empty()) {
946 SliceCost += TTI->getInstructionCost(ColdSlice.top(),
948 ColdSlice.pop();
949 }
950 // The colder the cold value operand of the select is the more expensive
951 // the cmov becomes for computing the cold value operand every time. Thus,
952 // the colder the cold operand is the more its cost counts.
953 // Get nearest integer cost adjusted for coldness.
954 InstructionCost AdjSliceCost =
955 divideNearest(SliceCost * HotWeight, TotalWeight);
956 if (AdjSliceCost >=
958 return true;
959 }
960 }
961 return false;
962}
963
964// Check if it is safe to move LoadI next to the SI.
965// Conservatively assume it is safe only if there is no instruction
966// modifying memory in-between the load and the select instruction.
967static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
968 // Assume loads from different basic blocks are unsafe to move.
969 if (LoadI->getParent() != SI->getParent())
970 return false;
971 auto It = LoadI->getIterator();
972 while (&*It != SI) {
973 if (It->mayWriteToMemory())
974 return false;
975 It++;
976 }
977 return true;
978}
979
980// For a given source instruction, collect its backwards dependence slice
981// consisting of instructions exclusively computed for the purpose of producing
982// the operands of the source instruction. As an approximation
983// (sufficiently-accurate in practice), we populate this set with the
984// instructions of the backwards dependence slice that only have one-use and
985// form an one-use chain that leads to the source instruction.
986void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
987 std::stack<Instruction *> &Slice,
988 Instruction *SI,
989 bool ForSinking) {
991 std::queue<Instruction *> Worklist;
992 Worklist.push(I);
993 while (!Worklist.empty()) {
994 Instruction *II = Worklist.front();
995 Worklist.pop();
996
997 // Avoid cycles.
998 if (!Visited.insert(II).second)
999 continue;
1000
1001 if (!II->hasOneUse())
1002 continue;
1003
1004 // Cannot soundly sink instructions with side-effects.
1005 // Terminator or phi instructions cannot be sunk.
1006 // Avoid sinking other select instructions (should be handled separetely).
1007 if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
1008 isa<SelectInst>(II) || isa<PHINode>(II)))
1009 continue;
1010
1011 // Avoid sinking loads in order not to skip state-modifying instructions,
1012 // that may alias with the loaded address.
1013 // Only allow sinking of loads within the same basic block that are
1014 // conservatively proven to be safe.
1015 if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
1016 continue;
1017
1018 // Avoid considering instructions with less frequency than the source
1019 // instruction (i.e., avoid colder code regions of the dependence slice).
1020 if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
1021 continue;
1022
1023 // Eligible one-use instruction added to the dependence slice.
1024 Slice.push(II);
1025
1026 // Explore all the operands of the current instruction to expand the slice.
1027 for (unsigned k = 0; k < II->getNumOperands(); ++k)
1028 if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
1029 Worklist.push(OpI);
1030 }
1031}
1032
1033bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
1034 uint64_t TrueWeight, FalseWeight;
1035 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1036 uint64_t Max = std::max(TrueWeight, FalseWeight);
1037 uint64_t Sum = TrueWeight + FalseWeight;
1038 if (Sum != 0) {
1039 auto Probability = BranchProbability::getBranchProbability(Max, Sum);
1040 if (Probability > TTI->getPredictableBranchThreshold())
1041 return true;
1042 }
1043 }
1044 return false;
1045}
1046
1047bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
1048 const CostInfo LoopCost[2]) {
1049 // Loop-level checks to determine if a non-predicated version (with branches)
1050 // of the loop is more profitable than its predicated version.
1051
1053 return true;
1054
1055 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
1056 L->getHeader()->getFirstNonPHI());
1057
1058 if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
1059 LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
1060 ORmissL << "No select conversion in the loop due to no reduction of loop's "
1061 "critical path. ";
1062 EmitAndPrintRemark(ORE, ORmissL);
1063 return false;
1064 }
1065
1066 Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
1067 LoopCost[1].PredCost - LoopCost[1].NonPredCost};
1068
1069 // Profitably converting to branches need to reduce the loop's critical path
1070 // by at least some threshold (absolute gain of GainCycleThreshold cycles and
1071 // relative gain of 12.5%).
1072 if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
1073 Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
1074 Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
1075 ORmissL << "No select conversion in the loop due to small reduction of "
1076 "loop's critical path. Gain="
1077 << Gain[1].toString()
1078 << ", RelativeGain=" << RelativeGain.toString() << "%. ";
1079 EmitAndPrintRemark(ORE, ORmissL);
1080 return false;
1081 }
1082
1083 // If the loop's critical path involves loop-carried dependences, the gradient
1084 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
1085 // This check ensures that the latency reduction for the loop's critical path
1086 // keeps decreasing with sufficient rate beyond the two analyzed loop
1087 // iterations.
1088 if (Gain[1] > Gain[0]) {
1089 Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
1090 (LoopCost[1].PredCost - LoopCost[0].PredCost);
1091 if (GradientGain < Scaled64::get(GainGradientThreshold)) {
1092 ORmissL << "No select conversion in the loop due to small gradient gain. "
1093 "GradientGain="
1094 << GradientGain.toString() << "%. ";
1095 EmitAndPrintRemark(ORE, ORmissL);
1096 return false;
1097 }
1098 }
1099 // If the gain decreases it is not profitable to convert.
1100 else if (Gain[1] < Gain[0]) {
1101 ORmissL
1102 << "No select conversion in the loop due to negative gradient gain. ";
1103 EmitAndPrintRemark(ORE, ORmissL);
1104 return false;
1105 }
1106
1107 // Non-predicated version of the loop is more profitable than its
1108 // predicated version.
1109 return true;
1110}
1111
1112// Computes instruction and loop-critical-path costs for both the predicated
1113// and non-predicated version of the given loop.
1114// Returns false if unable to compute these costs due to invalid cost of loop
1115// instruction(s).
1116bool SelectOptimizeImpl::computeLoopCosts(
1117 const Loop *L, const SelectGroups &SIGroups,
1118 DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
1119 LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
1120 << L->getHeader()->getName() << "\n");
1121 const auto &SImap = getSImap(SIGroups);
1122 // Compute instruction and loop-critical-path costs across two iterations for
1123 // both predicated and non-predicated version.
1124 const unsigned Iterations = 2;
1125 for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
1126 // Cost of the loop's critical path.
1127 CostInfo &MaxCost = LoopCost[Iter];
1128 for (BasicBlock *BB : L->getBlocks()) {
1129 for (const Instruction &I : *BB) {
1130 if (I.isDebugOrPseudoInst())
1131 continue;
1132 // Compute the predicated and non-predicated cost of the instruction.
1133 Scaled64 IPredCost = Scaled64::getZero(),
1134 INonPredCost = Scaled64::getZero();
1135
1136 // Assume infinite resources that allow to fully exploit the available
1137 // instruction-level parallelism.
1138 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
1139 for (const Use &U : I.operands()) {
1140 auto UI = dyn_cast<Instruction>(U.get());
1141 if (!UI)
1142 continue;
1143 if (InstCostMap.count(UI)) {
1144 IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
1145 INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
1146 }
1147 }
1148 auto ILatency = computeInstCost(&I);
1149 if (!ILatency) {
1150 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
1151 ORmissL << "Invalid instruction cost preventing analysis and "
1152 "optimization of the inner-most loop containing this "
1153 "instruction. ";
1154 EmitAndPrintRemark(ORE, ORmissL);
1155 return false;
1156 }
1157 IPredCost += Scaled64::get(*ILatency);
1158 INonPredCost += Scaled64::get(*ILatency);
1159
1160 // For a select that can be converted to branch,
1161 // compute its cost as a branch (non-predicated cost).
1162 //
1163 // BranchCost = PredictedPathCost + MispredictCost
1164 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
1165 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
1166 if (SImap.contains(&I)) {
1167 auto SI = SImap.at(&I);
1168 Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI);
1169 Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI);
1170 Scaled64 PredictedPathCost =
1171 getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
1172
1173 Scaled64 CondCost = Scaled64::getZero();
1174 if (auto *CI = dyn_cast<Instruction>(SI.getCondition()))
1175 if (InstCostMap.count(CI))
1176 CondCost = InstCostMap[CI].NonPredCost;
1177 Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
1178
1179 INonPredCost = PredictedPathCost + MispredictCost;
1180 }
1181 LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
1182 << INonPredCost << " for " << I << "\n");
1183
1184 InstCostMap[&I] = {IPredCost, INonPredCost};
1185 MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
1186 MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
1187 }
1188 }
1189 LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
1190 << " MaxCost = " << MaxCost.PredCost << " "
1191 << MaxCost.NonPredCost << "\n");
1192 }
1193 return true;
1194}
1195
1197SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
1199 for (const SelectGroup &ASI : SIGroups)
1200 for (SelectLike SI : ASI)
1201 SImap.try_emplace(SI.getI(), SI);
1202 return SImap;
1203}
1204
1205std::optional<uint64_t>
1206SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1207 InstructionCost ICost =
1209 if (auto OC = ICost.getValue())
1210 return std::optional<uint64_t>(*OC);
1211 return std::nullopt;
1212}
1213
1215SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
1216 const Scaled64 CondCost) {
1217 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1218
1219 // Account for the default misprediction rate when using a branch
1220 // (conservatively set to 25% by default).
1221 uint64_t MispredictRate = MispredictDefaultRate;
1222 // If the select condition is obviously predictable, then the misprediction
1223 // rate is zero.
1224 if (isSelectHighlyPredictable(SI))
1225 MispredictRate = 0;
1226
1227 // CondCost is included to account for cases where the computation of the
1228 // condition is part of a long dependence chain (potentially loop-carried)
1229 // that would delay detection of a misprediction and increase its cost.
1230 Scaled64 MispredictCost =
1231 std::max(Scaled64::get(MispredictPenalty), CondCost) *
1232 Scaled64::get(MispredictRate);
1233 MispredictCost /= Scaled64::get(100);
1234
1235 return MispredictCost;
1236}
1237
1238// Returns the cost of a branch when the prediction is correct.
1239// TrueCost * TrueProbability + FalseCost * FalseProbability.
1241SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1242 const SelectLike SI) {
1243 Scaled64 PredPathCost;
1244 uint64_t TrueWeight, FalseWeight;
1245 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1246 uint64_t SumWeight = TrueWeight + FalseWeight;
1247 if (SumWeight != 0) {
1248 PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1249 FalseCost * Scaled64::get(FalseWeight);
1250 PredPathCost /= Scaled64::get(SumWeight);
1251 return PredPathCost;
1252 }
1253 }
1254 // Without branch weight metadata, we assume 75% for the one path and 25% for
1255 // the other, and pick the result with the biggest cost.
1256 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1257 FalseCost * Scaled64::get(3) + TrueCost);
1258 PredPathCost /= Scaled64::get(4);
1259 return PredPathCost;
1260}
1261
1262bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
1263 bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1);
1264 if (VectorCond)
1265 return false;
1267 if (SI.getType()->isVectorTy())
1269 else
1271 return TLI->isSelectSupported(SelectKind);
1272}
static Value * getTrueOrFalseValue(SelectInst *SI, bool isTrue, const SmallPtrSet< const Instruction *, 2 > &Selects)
If isTrue is true, return the true value of SI, otherwise return false value of SI.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
static bool runOnFunction(Function &F, bool PostInlining)
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define DEBUG_TYPE
Hexagon Hardware Loops
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define P(N)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file contains the declarations for profiling metadata utility functions.
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI)
Optimize selects
static cl::opt< unsigned > ColdOperandMaxCostMultiplier("cold-operand-max-cost-multiplier", cl::desc("Maximum cost multiplier of TCC_expensive for the dependence " "slice of a cold operand to be considered inexpensive."), cl::init(1), cl::Hidden)
static cl::opt< unsigned > ColdOperandThreshold("cold-operand-threshold", cl::desc("Maximum frequency of path for an operand to be considered cold."), cl::init(20), cl::Hidden)
static cl::opt< bool > DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden, cl::init(false), cl::desc("Disable loop-level heuristics."))
static cl::opt< unsigned > GainCycleThreshold("select-opti-loop-cycle-gain-threshold", cl::desc("Minimum gain per loop (in cycles) threshold."), cl::init(4), cl::Hidden)
static cl::opt< unsigned > MispredictDefaultRate("mispredict-default-rate", cl::Hidden, cl::init(25), cl::desc("Default mispredict rate (initialized to 25%)."))
static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE, DiagnosticInfoOptimizationBase &Rem)
static cl::opt< unsigned > GainGradientThreshold("select-opti-loop-gradient-gain-threshold", cl::desc("Gradient gain threshold (%)."), cl::init(25), cl::Hidden)
static cl::opt< unsigned > GainRelativeThreshold("select-opti-loop-relative-gain-threshold", cl::desc("Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"), cl::init(8), cl::Hidden)
This file contains the declaration of the SelectOptimizePass class, its corresponding pass name is se...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
This pass exposes codegen information to IR-level passes.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:348
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:500
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:451
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:438
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:452
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:207
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:613
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:214
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:173
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:213
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
void insertDPValueBefore(DbgRecord *DPV, InstListType::iterator Here)
Insert a DPValue into a block at the position given by Here.
Analysis pass which computes BlockFrequencyInfo.
Legacy analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, Instruction *InsertBefore=nullptr)
static BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
Record of a variable value-assignment, aka a non instruction representation of the dbg....
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
Common features for diagnostics dealing with optimization remarks that are used by both IR and MIR pa...
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2649
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
bool isDebugOrPseudoInst() const LLVM_READONLY
Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:98
const BasicBlock * getParent() const
Definition: Instruction.h:150
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:93
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
bool isTerminator() const
Definition: Instruction.h:253
bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:449
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
iterator end() const
iterator begin() const
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:593
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
OptimizationRemarkEmitter legacy analysis pass.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:783
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
An analysis pass based on legacy pass manager to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
bool isColdBlock(const BBType *BB, BFIT *BFI) const
Returns true if BasicBlock BB is considered cold.
std::string toString(unsigned Precision=DefaultPrecision)
Convert to a decimal representation in a string.
Definition: ScaledNumber.h:596
static ScaledNumber get(uint64_t N)
Definition: ScaledNumber.h:526
static ScaledNumber getZero()
Definition: ScaledNumber.h:521
This class represents the LLVM 'select' instruction.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
bool empty() const
Definition: SmallVector.h:94
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
Analysis pass providing the TargetTransformInfo.
virtual bool isSelectSupported(SelectSupportKind) const
SelectSupportKind
Enum that describes what type of support for selects the target has.
bool isPredictableSelectExpensive() const
Return true if selects are only cheaper than branches if the branch is unlikely to be predicted right...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
Target-Independent Code Generator Pass Configuration Options.
Provide an instruction scheduling machine model to CodeGen passes.
const MCSchedModel * getMCSchedModel() const
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual const TargetLowering * getTargetLowering() const
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool shouldTreatInstructionLikeSelect(const Instruction *I) const
Should the Select Optimization pass treat the given instruction like a select, potentially converting...
@ TCK_Latency
The latency of instruction.
bool enableSelectOptimize() const
Should the Select Optimization pass be enabled and ran.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
BranchProbability getPredictableBranchThreshold() const
If a branch or a select condition is skewed in one direction by more than this factor,...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1724
FunctionPass * createSelectOptimizePass()
This pass converts conditional moves to conditional jumps when profitable.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:665
uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator)
Returns the integer nearest(Numerator / Denominator).
Definition: MathExtras.h:422
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
void initializeSelectOptimizePass(PassRegistry &)
unsigned MispredictPenalty
Definition: MCSchedule.h:306