SelectOptimize.cpp
1//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass converts selects to conditional jumps when profitable.
10//
11//===----------------------------------------------------------------------===//
12
15#include "llvm/ADT/Statistic.h"
22#include "llvm/CodeGen/Passes.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/Instruction.h"
35#include "llvm/Pass.h"
39#include <algorithm>
40#include <memory>
41#include <queue>
42#include <stack>
43
44using namespace llvm;
45using namespace llvm::PatternMatch;
46
47#define DEBUG_TYPE "select-optimize"
48
49STATISTIC(NumSelectOptAnalyzed,
50 "Number of select groups considered for conversion to branch");
51STATISTIC(NumSelectConvertedExpColdOperand,
52 "Number of select groups converted due to expensive cold operand");
53STATISTIC(NumSelectConvertedHighPred,
54 "Number of select groups converted due to high-predictability");
55STATISTIC(NumSelectUnPred,
56 "Number of select groups not converted due to unpredictability");
57STATISTIC(NumSelectColdBB,
58 "Number of select groups not converted due to cold basic block");
59STATISTIC(NumSelectConvertedLoop,
60 "Number of select groups converted due to loop-level analysis");
61STATISTIC(NumSelectsConverted, "Number of selects converted");
62
64 "cold-operand-threshold",
65 cl::desc("Maximum frequency of path for an operand to be considered cold."),
66 cl::init(20), cl::Hidden);
67
69 "cold-operand-max-cost-multiplier",
70 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
71 "slice of a cold operand to be considered inexpensive."),
72 cl::init(1), cl::Hidden);
73
74static cl::opt<unsigned>
75 GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
76 cl::desc("Gradient gain threshold (%)."),
77 cl::init(25), cl::Hidden);
78
79static cl::opt<unsigned>
80 GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
81 cl::desc("Minimum gain per loop (in cycles) threshold."),
82 cl::init(4), cl::Hidden);
83
85 "select-opti-loop-relative-gain-threshold",
87 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
88 cl::init(8), cl::Hidden);
89
91 "mispredict-default-rate", cl::Hidden, cl::init(25),
92 cl::desc("Default mispredict rate (initialized to 25%)."));
93
94static cl::opt<bool>
95 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
96 cl::init(false),
97 cl::desc("Disable loop-level heuristics."));
98
99namespace {
100
101class SelectOptimizeImpl {
102 const TargetMachine *TM = nullptr;
103 const TargetSubtargetInfo *TSI = nullptr;
104 const TargetLowering *TLI = nullptr;
105 const TargetTransformInfo *TTI = nullptr;
106 const LoopInfo *LI = nullptr;
107 BlockFrequencyInfo *BFI = nullptr;
108 ProfileSummaryInfo *PSI = nullptr;
109 OptimizationRemarkEmitter *ORE = nullptr;
110 TargetSchedModel TSchedModel;
111
112public:
113 SelectOptimizeImpl() = default;
114 SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
115 PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
116 bool runOnFunction(Function &F, Pass &P);
117
118 using Scaled64 = ScaledNumber<uint64_t>;
119
120 struct CostInfo {
121 /// Predicated cost (with selects as conditional moves).
122 Scaled64 PredCost;
123 /// Non-predicated cost (with selects converted to branches).
124 Scaled64 NonPredCost;
125 };
126
127 /// SelectLike is an abstraction over SelectInst and other operations that can
128 /// act like selects. For example Or(Zext(icmp), X) can be treated like
129 /// select(icmp, X|1, X).
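/// As an illustrative sketch (the value names are hypothetical, not from a
/// test case), the IR
///   %c = icmp eq i32 %a, 0
///   %z = zext i1 %c to i32
///   %o = or i32 %z, %x
/// behaves like
///   %o = select i1 %c, i32 %x.or.1, i32 %x   ; with %x.or.1 = or i32 %x, 1
/// which is why getTrueValue() returns nullptr for this form: the "true"
/// value (%x | 1) only comes into existence if it is materialized later.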
130 class SelectLike {
131 SelectLike(Instruction *I) : I(I) {}
132
133 Instruction *I;
134
135 public:
136 /// Match a select or select-like instruction, returning a SelectLike.
137 static SelectLike match(Instruction *I) {
138 // Select instructions are what we are usually looking for.
139 if (isa<SelectInst>(I))
140 return SelectLike(I);
141
142 // An Or(zext(i1 X), Y) can also be treated like a select, with condition
143 // X and values Y|1 and Y.
144 Value *X;
145 if (PatternMatch::match(
146 I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
147 X->getType()->isIntegerTy(1))
148 return SelectLike(I);
149
150 return SelectLike(nullptr);
151 }
152
153 bool isValid() { return I; }
154 operator bool() { return isValid(); }
155
156 Instruction *getI() { return I; }
157 const Instruction *getI() const { return I; }
158
159 Type *getType() const { return I->getType(); }
160
161 /// Return the condition for the SelectLike instruction. For example the
162 /// condition of a select or c in `or(zext(c), x)`
163 Value *getCondition() const {
164 if (auto *Sel = dyn_cast<SelectInst>(I))
165 return Sel->getCondition();
166 // Or(zext) case
167 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
168 Value *X;
169 if (PatternMatch::match(BO->getOperand(0),
170 m_OneUse(m_ZExt(m_Value(X)))))
171 return X;
172 if (PatternMatch::match(BO->getOperand(1),
173 m_OneUse(m_ZExt(m_Value(X)))))
174 return X;
175 }
176
177 llvm_unreachable("Unhandled case in getCondition");
178 }
179
180 /// Return the true value for the SelectLike instruction. Note this may not
181 /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
182 /// the true value would be `or(x,1)`. As this value does not exist, nullptr
183 /// is returned.
184 Value *getTrueValue() const {
185 if (auto *Sel = dyn_cast<SelectInst>(I))
186 return Sel->getTrueValue();
187 // Or(zext) case - The true value is Or(X), so return nullptr as the value
188 // does not yet exist.
189 if (isa<BinaryOperator>(I))
190 return nullptr;
191
192 llvm_unreachable("Unhandled case in getTrueValue");
193 }
194
195 /// Return the false value for the SelectLike instruction. For example the
196 /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
197 /// `select(c, x|1, x)`)
198 Value *getFalseValue() const {
199 if (auto *Sel = dyn_cast<SelectInst>(I))
200 return Sel->getFalseValue();
201 // Or(zext) case - return the operand which is not the zext.
202 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
203 Value *X;
204 if (PatternMatch::match(BO->getOperand(0),
205 m_OneUse(m_ZExt(m_Value(X)))))
206 return BO->getOperand(1);
207 if (PatternMatch::match(BO->getOperand(1),
208 m_OneUse(m_ZExt(m_Value(X)))))
209 return BO->getOperand(0);
210 }
211
212 llvm_unreachable("Unhandled case in getFalseValue");
213 }
214
215 /// Return the NonPredCost cost of the true op, given the costs in
216 /// InstCostMap. This may need to be generated for select-like instructions.
217 Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
218 const TargetTransformInfo *TTI) {
219 if (auto *Sel = dyn_cast<SelectInst>(I))
220 if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
221 return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
222 : Scaled64::getZero();
223
224 // Or case - add the cost of an extra Or to the cost of the False case.
225 if (isa<BinaryOperator>(I))
226 if (auto I = dyn_cast<Instruction>(getFalseValue()))
227 if (InstCostMap.contains(I)) {
228 InstructionCost OrCost = TTI->getArithmeticInstrCost(
229 Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency,
230 {TargetTransformInfo::OK_AnyValue,
231 TargetTransformInfo::OP_None},
232 {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
233 return InstCostMap[I].NonPredCost +
234 Scaled64::get(*OrCost.getValue());
235 }
236
237 return Scaled64::getZero();
238 }
239
240 /// Return the NonPredCost cost of the false op, given the costs in
241 /// InstCostMap. This may need to be generated for select-like instructions.
242 Scaled64
243 getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
244 const TargetTransformInfo *TTI) {
245 if (auto *Sel = dyn_cast<SelectInst>(I))
246 if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
247 return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
248 : Scaled64::getZero();
249
250 // Or case - return the cost of the false case
251 if (isa<BinaryOperator>(I))
252 if (auto I = dyn_cast<Instruction>(getFalseValue()))
253 if (InstCostMap.contains(I))
254 return InstCostMap[I].NonPredCost;
255
256 return Scaled64::getZero();
257 }
258 };
259
260private:
261 // Select groups consist of consecutive select instructions with the same
262 // condition.
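  // As an illustrative example (names are hypothetical), the two adjacent
  // selects below form a single group because they share condition %c:
  //   %v0 = select i1 %c, i32 %a, i32 %b
  //   %v1 = select i1 %c, i32 %p, i32 %q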
263 using SelectGroup = SmallVector<SelectLike, 2>;
264 using SelectGroups = SmallVector<SelectGroup, 2>;
265
266 // Converts select instructions of a function to conditional jumps when deemed
267 // profitable. Returns true if at least one select was converted.
268 bool optimizeSelects(Function &F);
269
270 // Heuristics for determining which select instructions can be profitably
271 // converted to branches. Separate heuristics for selects in inner-most loops
272 // and the rest of code regions (base heuristics for non-inner-most loop
273 // regions).
274 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
275 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
276
277 // Converts to branches the select groups that were deemed
278 // profitable-to-convert.
279 void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
280
281 // Splits selects of a given basic block into select groups.
282 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
283
284 // Determines for which select groups it is profitable to convert to branches
285 // (base and inner-most-loop heuristics).
286 void findProfitableSIGroupsBase(SelectGroups &SIGroups,
287 SelectGroups &ProfSIGroups);
288 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
289 SelectGroups &ProfSIGroups);
290
291 // Determines if a select group should be converted to a branch (base
292 // heuristics).
293 bool isConvertToBranchProfitableBase(const SelectGroup &ASI);
294
295 // Returns true if there are expensive instructions in the cold value
296 // operand's (if any) dependence slice of any of the selects of the given
297 // group.
298 bool hasExpensiveColdOperand(const SelectGroup &ASI);
299
300 // For a given source instruction, collect its backwards dependence slice
301 // consisting of instructions exclusively computed for producing the operands
302 // of the source instruction.
303 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
304 Instruction *SI, bool ForSinking = false);
305
306 // Returns true if the condition of the select is highly predictable.
307 bool isSelectHighlyPredictable(const SelectLike SI);
308
309 // Loop-level checks to determine if a non-predicated version (with branches)
310 // of the given loop is more profitable than its predicated version.
311 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
312
313 // Computes instruction and loop-critical-path costs for both the predicated
314 // and non-predicated version of the given loop.
315 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
316 DenseMap<const Instruction *, CostInfo> &InstCostMap,
317 CostInfo *LoopCost);
318
319 // Returns a set of all the select instructions in the given select groups.
321 getSImap(const SelectGroups &SIGroups);
322
323 // Returns the latency cost of a given instruction.
324 std::optional<uint64_t> computeInstCost(const Instruction *I);
325
326 // Returns the misprediction cost of a given select when converted to branch.
327 Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);
328
329 // Returns the cost of a branch when the prediction is correct.
330 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
331 const SelectLike SI);
332
333 // Returns true if the target architecture supports lowering a given select.
334 bool isSelectKindSupported(const SelectLike SI);
335};
336
337class SelectOptimize : public FunctionPass {
338 SelectOptimizeImpl Impl;
339
340public:
341 static char ID;
342
343 SelectOptimize() : FunctionPass(ID) {
344 initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
345 }
346
347 bool runOnFunction(Function &F) override {
348 return Impl.runOnFunction(F, *this);
349 }
350
351 void getAnalysisUsage(AnalysisUsage &AU) const override {
352 AU.addRequired<ProfileSummaryInfoWrapperPass>();
353 AU.addRequired<TargetPassConfig>();
354 AU.addRequired<TargetTransformInfoWrapperPass>();
355 AU.addRequired<LoopInfoWrapperPass>();
356 AU.addRequired<BlockFrequencyInfoWrapperPass>();
357 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
358 }
359};
360
361} // namespace
362
363PreservedAnalyses SelectOptimizePass::run(Function &F,
364 FunctionAnalysisManager &FAM) {
365 SelectOptimizeImpl Impl(TM);
366 return Impl.run(F, FAM);
367}
368
369char SelectOptimize::ID = 0;
370
371INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
372 false)
379INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
380 false)
381
382FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
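// As a hedged sketch (the exact hook and guard flag vary across targets and
// LLVM versions), a codegen pipeline typically schedules this pass late in
// its IR pipeline, before instruction selection, roughly as:
//   if (getOptLevel() != CodeGenOptLevel::None)
//     addPass(createSelectOptimizePass());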
383
384PreservedAnalyses SelectOptimizeImpl::run(Function &F,
385 FunctionAnalysisManager &FAM) {
386 TSI = TM->getSubtargetImpl(F);
387 TLI = TSI->getTargetLowering();
388
389 // If none of the select types are supported then skip this pass.
390 // This is an optimization pass. Legality issues will be handled by
391 // instruction selection.
395 return PreservedAnalyses::all();
396
397 TTI = &FAM.getResult<TargetIRAnalysis>(F);
398 if (!TTI->enableSelectOptimize())
399 return PreservedAnalyses::all();
400
401 PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
402 .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
403 assert(PSI && "This pass requires module analysis pass `profile-summary`!");
405
406 // When optimizing for size, selects are preferable over branches.
407 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
408 return PreservedAnalyses::all();
409
410 LI = &FAM.getResult<LoopAnalysis>(F);
412 TSchedModel.init(TSI);
413
414 bool Changed = optimizeSelects(F);
415 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
416}
417
418bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
419 TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
420 TSI = TM->getSubtargetImpl(F);
421 TLI = TSI->getTargetLowering();
422
423 // If none of the select types are supported then skip this pass.
424 // This is an optimization pass. Legality issues will be handled by
425 // instruction selection.
429 return false;
430
431 TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
432
433 if (!TTI->enableSelectOptimize())
434 return false;
435
436 LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
437 BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
438 PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
439 ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
440 TSchedModel.init(TSI);
441
442 // When optimizing for size, selects are preferable over branches.
443 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
444 return false;
445
446 return optimizeSelects(F);
447}
448
449bool SelectOptimizeImpl::optimizeSelects(Function &F) {
450 // Determine for which select groups it is profitable to convert to branches.
451 SelectGroups ProfSIGroups;
452 // Base heuristics apply only to non-loops and outer loops.
453 optimizeSelectsBase(F, ProfSIGroups);
454 // Separate heuristics for inner-most loops.
455 optimizeSelectsInnerLoops(F, ProfSIGroups);
456
457 // Convert to branches the select groups that were deemed
458 // profitable-to-convert.
459 convertProfitableSIGroups(ProfSIGroups);
460
461 // Code modified if at least one select group was converted.
462 return !ProfSIGroups.empty();
463}
464
465void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
466 SelectGroups &ProfSIGroups) {
467 // Collect all the select groups.
468 SelectGroups SIGroups;
469 for (BasicBlock &BB : F) {
470 // Base heuristics apply only to non-loops and outer loops.
471 Loop *L = LI->getLoopFor(&BB);
472 if (L && L->isInnermost())
473 continue;
474 collectSelectGroups(BB, SIGroups);
475 }
476
477 // Determine for which select groups it is profitable to convert to branches.
478 findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
479}
480
481void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
482 SelectGroups &ProfSIGroups) {
483 SmallVector<Loop *, 2> Loops(LI->begin(), LI->end());
484 // Need to check size on each iteration as we accumulate child loops.
485 for (unsigned long i = 0; i < Loops.size(); ++i)
486 for (Loop *ChildL : Loops[i]->getSubLoops())
487 Loops.push_back(ChildL);
488
489 for (Loop *L : Loops) {
490 if (!L->isInnermost())
491 continue;
492
493 SelectGroups SIGroups;
494 for (BasicBlock *BB : L->getBlocks())
495 collectSelectGroups(*BB, SIGroups);
496
497 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
498 }
499}
500
501/// If \p isTrue is true, return the true value of \p SI, otherwise return
502/// false value of \p SI. If the true/false value of \p SI is defined by any
503/// select instructions in \p Selects, look through the defining select
504/// instruction until the true/false value is not defined in \p Selects.
505static Value *
506getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
507 const SmallPtrSet<const Instruction *, 2> &Selects,
508 IRBuilder<> &IB) {
509 Value *V = nullptr;
510 for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
511 DefSI != nullptr && Selects.count(DefSI);
512 DefSI = dyn_cast<SelectInst>(V)) {
513 assert(DefSI->getCondition() == SI.getCondition() &&
514 "The condition of DefSI does not match with SI");
515 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
516 }
517
518 if (isa<BinaryOperator>(SI.getI())) {
519 assert(SI.getI()->getOpcode() == Instruction::Or &&
520 "Only currently handling Or instructions.");
521 V = SI.getFalseValue();
522 if (isTrue)
523 V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
524 }
525
526 assert(V && "Failed to get select true/false value");
527 return V;
528}
529
530void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
531 for (SelectGroup &ASI : ProfSIGroups) {
532 // The code transformation here is a modified version of the sinking
533 // transformation in CodeGenPrepare::optimizeSelectInst with a more
534 // aggressive strategy of which instructions to sink.
535 //
536 // TODO: eliminate the redundancy of logic transforming selects to branches
537 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
538 // selects for all cases (with and without profile information).
539
540 // Transform a sequence like this:
541 // start:
542 // %cmp = cmp uge i32 %a, %b
543 // %sel = select i1 %cmp, i32 %c, i32 %d
544 //
545 // Into:
546 // start:
547 // %cmp = cmp uge i32 %a, %b
548 // %cmp.frozen = freeze %cmp
549 // br i1 %cmp.frozen, label %select.true, label %select.false
550 // select.true:
551 // br label %select.end
552 // select.false:
553 // br label %select.end
554 // select.end:
555 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
556 //
557 // %cmp should be frozen, otherwise it may introduce undefined behavior.
558 // In addition, we may sink instructions that produce %c or %d into the
559 // destination(s) of the new branch.
560 // If the true or false blocks do not contain a sunken instruction, that
561 // block and its branch may be optimized away. In that case, one side of the
562 // first branch will point directly to select.end, and the corresponding PHI
563 // predecessor block will be the start block.
564
565 // Find all the instructions that can be soundly sunk to the true/false
566 // blocks. These are instructions that are computed solely for producing the
567 // operands of the select instructions in the group and can be sunk without
568 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
569 // with side effects).
570 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
571 typedef std::stack<Instruction *>::size_type StackSizeType;
572 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
573 for (SelectLike SI : ASI) {
574 // For each select, compute the sinkable dependence chains of the true and
575 // false operands.
576 if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
577 std::stack<Instruction *> TrueSlice;
578 getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
579 maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
580 TrueSlices.push_back(TrueSlice);
581 }
582 if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
583 if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
584 std::stack<Instruction *> FalseSlice;
585 getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
586 maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
587 FalseSlices.push_back(FalseSlice);
588 }
589 }
590 }
591 // In the case of multiple select instructions in the same group, the order
592 // of non-dependent instructions (instructions of different dependence
593 // slices) in the true/false blocks appears to affect performance.
594 // Interleaving the slices seems to experimentally be the optimal approach.
595 // This interleaving scheduling allows for more ILP (with a natural downside
596 // of slightly increasing register pressure) compared to a simple ordering of
597 // one whole chain after another. One would expect that this ordering would
598 // not matter since the scheduling in the backend of the compiler would
599 // take care of it, but apparently the scheduler fails to deliver optimal
600 // ILP with a naive ordering here.
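    // As an illustrative sketch: given two selects whose sinkable true slices
    // are the chains a1 -> a2 and b1 -> b2 (a2/b2 feeding the selects), the
    // interleaved order emitted into the sink block is a1, b1, a2, b2 rather
    // than a1, a2, b1, b2, so the two chains can issue in parallel.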
601 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
602 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
603 for (auto &S : TrueSlices) {
604 if (!S.empty()) {
605 TrueSlicesInterleaved.push_back(S.top());
606 S.pop();
607 }
608 }
609 }
610 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
611 for (auto &S : FalseSlices) {
612 if (!S.empty()) {
613 FalseSlicesInterleaved.push_back(S.top());
614 S.pop();
615 }
616 }
617 }
618
619 // We split the block containing the select(s) into two blocks.
620 SelectLike SI = ASI.front();
621 SelectLike LastSI = ASI.back();
622 BasicBlock *StartBlock = SI.getI()->getParent();
623 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
624 // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
625 // RemoveDIs turned on, SplitPt would instead point to the next
626 // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
627 // tell splitBasicBlock that we want to include any DbgVariableRecords
628 // attached to SplitPt in the splice.
629 SplitPt.setHeadBit(true);
630 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
631 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
632 // Delete the unconditional branch that was just created by the split.
633 StartBlock->getTerminator()->eraseFromParent();
634
635 // Move any debug/pseudo instructions that were in-between the select
636 // group to the newly-created end block.
637 SmallVector<Instruction *, 2> DebugPseudoINS;
638 auto DIt = SI.getI()->getIterator();
639 while (&*DIt != LastSI.getI()) {
640 if (DIt->isDebugOrPseudoInst())
641 DebugPseudoINS.push_back(&*DIt);
642 DIt++;
643 }
644 for (auto *DI : DebugPseudoINS) {
645 DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
646 }
647
648 // Duplicate implementation for DbgRecords, the non-instruction debug-info
649 // format. Helper lambda for moving DbgRecords to the end block.
650 auto TransferDbgRecords = [&](Instruction &I) {
651 for (auto &DbgRecord :
652 llvm::make_early_inc_range(I.getDbgRecordRange())) {
653 DbgRecord.removeFromParent();
654 EndBlock->insertDbgRecordBefore(&DbgRecord,
655 EndBlock->getFirstInsertionPt());
656 }
657 };
658
659 // Iterate over all instructions in between SI and LastSI, not including
660 // SI itself. These are all the variable assignments that happen "in the
661 // middle" of the select group.
662 auto R = make_range(std::next(SI.getI()->getIterator()),
663 std::next(LastSI.getI()->getIterator()));
664 llvm::for_each(R, TransferDbgRecords);
665
666 // These are the new basic blocks for the conditional branch.
667 // At least one will become an actual new basic block.
668 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
669 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
670 if (!TrueSlicesInterleaved.empty()) {
671 TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
672 EndBlock->getParent(), EndBlock);
673 TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
674 TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
675 for (Instruction *TrueInst : TrueSlicesInterleaved)
676 TrueInst->moveBefore(TrueBranch);
677 }
678 if (!FalseSlicesInterleaved.empty()) {
679 FalseBlock =
680 BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
681 EndBlock->getParent(), EndBlock);
682 FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
683 FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
684 for (Instruction *FalseInst : FalseSlicesInterleaved)
685 FalseInst->moveBefore(FalseBranch);
686 }
687 // If there was nothing to sink, then arbitrarily choose the 'false' side
688 // for a new input value to the PHI.
689 if (TrueBlock == FalseBlock) {
690 assert(TrueBlock == nullptr &&
691 "Unexpected basic block transform while optimizing select");
692
693 FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
694 EndBlock->getParent(), EndBlock);
695 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
696 FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
697 }
698
699 // Insert the real conditional branch based on the original condition.
700 // If we did not create a new block for one of the 'true' or 'false' paths
701 // of the condition, it means that side of the branch goes to the end block
702 // directly and the path originates from the start block from the point of
703 // view of the new PHI.
704 BasicBlock *TT, *FT;
705 if (TrueBlock == nullptr) {
706 TT = EndBlock;
707 FT = FalseBlock;
708 TrueBlock = StartBlock;
709 } else if (FalseBlock == nullptr) {
710 TT = TrueBlock;
711 FT = EndBlock;
712 FalseBlock = StartBlock;
713 } else {
714 TT = TrueBlock;
715 FT = FalseBlock;
716 }
717 IRBuilder<> IB(SI.getI());
718 auto *CondFr = IB.CreateFreeze(SI.getCondition(),
719 SI.getCondition()->getName() + ".frozen");
720
721 SmallPtrSet<const Instruction *, 2> INS;
722 for (auto SI : ASI)
723 INS.insert(SI.getI());
724
725 // Use reverse iterator because later select may use the value of the
726 // earlier select, and we need to propagate value through earlier select
727 // to get the PHI operand.
728 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
729 SelectLike SI = *It;
730 // The select itself is replaced with a PHI Node.
731 PHINode *PN = PHINode::Create(SI.getType(), 2, "");
732 PN->insertBefore(EndBlock->begin());
733 PN->takeName(SI.getI());
734 PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock);
735 PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock);
736 PN->setDebugLoc(SI.getI()->getDebugLoc());
737 SI.getI()->replaceAllUsesWith(PN);
738 INS.erase(SI.getI());
739 ++NumSelectsConverted;
740 }
741 IB.CreateCondBr(CondFr, TT, FT, SI.getI());
742
743 // Remove the old select instructions, now that they are no longer used.
744 for (auto SI : ASI)
745 SI.getI()->eraseFromParent();
746 }
747}
748
749void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
750 SelectGroups &SIGroups) {
751 BasicBlock::iterator BBIt = BB.begin();
752 while (BBIt != BB.end()) {
753 Instruction *I = &*BBIt++;
754 if (SelectLike SI = SelectLike::match(I)) {
755 if (!TTI->shouldTreatInstructionLikeSelect(I))
756 continue;
757
758 SelectGroup SIGroup;
759 SIGroup.push_back(SI);
760 while (BBIt != BB.end()) {
761 Instruction *NI = &*BBIt;
762 // Debug/pseudo instructions should be skipped and not prevent the
763 // formation of a select group.
764 if (NI->isDebugOrPseudoInst()) {
765 ++BBIt;
766 continue;
767 }
768 // We only allow selects in the same group, not other select-like
769 // instructions.
770 if (!isa<SelectInst>(NI))
771 break;
772
773 SelectLike NSI = SelectLike::match(NI);
774 if (NSI && SI.getCondition() == NSI.getCondition()) {
775 SIGroup.push_back(NSI);
776 } else
777 break;
778 ++BBIt;
779 }
780
781 // If the select type is not supported, no point optimizing it.
782 // Instruction selection will take care of it.
783 if (!isSelectKindSupported(SI))
784 continue;
785
786 SIGroups.push_back(SIGroup);
787 }
788 }
789}
790
791void SelectOptimizeImpl::findProfitableSIGroupsBase(
792 SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
793 for (SelectGroup &ASI : SIGroups) {
794 ++NumSelectOptAnalyzed;
795 if (isConvertToBranchProfitableBase(ASI))
796 ProfSIGroups.push_back(ASI);
797 }
798}
799
800static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
801 DiagnosticInfoOptimizationBase &Rem) {
802 LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
803 ORE->emit(Rem);
804}
805
806void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
807 const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
808 NumSelectOptAnalyzed += SIGroups.size();
809 // For each select group in an inner-most loop,
810 // a branch is more preferable than a select/conditional-move if:
811 // i) conversion to branches for all the select groups of the loop satisfies
812 // loop-level heuristics including reducing the loop's critical path by
813 // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
814 // ii) the total cost of the select group is cheaper with a branch compared
815 // to its predicated version. The cost is in terms of latency and the cost
816 // of a select group is the cost of its most expensive select instruction
817 // (assuming infinite resources and thus fully leveraging available ILP).
818
819 DenseMap<const Instruction *, CostInfo> InstCostMap;
820 CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
821 {Scaled64::getZero(), Scaled64::getZero()}};
822 if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
823 !checkLoopHeuristics(L, LoopCost)) {
824 return;
825 }
826
827 for (SelectGroup &ASI : SIGroups) {
828 // Assuming infinite resources, the cost of a group of instructions is the
829 // cost of the most expensive instruction of the group.
830 Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
831 for (SelectLike SI : ASI) {
832 SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
833 BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
834 }
835 if (BranchCost < SelectCost) {
836 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI());
837 OR << "Profitable to convert to branch (loop analysis). BranchCost="
838 << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
839 << ". ";
840 EmitAndPrintRemark(ORE, OR);
841 ++NumSelectConvertedLoop;
842 ProfSIGroups.push_back(ASI);
843 } else {
844 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
845 ASI.front().getI());
846 ORmiss << "Select is more profitable (loop analysis). BranchCost="
847 << BranchCost.toString()
848 << ", SelectCost=" << SelectCost.toString() << ". ";
849 EmitAndPrintRemark(ORE, ORmiss);
850 }
851 }
852}
853
854bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
855 const SelectGroup &ASI) {
856 SelectLike SI = ASI.front();
857 LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
858 << "\n");
859 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
860 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());
861
862 // Skip cold basic blocks. Better to optimize for size for cold blocks.
863 if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
864 ++NumSelectColdBB;
865 ORmiss << "Not converted to branch because of cold basic block. ";
866 EmitAndPrintRemark(ORE, ORmiss);
867 return false;
868 }
869
870 // If unpredictable, branch form is less profitable.
871 if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
872 ++NumSelectUnPred;
873 ORmiss << "Not converted to branch because of unpredictable branch. ";
874 EmitAndPrintRemark(ORE, ORmiss);
875 return false;
876 }
877
878 // If highly predictable, branch form is more profitable, unless a
879 // predictable select is inexpensive in the target architecture.
880 if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
881 ++NumSelectConvertedHighPred;
882 OR << "Converted to branch because of highly predictable branch. ";
883 EmitAndPrintRemark(ORE, OR);
884 return true;
885 }
886
887 // Look for expensive instructions in the cold operand's (if any) dependence
888 // slice of any of the selects in the group.
889 if (hasExpensiveColdOperand(ASI)) {
890 ++NumSelectConvertedExpColdOperand;
891 OR << "Converted to branch because of expensive cold operand.";
892 EmitAndPrintRemark(ORE, OR);
893 return true;
894 }
895
896 ORmiss << "Not profitable to convert to branch (base heuristic).";
897 EmitAndPrintRemark(ORE, ORmiss);
898 return false;
899}
900
901static InstructionCost divideNearest(const InstructionCost &Numerator,
902 uint64_t Denominator) {
903 return (Numerator + (Denominator / 2)) / Denominator;
904}
905
906static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
907 uint64_t &TrueVal, uint64_t &FalseVal) {
908 if (isa<SelectInst>(SI.getI()))
909 return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
910 return false;
911}
912
913bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
914 bool ColdOperand = false;
915 uint64_t TrueWeight, FalseWeight, TotalWeight;
916 if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
917 uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
918 TotalWeight = TrueWeight + FalseWeight;
919 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
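    // For example, with hypothetical weights TrueWeight=90 and FalseWeight=10
    // and the default threshold of 20, 100 * 20 > 100 * 10 holds, so the
    // false path (taken ~10% of the time) is treated as cold.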
920 ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
921 } else if (PSI->hasProfileSummary()) {
922 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
923 ASI.front().getI());
924 ORmiss << "Profile data available but missing branch-weights metadata for "
925 "select instruction. ";
926 EmitAndPrintRemark(ORE, ORmiss);
927 }
928 if (!ColdOperand)
929 return false;
930 // Check if the cold path's dependence slice is expensive for any of the
931 // selects of the group.
932 for (SelectLike SI : ASI) {
933 Instruction *ColdI = nullptr;
934 uint64_t HotWeight;
935 if (TrueWeight < FalseWeight) {
936 ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
937 HotWeight = FalseWeight;
938 } else {
939 ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
940 HotWeight = TrueWeight;
941 }
942 if (ColdI) {
943 std::stack<Instruction *> ColdSlice;
944 getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
945 InstructionCost SliceCost = 0;
946 while (!ColdSlice.empty()) {
947 SliceCost += TTI->getInstructionCost(ColdSlice.top(),
948 TargetTransformInfo::TCK_Latency);
949 ColdSlice.pop();
950 }
951 // The colder the cold value operand is, the more expensive the cmov becomes,
952 // since the cold value operand is recomputed on every execution. Thus, the
953 // colder the cold operand is, the more its cost counts.
954 // Get nearest integer cost adjusted for coldness.
955 InstructionCost AdjSliceCost =
956 divideNearest(SliceCost * HotWeight, TotalWeight);
957 if (AdjSliceCost >=
958 ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
959 return true;
960 }
961 }
962 return false;
963}
964
965// Check if it is safe to move LoadI next to the SI.
966// Conservatively assume it is safe only if there is no instruction
967// modifying memory in-between the load and the select instruction.
968static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
969 // Assume loads from different basic blocks are unsafe to move.
970 if (LoadI->getParent() != SI->getParent())
971 return false;
972 auto It = LoadI->getIterator();
973 while (&*It != SI) {
974 if (It->mayWriteToMemory())
975 return false;
976 It++;
977 }
978 return true;
979}
980
981// For a given source instruction, collect its backwards dependence slice
982// consisting of instructions exclusively computed for the purpose of producing
983// the operands of the source instruction. As an approximation
984// (sufficiently-accurate in practice), we populate this set with the
985// instructions of the backwards dependence slice that have only one use and
986// form a one-use chain that leads to the source instruction.
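// For instance (hypothetical IR), if the select's cold operand is
//   %t = add i32 %m, %c      ; single use: the select
//   %m = mul i32 %a, %b      ; single use: %t
// then the collected slice is {%t, %m}, while %a and %b are excluded if they
// have additional uses, since they are not computed exclusively for the
// select.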
987void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
988 std::stack<Instruction *> &Slice,
989 Instruction *SI,
990 bool ForSinking) {
991 SmallPtrSet<Instruction *, 2> Visited;
992 std::queue<Instruction *> Worklist;
993 Worklist.push(I);
994 while (!Worklist.empty()) {
995 Instruction *II = Worklist.front();
996 Worklist.pop();
997
998 // Avoid cycles.
999 if (!Visited.insert(II).second)
1000 continue;
1001
1002 if (!II->hasOneUse())
1003 continue;
1004
1005 // Cannot soundly sink instructions with side-effects.
1006 // Terminator or phi instructions cannot be sunk.
1007 // Avoid sinking other select instructions (should be handled separately).
1008 if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
1009 isa<SelectInst>(II) || isa<PHINode>(II)))
1010 continue;
1011
1012 // Avoid sinking loads, so as not to skip state-modifying instructions
1013 // that may alias with the loaded address.
1014 // Only allow sinking of loads within the same basic block that are
1015 // conservatively proven to be safe.
1016 if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
1017 continue;
1018
1019 // Avoid considering instructions with less frequency than the source
1020 // instruction (i.e., avoid colder code regions of the dependence slice).
1021 if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
1022 continue;
1023
1024 // Eligible one-use instruction added to the dependence slice.
1025 Slice.push(II);
1026
1027 // Explore all the operands of the current instruction to expand the slice.
1028 for (unsigned k = 0; k < II->getNumOperands(); ++k)
1029 if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
1030 Worklist.push(OpI);
1031 }
1032}
1033
1034bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
1035 uint64_t TrueWeight, FalseWeight;
1036 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1037 uint64_t Max = std::max(TrueWeight, FalseWeight);
1038 uint64_t Sum = TrueWeight + FalseWeight;
1039 if (Sum != 0) {
1040 auto Probability = BranchProbability::getBranchProbability(Max, Sum);
1041 if (Probability > TTI->getPredictableBranchThreshold())
1042 return true;
1043 }
1044 }
1045 return false;
1046}
1047
1048bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
1049 const CostInfo LoopCost[2]) {
1050 // Loop-level checks to determine if a non-predicated version (with branches)
1051 // of the loop is more profitable than its predicated version.
1052
1053 if (DisableLoopLevelHeuristics)
1054 return true;
1055
1056 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
1057 L->getHeader()->getFirstNonPHI());
1058
1059 if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
1060 LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
1061 ORmissL << "No select conversion in the loop due to no reduction of loop's "
1062 "critical path. ";
1063 EmitAndPrintRemark(ORE, ORmissL);
1064 return false;
1065 }
1066
1067 Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
1068 LoopCost[1].PredCost - LoopCost[1].NonPredCost};
1069
1070 // To be profitable, converting to branches needs to reduce the loop's critical path
1071 // by at least some threshold (absolute gain of GainCycleThreshold cycles and
1072 // relative gain of 12.5%).
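  // For example, with the default thresholds (GainCycleThreshold=4,
  // GainRelativeThreshold=8) and hypothetical second-iteration costs
  // PredCost=40 and NonPredCost=34 cycles, the gain is 6 cycles: 6 >= 4 and
  // 6 * 8 = 48 >= 40 (a relative gain of 15%), so this check passes.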
1073 if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
1074 Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
1075 Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
1076 ORmissL << "No select conversion in the loop due to small reduction of "
1077 "loop's critical path. Gain="
1078 << Gain[1].toString()
1079 << ", RelativeGain=" << RelativeGain.toString() << "%. ";
1080 EmitAndPrintRemark(ORE, ORmissL);
1081 return false;
1082 }
1083
1084 // If the loop's critical path involves loop-carried dependences, the gradient
1085 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
1086 // This check ensures that the latency of the loop's critical path keeps
1087 // decreasing at a sufficient rate beyond the two analyzed loop iterations.
1089 if (Gain[1] > Gain[0]) {
1090 Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
1091 (LoopCost[1].PredCost - LoopCost[0].PredCost);
1092 if (GradientGain < Scaled64::get(GainGradientThreshold)) {
1093 ORmissL << "No select conversion in the loop due to small gradient gain. "
1094 "GradientGain="
1095 << GradientGain.toString() << "%. ";
1096 EmitAndPrintRemark(ORE, ORmissL);
1097 return false;
1098 }
1099 }
1100 // If the gain decreases it is not profitable to convert.
1101 else if (Gain[1] < Gain[0]) {
1102 ORmissL
1103 << "No select conversion in the loop due to negative gradient gain. ";
1104 EmitAndPrintRemark(ORE, ORmissL);
1105 return false;
1106 }
1107
1108 // Non-predicated version of the loop is more profitable than its
1109 // predicated version.
1110 return true;
1111}
1112
1113// Computes instruction and loop-critical-path costs for both the predicated
1114// and non-predicated version of the given loop.
1115// Returns false if unable to compute these costs due to invalid cost of loop
1116// instruction(s).
1117bool SelectOptimizeImpl::computeLoopCosts(
1118 const Loop *L, const SelectGroups &SIGroups,
1119 DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
1120 LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
1121 << L->getHeader()->getName() << "\n");
1122 const auto &SImap = getSImap(SIGroups);
1123 // Compute instruction and loop-critical-path costs across two iterations for
1124 // both predicated and non-predicated version.
1125 const unsigned Iterations = 2;
1126 for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
1127 // Cost of the loop's critical path.
1128 CostInfo &MaxCost = LoopCost[Iter];
1129 for (BasicBlock *BB : L->getBlocks()) {
1130 for (const Instruction &I : *BB) {
1131 if (I.isDebugOrPseudoInst())
1132 continue;
1133 // Compute the predicated and non-predicated cost of the instruction.
1134 Scaled64 IPredCost = Scaled64::getZero(),
1135 INonPredCost = Scaled64::getZero();
1136
1137 // Assume infinite resources that allow to fully exploit the available
1138 // instruction-level parallelism.
1139 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
1140 for (const Use &U : I.operands()) {
1141 auto UI = dyn_cast<Instruction>(U.get());
1142 if (!UI)
1143 continue;
1144 if (InstCostMap.count(UI)) {
1145 IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
1146 INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
1147 }
1148 }
1149 auto ILatency = computeInstCost(&I);
1150 if (!ILatency) {
1151 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
1152 ORmissL << "Invalid instruction cost preventing analysis and "
1153 "optimization of the inner-most loop containing this "
1154 "instruction. ";
1155 EmitAndPrintRemark(ORE, ORmissL);
1156 return false;
1157 }
1158 IPredCost += Scaled64::get(*ILatency);
1159 INonPredCost += Scaled64::get(*ILatency);
1160
1161 // For a select that can be converted to branch,
1162 // compute its cost as a branch (non-predicated cost).
1163 //
1164 // BranchCost = PredictedPathCost + MispredictCost
1165 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
1166 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
1167 if (SImap.contains(&I)) {
1168 auto SI = SImap.at(&I);
1169 Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI);
1170 Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI);
1171 Scaled64 PredictedPathCost =
1172 getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
1173
1174 Scaled64 CondCost = Scaled64::getZero();
1175 if (auto *CI = dyn_cast<Instruction>(SI.getCondition()))
1176 if (InstCostMap.count(CI))
1177 CondCost = InstCostMap[CI].NonPredCost;
1178 Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
1179
1180 INonPredCost = PredictedPathCost + MispredictCost;
1181 }
1182 LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
1183 << INonPredCost << " for " << I << "\n");
1184
1185 InstCostMap[&I] = {IPredCost, INonPredCost};
1186 MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
1187 MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
1188 }
1189 }
1190 LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
1191 << " MaxCost = " << MaxCost.PredCost << " "
1192 << MaxCost.NonPredCost << "\n");
1193 }
1194 return true;
1195}
1196
1198SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
1200 for (const SelectGroup &ASI : SIGroups)
1201 for (SelectLike SI : ASI)
1202 SImap.try_emplace(SI.getI(), SI);
1203 return SImap;
1204}
1205
1206std::optional<uint64_t>
1207SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1208 InstructionCost ICost =
1209 TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
1210 if (auto OC = ICost.getValue())
1211 return std::optional<uint64_t>(*OC);
1212 return std::nullopt;
1213}
1214
1215Scaled64
1216SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
1217 const Scaled64 CondCost) {
1218 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1219
1220 // Account for the default misprediction rate when using a branch
1221 // (conservatively set to 25% by default).
1222 uint64_t MispredictRate = MispredictDefaultRate;
1223 // If the select condition is obviously predictable, then the misprediction
1224 // rate is zero.
1225 if (isSelectHighlyPredictable(SI))
1226 MispredictRate = 0;
1227
1228 // CondCost is included to account for cases where the computation of the
1229 // condition is part of a long dependence chain (potentially loop-carried)
1230 // that would delay detection of a misprediction and increase its cost.
1231 Scaled64 MispredictCost =
1232 std::max(Scaled64::get(MispredictPenalty), CondCost) *
1233 Scaled64::get(MispredictRate);
1234 MispredictCost /= Scaled64::get(100);
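  // As a worked example with hypothetical numbers: MispredictPenalty=14,
  // CondCost=20 and the default MispredictRate=25 give
  // max(14, 20) * 25 / 100 = 5 cycles of expected misprediction cost, while a
  // highly predictable select contributes 0.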
1235
1236 return MispredictCost;
1237}
1238
1239// Returns the cost of a branch when the prediction is correct.
1240// TrueCost * TrueProbability + FalseCost * FalseProbability.
1241Scaled64
1242SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1243 const SelectLike SI) {
1244 Scaled64 PredPathCost;
1245 uint64_t TrueWeight, FalseWeight;
1246 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1247 uint64_t SumWeight = TrueWeight + FalseWeight;
1248 if (SumWeight != 0) {
1249 PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1250 FalseCost * Scaled64::get(FalseWeight);
1251 PredPathCost /= Scaled64::get(SumWeight);
1252 return PredPathCost;
1253 }
1254 }
1255 // Without branch weight metadata, we assume 75% for the one path and 25% for
1256 // the other, and pick the result with the biggest cost.
1257 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1258 FalseCost * Scaled64::get(3) + TrueCost);
1259 PredPathCost /= Scaled64::get(4);
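  // For example, without branch weights and with hypothetical costs
  // TrueCost=8 and FalseCost=2, this yields
  // max((3*8 + 2) / 4, (3*2 + 8) / 4) = max(6.5, 3.5) = 6.5, i.e. the more
  // pessimistic 75%/25% weighting of the two paths.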
1260 return PredPathCost;
1261}
1262
1263bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
1264 bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1);
1265 if (VectorCond)
1266 return false;
1267 TargetLowering::SelectSupportKind SelectKind;
1268 if (SI.getType()->isVectorTy())
1269 SelectKind = TargetLowering::ScalarCondVectorVal;
1270 else
1271 SelectKind = TargetLowering::ScalarValSelect;
1272 return TLI->isSelectSupported(SelectKind);
1273}