LLVM 23.0.0git
LoopUnrollPass.cpp
Go to the documentation of this file.
1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller. It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
11// counts of loops easily.
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/StringRef.h"
35#include "llvm/IR/BasicBlock.h"
36#include "llvm/IR/CFG.h"
37#include "llvm/IR/Constant.h"
38#include "llvm/IR/Constants.h"
40#include "llvm/IR/Dominators.h"
41#include "llvm/IR/Function.h"
42#include "llvm/IR/Instruction.h"
44#include "llvm/IR/Metadata.h"
45#include "llvm/IR/PassManager.h"
47#include "llvm/Pass.h"
50#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <cassert>
64#include <cstdint>
65#include <limits>
66#include <optional>
67#include <string>
68#include <tuple>
69#include <utility>
70
71using namespace llvm;
72
73#define DEBUG_TYPE "loop-unroll"
74
76 "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
77 cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
78 " the current top-most loop. This is sometimes preferred to reduce"
79 " compile time."));
80
82 UnrollThreshold("unroll-threshold", cl::Hidden,
83 cl::desc("The cost threshold for loop unrolling"));
84
87 "unroll-optsize-threshold", cl::init(0), cl::Hidden,
88 cl::desc("The cost threshold for loop unrolling when optimizing for "
89 "size"));
90
92 "unroll-partial-threshold", cl::Hidden,
93 cl::desc("The cost threshold for partial loop unrolling"));
94
96 "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
97 cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
98 "to the threshold when aggressively unrolling a loop due to the "
99 "dynamic cost savings. If completely unrolling a loop will reduce "
100 "the total runtime from X to Y, we boost the loop unroll "
101 "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
102 "X/Y). This limit avoids excessive code bloat."));
103
105 "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
106 cl::desc("Don't allow loop unrolling to simulate more than this number of "
107 "iterations when checking full unroll profitability"));
108
110 "unroll-count", cl::Hidden,
111 cl::desc("Use this unroll count for all loops including those with "
112 "unroll_count pragma values, for testing purposes"));
113
115 "unroll-max-count", cl::Hidden,
116 cl::desc("Set the max unroll count for partial and runtime unrolling, for"
117 "testing purposes"));
118
120 "unroll-full-max-count", cl::Hidden,
121 cl::desc(
122 "Set the max unroll count for full unrolling, for testing purposes"));
123
124static cl::opt<bool>
125 UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
126 cl::desc("Allows loops to be partially unrolled until "
127 "-unroll-threshold loop size is reached."));
128
130 "unroll-allow-remainder", cl::Hidden,
131 cl::desc("Allow generation of a loop remainder (extra iterations) "
132 "when unrolling a loop."));
133
134static cl::opt<bool>
135 UnrollRuntime("unroll-runtime", cl::Hidden,
136 cl::desc("Unroll loops with run-time trip counts"));
137
139 "unroll-max-upperbound", cl::init(8), cl::Hidden,
140 cl::desc(
141 "The max of trip count upper bound that is considered in unrolling"));
142
144 "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
145 cl::desc("Unrolled size limit for loops with an unroll(full) or "
146 "unroll_count pragma."));
147
149 "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
150 cl::desc("If the runtime tripcount for the loop is lower than the "
151 "threshold, the loop is considered as flat and will be less "
152 "aggressively unrolled."));
153
155 "unroll-remainder", cl::Hidden,
156 cl::desc("Allow the loop remainder to be unrolled."));
157
158// This option isn't ever intended to be enabled, it serves to allow
159// experiments to check the assumptions about when this kind of revisit is
160// necessary.
162 "unroll-revisit-child-loops", cl::Hidden,
163 cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
164 "This shouldn't typically be needed as child loops (or their "
165 "clones) were already visited."));
166
168 "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
169 cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
170 "optimizations"));
172 UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
174 cl::desc("Default threshold (max size of unrolled "
175 "loop), used in all but O3 optimizations"));
176
178 "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
179 cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
180
181/// A magic value for use with the Threshold parameter to indicate
182/// that the loop unroll should be performed regardless of how much
183/// code expansion would result.
184static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
185
186/// Gather the various unrolling parameters based on the defaults, compiler
187/// flags, TTI overrides and user specified parameters.
191 OptimizationRemarkEmitter &ORE, int OptLevel,
192 std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
193 std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
194 std::optional<bool> UserUpperBound,
195 std::optional<unsigned> UserFullUnrollMaxCount) {
197
198 // Set up the defaults
199 UP.Threshold =
203 UP.PartialThreshold = 150;
205 UP.Count = 0;
207 UP.MaxCount = std::numeric_limits<unsigned>::max();
209 UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
210 UP.BEInsns = 2;
211 UP.Partial = false;
212 UP.Runtime = false;
213 UP.AllowRemainder = true;
214 UP.UnrollRemainder = false;
215 UP.AllowExpensiveTripCount = false;
216 UP.Force = false;
217 UP.UpperBound = false;
218 UP.UnrollAndJam = false;
222 UP.RuntimeUnrollMultiExit = false;
223 UP.AddAdditionalAccumulators = false;
224
225 // Override with any target specific settings
226 TTI.getUnrollingPreferences(L, SE, UP, &ORE);
227
228 // Apply size attributes
229 bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
230 // Let unroll hints / pragmas take precedence over PGSO.
232 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
234 if (OptForSize) {
238 }
239
240 // Apply any user values specified by cl::opt
241 if (UnrollThreshold.getNumOccurrences() > 0)
243 if (UnrollPartialThreshold.getNumOccurrences() > 0)
245 if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
247 if (UnrollMaxCount.getNumOccurrences() > 0)
249 if (UnrollMaxUpperBound.getNumOccurrences() > 0)
251 if (UnrollFullMaxCount.getNumOccurrences() > 0)
253 if (UnrollAllowPartial.getNumOccurrences() > 0)
255 if (UnrollAllowRemainder.getNumOccurrences() > 0)
257 if (UnrollRuntime.getNumOccurrences() > 0)
259 if (UnrollMaxUpperBound == 0)
260 UP.UpperBound = false;
261 if (UnrollUnrollRemainder.getNumOccurrences() > 0)
263 if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
265
266 // Apply user values provided by argument
267 if (UserThreshold) {
268 UP.Threshold = *UserThreshold;
269 UP.PartialThreshold = *UserThreshold;
270 }
271 if (UserCount)
272 UP.Count = *UserCount;
273 if (UserAllowPartial)
274 UP.Partial = *UserAllowPartial;
275 if (UserRuntime)
276 UP.Runtime = *UserRuntime;
277 if (UserUpperBound)
278 UP.UpperBound = *UserUpperBound;
279 if (UserFullUnrollMaxCount)
280 UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
281
282 return UP;
283}
284
285namespace {
286
287/// A struct to densely store the state of an instruction after unrolling at
288/// each iteration.
289///
290/// This is designed to work like a tuple of <Instruction *, int> for the
291/// purposes of hashing and lookup, but to be able to associate two boolean
292/// states with each key.
293struct UnrolledInstState {
294 Instruction *I;
295 int Iteration : 30;
296 unsigned IsFree : 1;
297 unsigned IsCounted : 1;
298};
299
300/// Hashing and equality testing for a set of the instruction states.
301struct UnrolledInstStateKeyInfo {
302 using PtrInfo = DenseMapInfo<Instruction *>;
303 using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
304
305 static inline UnrolledInstState getEmptyKey() {
306 return {PtrInfo::getEmptyKey(), 0, 0, 0};
307 }
308
309 static inline UnrolledInstState getTombstoneKey() {
310 return {PtrInfo::getTombstoneKey(), 0, 0, 0};
311 }
312
313 static inline unsigned getHashValue(const UnrolledInstState &S) {
314 return PairInfo::getHashValue({S.I, S.Iteration});
315 }
316
317 static inline bool isEqual(const UnrolledInstState &LHS,
318 const UnrolledInstState &RHS) {
319 return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
320 }
321};
322
323struct EstimatedUnrollCost {
324 /// The estimated cost after unrolling.
325 unsigned UnrolledCost;
326
327 /// The estimated dynamic cost of executing the instructions in the
328 /// rolled form.
329 unsigned RolledDynamicCost;
330};
331
332struct PragmaInfo {
333 PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
334 : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
335 PragmaEnableUnroll(PEU) {}
336 const bool UserUnrollCount;
337 const bool PragmaFullUnroll;
338 const unsigned PragmaCount;
339 const bool PragmaEnableUnroll;
340};
341
342} // end anonymous namespace
343
344/// Figure out if the loop is worth full unrolling.
345///
346/// Complete loop unrolling can make some loads constant, and we need to know
347/// if that would expose any further optimization opportunities. This routine
348/// estimates this optimization. It computes cost of unrolled loop
349/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
350/// dynamic cost we mean that we won't count costs of blocks that are known not
351/// to be executed (i.e. if we have a branch in the loop and we know that at the
352/// given iteration its condition would be resolved to true, we won't add up the
353/// cost of the 'false'-block).
354/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
355/// the analysis failed (no benefits expected from the unrolling, or the loop is
356/// too big to analyze), the returned value is std::nullopt.
357static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
358 const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
359 const SmallPtrSetImpl<const Value *> &EphValues,
360 const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
361 unsigned MaxIterationsCountToAnalyze) {
362 // We want to be able to scale offsets by the trip count and add more offsets
363 // to them without checking for overflows, and we already don't want to
364 // analyze *massive* trip counts, so we force the max to be reasonably small.
365 assert(MaxIterationsCountToAnalyze <
366 (unsigned)(std::numeric_limits<int>::max() / 2) &&
367 "The unroll iterations max is too large!");
368
369 // Only analyze inner loops. We can't properly estimate cost of nested loops
370 // and we won't visit inner loops again anyway.
371 if (!L->isInnermost())
372 return std::nullopt;
373
374 // Don't simulate loops with a big or unknown tripcount
375 if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
376 return std::nullopt;
377
380 DenseMap<Value *, Value *> SimplifiedValues;
381 SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;
382
383 // The estimated cost of the unrolled form of the loop. We try to estimate
384 // this by simplifying as much as we can while computing the estimate.
385 InstructionCost UnrolledCost = 0;
386
387 // We also track the estimated dynamic (that is, actually executed) cost in
388 // the rolled form. This helps identify cases when the savings from unrolling
389 // aren't just exposing dead control flows, but actual reduced dynamic
390 // instructions due to the simplifications which we expect to occur after
391 // unrolling.
392 InstructionCost RolledDynamicCost = 0;
393
394 // We track the simplification of each instruction in each iteration. We use
395 // this to recursively merge costs into the unrolled cost on-demand so that
396 // we don't count the cost of any dead code. This is essentially a map from
397 // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
399
400 // A small worklist used to accumulate cost of instructions from each
401 // observable and reached root in the loop.
403
404 // PHI-used worklist used between iterations while accumulating cost.
406
407 // Helper function to accumulate cost for instructions in the loop.
408 auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
409 assert(Iteration >= 0 && "Cannot have a negative iteration!");
410 assert(CostWorklist.empty() && "Must start with an empty cost list");
411 assert(PHIUsedList.empty() && "Must start with an empty phi used list");
412 CostWorklist.push_back(&RootI);
414 RootI.getFunction()->hasMinSize() ?
417 for (;; --Iteration) {
418 do {
419 Instruction *I = CostWorklist.pop_back_val();
420
421 // InstCostMap only uses I and Iteration as a key, the other two values
422 // don't matter here.
423 auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
424 if (CostIter == InstCostMap.end())
425 // If an input to a PHI node comes from a dead path through the loop
426 // we may have no cost data for it here. What that actually means is
427 // that it is free.
428 continue;
429 auto &Cost = *CostIter;
430 if (Cost.IsCounted)
431 // Already counted this instruction.
432 continue;
433
434 // Mark that we are counting the cost of this instruction now.
435 Cost.IsCounted = true;
436
437 // If this is a PHI node in the loop header, just add it to the PHI set.
438 if (auto *PhiI = dyn_cast<PHINode>(I))
439 if (PhiI->getParent() == L->getHeader()) {
440 assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
441 "inherently simplify during unrolling.");
442 if (Iteration == 0)
443 continue;
444
445 // Push the incoming value from the backedge into the PHI used list
446 // if it is an in-loop instruction. We'll use this to populate the
447 // cost worklist for the next iteration (as we count backwards).
448 if (auto *OpI = dyn_cast<Instruction>(
449 PhiI->getIncomingValueForBlock(L->getLoopLatch())))
450 if (L->contains(OpI))
451 PHIUsedList.push_back(OpI);
452 continue;
453 }
454
455 // First accumulate the cost of this instruction.
456 if (!Cost.IsFree) {
457 // Consider simplified operands in instruction cost.
459 transform(I->operands(), std::back_inserter(Operands),
460 [&](Value *Op) {
461 if (auto Res = SimplifiedValues.lookup(Op))
462 return Res;
463 return Op;
464 });
465 UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind);
467 << "Adding cost of instruction (iteration " << Iteration
468 << "): ");
469 LLVM_DEBUG(I->dump());
470 }
471
472 // We must count the cost of every operand which is not free,
473 // recursively. If we reach a loop PHI node, simply add it to the set
474 // to be considered on the next iteration (backwards!).
475 for (Value *Op : I->operands()) {
476 // Check whether this operand is free due to being a constant or
477 // outside the loop.
478 auto *OpI = dyn_cast<Instruction>(Op);
479 if (!OpI || !L->contains(OpI))
480 continue;
481
482 // Otherwise accumulate its cost.
483 CostWorklist.push_back(OpI);
484 }
485 } while (!CostWorklist.empty());
486
487 if (PHIUsedList.empty())
488 // We've exhausted the search.
489 break;
490
491 assert(Iteration > 0 &&
492 "Cannot track PHI-used values past the first iteration!");
493 CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
494 PHIUsedList.clear();
495 }
496 };
497
498 // Ensure that we don't violate the loop structure invariants relied on by
499 // this analysis.
500 assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
501 assert(L->isLCSSAForm(DT) &&
502 "Must have loops in LCSSA form to track live-out values.");
503
505 << "Starting LoopUnroll profitability analysis...\n");
506
508 L->getHeader()->getParent()->hasMinSize() ?
510 // Simulate execution of each iteration of the loop counting instructions,
511 // which would be simplified.
512 // Since the same load will take different values on different iterations,
513 // we literally have to go through all loop's iterations.
514 for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
515 LLVM_DEBUG(dbgs().indent(4) << "Analyzing iteration " << Iteration << "\n");
516
517 // Prepare for the iteration by collecting any simplified entry or backedge
518 // inputs.
519 for (Instruction &I : *L->getHeader()) {
520 auto *PHI = dyn_cast<PHINode>(&I);
521 if (!PHI)
522 break;
523
524 // The loop header PHI nodes must have exactly two input: one from the
525 // loop preheader and one from the loop latch.
526 assert(
527 PHI->getNumIncomingValues() == 2 &&
528 "Must have an incoming value only for the preheader and the latch.");
529
530 Value *V = PHI->getIncomingValueForBlock(
531 Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
532 if (Iteration != 0 && SimplifiedValues.count(V))
533 V = SimplifiedValues.lookup(V);
534 SimplifiedInputValues.push_back({PHI, V});
535 }
536
537 // Now clear and re-populate the map for the next iteration.
538 SimplifiedValues.clear();
539 while (!SimplifiedInputValues.empty())
540 SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
541
542 UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
543
544 BBWorklist.clear();
545 BBWorklist.insert(L->getHeader());
546 // Note that we *must not* cache the size, this loop grows the worklist.
547 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
548 BasicBlock *BB = BBWorklist[Idx];
549
550 // Visit all instructions in the given basic block and try to simplify
551 // it. We don't change the actual IR, just count optimization
552 // opportunities.
553 for (Instruction &I : *BB) {
554 // These won't get into the final code - don't even try calculating the
555 // cost for them.
556 if (EphValues.count(&I))
557 continue;
558
559 // Track this instruction's expected baseline cost when executing the
560 // rolled loop form.
561 RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);
562
563 // Visit the instruction to analyze its loop cost after unrolling,
564 // and if the visitor returns true, mark the instruction as free after
565 // unrolling and continue.
566 bool IsFree = Analyzer.visit(I);
567 bool Inserted = InstCostMap.insert({&I, (int)Iteration,
568 (unsigned)IsFree,
569 /*IsCounted*/ false}).second;
570 (void)Inserted;
571 assert(Inserted && "Cannot have a state for an unvisited instruction!");
572
573 if (IsFree)
574 continue;
575
576 // Can't properly model a cost of a call.
577 // FIXME: With a proper cost model we should be able to do it.
578 if (auto *CI = dyn_cast<CallInst>(&I)) {
579 const Function *Callee = CI->getCalledFunction();
580 if (!Callee || TTI.isLoweredToCall(Callee)) {
582 << "Can't analyze cost of loop with call\n");
583 return std::nullopt;
584 }
585 }
586
587 // If the instruction might have a side-effect recursively account for
588 // the cost of it and all the instructions leading up to it.
589 if (I.mayHaveSideEffects())
590 AddCostRecursively(I, Iteration);
591
592 // If unrolled body turns out to be too big, bail out.
593 if (UnrolledCost > MaxUnrolledLoopSize) {
594 LLVM_DEBUG({
595 dbgs().indent(4) << "Exceeded threshold.. exiting.\n";
596 dbgs().indent(4)
597 << "UnrolledCost: " << UnrolledCost
598 << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize << "\n";
599 });
600 return std::nullopt;
601 }
602 }
603
604 Instruction *TI = BB->getTerminator();
605
606 auto getSimplifiedConstant = [&](Value *V) -> Constant * {
607 if (SimplifiedValues.count(V))
608 V = SimplifiedValues.lookup(V);
609 return dyn_cast<Constant>(V);
610 };
611
612 // Add in the live successors by first checking whether we have terminator
613 // that may be simplified based on the values simplified by this call.
614 BasicBlock *KnownSucc = nullptr;
615 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
616 if (BI->isConditional()) {
617 if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
618 // Just take the first successor if condition is undef
619 if (isa<UndefValue>(SimpleCond))
620 KnownSucc = BI->getSuccessor(0);
621 else if (ConstantInt *SimpleCondVal =
622 dyn_cast<ConstantInt>(SimpleCond))
623 KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
624 }
625 }
626 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
627 if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
628 // Just take the first successor if condition is undef
629 if (isa<UndefValue>(SimpleCond))
630 KnownSucc = SI->getSuccessor(0);
631 else if (ConstantInt *SimpleCondVal =
632 dyn_cast<ConstantInt>(SimpleCond))
633 KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
634 }
635 }
636 if (KnownSucc) {
637 if (L->contains(KnownSucc))
638 BBWorklist.insert(KnownSucc);
639 else
640 ExitWorklist.insert({BB, KnownSucc});
641 continue;
642 }
643
644 // Add BB's successors to the worklist.
645 for (BasicBlock *Succ : successors(BB))
646 if (L->contains(Succ))
647 BBWorklist.insert(Succ);
648 else
649 ExitWorklist.insert({BB, Succ});
650 AddCostRecursively(*TI, Iteration);
651 }
652
653 // If we found no optimization opportunities on the first iteration, we
654 // won't find them on later ones too.
655 if (UnrolledCost == RolledDynamicCost) {
656 LLVM_DEBUG({
657 dbgs().indent(4) << "No opportunities found.. exiting.\n";
658 dbgs().indent(4) << "UnrolledCost: " << UnrolledCost << "\n";
659 });
660 return std::nullopt;
661 }
662 }
663
664 while (!ExitWorklist.empty()) {
665 BasicBlock *ExitingBB, *ExitBB;
666 std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
667
668 for (Instruction &I : *ExitBB) {
669 auto *PN = dyn_cast<PHINode>(&I);
670 if (!PN)
671 break;
672
673 Value *Op = PN->getIncomingValueForBlock(ExitingBB);
674 if (auto *OpI = dyn_cast<Instruction>(Op))
675 if (L->contains(OpI))
676 AddCostRecursively(*OpI, TripCount - 1);
677 }
678 }
679
680 assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
681 "All instructions must have a valid cost, whether the "
682 "loop is rolled or unrolled.");
683
684 LLVM_DEBUG({
685 dbgs().indent(4) << "Analysis finished:\n";
686 dbgs().indent(4) << "UnrolledCost: " << UnrolledCost
687 << ", RolledDynamicCost: " << RolledDynamicCost << "\n";
688 });
689 return {{unsigned(UnrolledCost.getValue()),
690 unsigned(RolledDynamicCost.getValue())}};
691}
692
694 const Loop *L, const TargetTransformInfo &TTI,
695 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
697 for (BasicBlock *BB : L->blocks())
698 Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
699 L);
700 NumInlineCandidates = Metrics.NumInlineCandidates;
701 NotDuplicatable = Metrics.notDuplicatable;
702 Convergence = Metrics.Convergence;
703 LoopSize = Metrics.NumInsts;
705 Metrics.Convergence != ConvergenceKind::Uncontrolled &&
707
708 // Don't allow an estimate of size zero. This would allows unrolling of loops
709 // with huge iteration counts, which is a compile time problem even if it's
710 // not a problem for code quality. Also, the code using this size may assume
711 // that each loop has at least three instructions (likely a conditional
712 // branch, a comparison feeding that branch, and some kind of loop increment
713 // feeding that comparison instruction).
714 if (LoopSize.isValid() && LoopSize < BEInsns + 1)
715 // This is an open coded max() on InstructionCost
716 LoopSize = BEInsns + 1;
717}
718
722 << "Not unrolling: contains convergent operations.\n");
723 return false;
724 }
725 if (!LoopSize.isValid()) {
727 << "Not unrolling: loop size could not be computed.\n");
728 return false;
729 }
730 if (NotDuplicatable) {
732 << "Not unrolling: contains non-duplicatable instructions.\n");
733 return false;
734 }
735 return true;
736}
737
740 unsigned CountOverwrite) const {
741 unsigned LS = LoopSize.getValue();
742 assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
743 if (CountOverwrite)
744 return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;
745 else
746 return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;
747}
748
749// Returns the loop hint metadata node with the given name (for example,
750// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
751// returned.
753 if (MDNode *LoopID = L->getLoopID())
754 return GetUnrollMetadata(LoopID, Name);
755 return nullptr;
756}
757
758// Returns true if the loop has an unroll(full) pragma.
759static bool hasUnrollFullPragma(const Loop *L) {
760 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
761}
762
763// Returns true if the loop has an unroll(enable) pragma. This metadata is used
764// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
765static bool hasUnrollEnablePragma(const Loop *L) {
766 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
767}
768
769// Returns true if the loop has an runtime unroll(disable) pragma.
770static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
771 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
772}
773
774// If loop has an unroll_count pragma return the (necessarily
775// positive) value from the pragma. Otherwise return 0.
776static unsigned unrollCountPragmaValue(const Loop *L) {
777 MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
778 if (MD) {
779 assert(MD->getNumOperands() == 2 &&
780 "Unroll count hint metadata should have two operands.");
781 unsigned Count =
782 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
783 assert(Count >= 1 && "Unroll count must be positive.");
784 return Count;
785 }
786 return 0;
787}
788
789// Computes the boosting factor for complete unrolling.
790// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
791// be beneficial to fully unroll the loop even if unrolledcost is large. We
792// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
793// the unroll threshold.
794static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
795 unsigned MaxPercentThresholdBoost) {
796 if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
797 return 100;
798 else if (Cost.UnrolledCost != 0)
799 // The boosting factor is RolledDynamicCost / UnrolledCost
800 return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
801 MaxPercentThresholdBoost);
802 else
803 return MaxPercentThresholdBoost;
804}
805
806static std::optional<unsigned>
807shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
808 const unsigned TripMultiple, const unsigned TripCount,
809 unsigned MaxTripCount, const UnrollCostEstimator UCE,
811
812 // Using unroll pragma
813 // 1st priority is unroll count set by "unroll-count" option.
814
815 if (PInfo.UserUnrollCount) {
816 if (UP.AllowRemainder &&
817 UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
818 return (unsigned)UnrollCount;
819 }
820
821 // 2nd priority is unroll count set by pragma.
822 if (PInfo.PragmaCount > 0) {
823 if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
824 return PInfo.PragmaCount;
825 }
826
827 if (PInfo.PragmaFullUnroll && TripCount != 0) {
828 // Certain cases with UBSAN can cause trip count to be calculated as
829 // INT_MAX, Block full unrolling at a reasonable limit so that the compiler
830 // doesn't hang trying to unroll the loop. See PR77842
831 if (TripCount > PragmaUnrollFullMaxIterations) {
832 LLVM_DEBUG(dbgs().indent(3) << "Won't unroll; trip count is too large\n");
833 return std::nullopt;
834 }
835
836 return TripCount;
837 }
838
839 if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
840 MaxTripCount <= UP.MaxUpperBound)
841 return MaxTripCount;
842
843 return std::nullopt;
844}
845
846static std::optional<unsigned> shouldFullUnroll(
849 const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
851 assert(FullUnrollTripCount && "should be non-zero!");
852
853 if (FullUnrollTripCount > UP.FullUnrollMaxCount)
854 return std::nullopt;
855
856 // When computing the unrolled size, note that BEInsns are not replicated
857 // like the rest of the loop body.
858 if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
859 return FullUnrollTripCount;
860
861 // The loop isn't that small, but we still can fully unroll it if that
862 // helps to remove a significant number of instructions.
863 // To check that, run additional analysis on the loop.
864 if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
865 L, FullUnrollTripCount, DT, SE, EphValues, TTI,
868 unsigned Boost =
870 if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
871 return FullUnrollTripCount;
872 }
873 return std::nullopt;
874}
875
876static std::optional<unsigned>
877shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
878 const UnrollCostEstimator UCE,
880
881 if (!TripCount)
882 return std::nullopt;
883
884 if (!UP.Partial) {
885 LLVM_DEBUG(dbgs().indent(3) << "Will not try to unroll partially because "
886 << "-unroll-allow-partial not given\n");
887 return 0;
888 }
889 unsigned count = UP.Count;
890 if (count == 0)
891 count = TripCount;
892 if (UP.PartialThreshold != NoThreshold) {
893 // Reduce unroll count to be modulo of TripCount for partial unrolling.
895 count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
896 (LoopSize - UP.BEInsns);
897 if (count > UP.MaxCount)
898 count = UP.MaxCount;
899 while (count != 0 && TripCount % count != 0)
900 count--;
901 if (UP.AllowRemainder && count <= 1) {
902 // If there is no Count that is modulo of TripCount, set Count to
903 // largest power-of-two factor that satisfies the threshold limit.
904 // As we'll create fixup loop, do the type of unrolling only if
905 // remainder loop is allowed.
906 // Note: DefaultUnrollRuntimeCount is used as a reasonable starting point
907 // even though this is partial unrolling (not runtime unrolling).
909 while (count != 0 &&
911 count >>= 1;
912 }
913 if (count < 2) {
914 count = 0;
915 }
916 } else {
917 count = TripCount;
918 }
919 if (count > UP.MaxCount)
920 count = UP.MaxCount;
921
923 << "Partially unrolling with count: " << count << "\n");
924
925 return count;
926}
927// Returns true if unroll count was set explicitly.
928// Calculates unroll count and writes it to UP.Count.
929// Unless IgnoreUser is true, will also use metadata and command-line options
930// that are specific to the LoopUnroll pass (which, for instance, are
931// irrelevant for the LoopUnrollAndJam pass).
932// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
933// many LoopUnroll-specific options. The shared functionality should be
934// refactored into it own function.
938 const SmallPtrSetImpl<const Value *> &EphValues,
939 OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
940 bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE,
942 TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
943
944 unsigned LoopSize = UCE.getRolledLoopSize();
945
946 const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
947 const bool PragmaFullUnroll = hasUnrollFullPragma(L);
948 const unsigned PragmaCount = unrollCountPragmaValue(L);
949 const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
950
951 const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
952 PragmaEnableUnroll || UserUnrollCount;
953
954 PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
955 PragmaEnableUnroll);
956 // Use an explicit peel count that has been specified for testing. In this
957 // case it's not permitted to also specify an explicit unroll count.
958 if (PP.PeelCount) {
959 if (UnrollCount.getNumOccurrences() > 0) {
960 reportFatalUsageError("Cannot specify both explicit peel count and "
961 "explicit unroll count");
962 }
963 UP.Count = 1;
964 UP.Runtime = false;
965 return true;
966 }
967 // Check for explicit Count.
968 // 1st priority is unroll count set by "unroll-count" option.
969 // 2nd priority is unroll count set by pragma.
970 if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
971 MaxTripCount, UCE, UP)) {
972 UP.Count = *UnrollFactor;
973
974 if (UserUnrollCount || (PragmaCount > 0)) {
975 UP.AllowExpensiveTripCount = true;
976 UP.Force = true;
977 }
978 UP.Runtime |= (PragmaCount > 0);
979 return ExplicitUnroll;
980 } else {
981 if (ExplicitUnroll && TripCount != 0) {
982 // If the loop has an unrolling pragma, we want to be more aggressive with
983 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
984 // value which is larger than the default limits.
985 UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
987 std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
988 }
989 }
990
991 // 3rd priority is exact full unrolling. This will eliminate all copies
992 // of some exit test.
993 UP.Count = 0;
994 if (TripCount) {
995 UP.Count = TripCount;
996 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
997 TripCount, UCE, UP)) {
998 UP.Count = *UnrollFactor;
999 UseUpperBound = false;
1000 return ExplicitUnroll;
1001 }
1002 }
1003
1004 // 4th priority is bounded unrolling.
1005 // We can unroll by the upper bound amount if it's generally allowed or if
1006 // we know that the loop is executed either the upper bound or zero times.
1007 // (MaxOrZero unrolling keeps only the first loop test, so the number of
1008 // loop tests remains the same compared to the non-unrolled version, whereas
1009 // the generic upper bound unrolling keeps all but the last loop test so the
1010 // number of loop tests goes up which may end up being worse on targets with
1011 // constrained branch predictor resources so is controlled by an option.)
1012 // In addition we only unroll small upper bounds.
1013 // Note that the cost of bounded unrolling is always strictly greater than
1014 // cost of exact full unrolling. As such, if we have an exact count and
1015 // found it unprofitable, we'll never chose to bounded unroll.
1016 if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
1017 MaxTripCount <= UP.MaxUpperBound) {
1018 UP.Count = MaxTripCount;
1019 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1020 MaxTripCount, UCE, UP)) {
1021 UP.Count = *UnrollFactor;
1022 UseUpperBound = true;
1023 return ExplicitUnroll;
1024 }
1025 }
1026
1027 // 5th priority is loop peeling.
1028 computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, UP.Threshold);
1029 if (PP.PeelCount) {
1030 UP.Runtime = false;
1031 UP.Count = 1;
1032 return ExplicitUnroll;
1033 }
1034
1035 // Before starting partial unrolling, set up.partial to true,
1036 // if user explicitly asked for unrolling
1037 if (TripCount)
1038 UP.Partial |= ExplicitUnroll;
1039
1040 // 6th priority is partial unrolling.
1041 // Try partial unroll only when TripCount could be statically calculated.
1042 if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
1043 UP.Count = *UnrollFactor;
1044
1045 if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
1046 UP.Count != TripCount)
1047 ORE->emit([&]() {
1049 "FullUnrollAsDirectedTooLarge",
1050 L->getStartLoc(), L->getHeader())
1051 << "Unable to fully unroll loop as directed by unroll pragma "
1052 "because "
1053 "unrolled size is too large.";
1054 });
1055
1056 if (UP.PartialThreshold != NoThreshold) {
1057 if (UP.Count == 0) {
1058 if (PragmaEnableUnroll)
1059 ORE->emit([&]() {
1061 "UnrollAsDirectedTooLarge",
1062 L->getStartLoc(), L->getHeader())
1063 << "Unable to unroll loop as directed by unroll(enable) "
1064 "pragma "
1065 "because unrolled size is too large.";
1066 });
1067 }
1068 }
1069 return ExplicitUnroll;
1070 }
1071 assert(TripCount == 0 &&
1072 "All cases when TripCount is constant should be covered here.");
1073 if (PragmaFullUnroll)
1074 ORE->emit([&]() {
1076 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
1077 L->getStartLoc(), L->getHeader())
1078 << "Unable to fully unroll loop as directed by unroll(full) "
1079 "pragma "
1080 "because loop has a runtime trip count.";
1081 });
1082
1083 // 7th priority is runtime unrolling.
1084 // Don't unroll a runtime trip count loop when it is disabled.
1086 UP.Count = 0;
1087 return false;
1088 }
1089
1090 // Don't unroll a small upper bound loop unless user or TTI asked to do so.
1091 if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
1092 UP.Count = 0;
1093 return false;
1094 }
1095
1096 // Check if the runtime trip count is too small when profile is available.
1097 if (L->getHeader()->getParent()->hasProfileData()) {
1098 if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1099 if (*ProfileTripCount < FlatLoopTripCountThreshold)
1100 return false;
1101 else
1102 UP.AllowExpensiveTripCount = true;
1103 }
1104 }
1105 UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
1106 if (!UP.Runtime) {
1108 << "Will not try to unroll loop with runtime trip count "
1109 << "because -unroll-runtime not given\n");
1110 UP.Count = 0;
1111 return false;
1112 }
1113 if (UP.Count == 0)
1115
1116 // Reduce unroll count to be the largest power-of-two factor of
1117 // the original count which satisfies the threshold limit.
1118 while (UP.Count != 0 &&
1120 UP.Count >>= 1;
1121
1122#ifndef NDEBUG
1123 unsigned OrigCount = UP.Count;
1124#endif
1125
1126 if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1127 while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1128 UP.Count >>= 1;
1130 << "Remainder loop is restricted (that could architecture "
1131 "specific or because the loop contains a convergent "
1132 "instruction), so unroll count must divide the trip "
1133 "multiple, "
1134 << TripMultiple << ". Reducing unroll count from " << OrigCount
1135 << " to " << UP.Count << ".\n");
1136
1137 using namespace ore;
1138
1139 if (PragmaCount > 0 && !UP.AllowRemainder)
1140 ORE->emit([&]() {
1142 "DifferentUnrollCountFromDirected",
1143 L->getStartLoc(), L->getHeader())
1144 << "Unable to unroll loop the number of times directed by "
1145 "unroll_count pragma because remainder loop is restricted "
1146 "(that could architecture specific or because the loop "
1147 "contains a convergent instruction) and so must have an "
1148 "unroll "
1149 "count that divides the loop trip multiple of "
1150 << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
1151 << NV("UnrollCount", UP.Count) << " time(s).";
1152 });
1153 }
1154
1155 if (UP.Count > UP.MaxCount)
1156 UP.Count = UP.MaxCount;
1157
1158 if (MaxTripCount && UP.Count > MaxTripCount)
1159 UP.Count = MaxTripCount;
1160
1162 << "Runtime unrolling with count: " << UP.Count << "\n");
1163 if (UP.Count < 2)
1164 UP.Count = 0;
1165 return ExplicitUnroll;
1166}
1167
1168static LoopUnrollResult
1172 ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1173 bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,
1174 std::optional<unsigned> ProvidedCount,
1175 std::optional<unsigned> ProvidedThreshold,
1176 std::optional<bool> ProvidedAllowPartial,
1177 std::optional<bool> ProvidedRuntime,
1178 std::optional<bool> ProvidedUpperBound,
1179 std::optional<bool> ProvidedAllowPeeling,
1180 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1181 std::optional<unsigned> ProvidedFullUnrollMaxCount,
1182 AAResults *AA = nullptr) {
1183
1184 LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1185 << L->getHeader()->getParent()->getName() << "] Loop %"
1186 << L->getHeader()->getName() << "\n");
1188 if (TM & TM_Disable)
1190
1191 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1192 // parent loop has an explicit unroll-and-jam pragma. This is to prevent
1193 // automatic unrolling from interfering with the user requested
1194 // transformation.
1195 Loop *ParentL = L->getParentLoop();
1196 if (ParentL != nullptr &&
1199 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling loop since parent loop has"
1200 << " llvm.loop.unroll_and_jam.\n");
1202 }
1203
1204 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1205 // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1206 // unrolling from interfering with the user requested transformation.
1209 LLVM_DEBUG(
1210 dbgs().indent(1)
1211 << "Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1213 }
1214
1215 if (!L->isLoopSimplifyForm()) {
1217 << "Not unrolling loop which is not in loop-simplify form.\n");
1219 }
1220
1221 // When automatic unrolling is disabled, do not unroll unless overridden for
1222 // this loop.
1223 if (OnlyWhenForced && !(TM & TM_Enable))
1225
1226 bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1228 L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
1229 ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1230 ProvidedFullUnrollMaxCount);
1232 L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
1233
1234 // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1235 // as threshold later on.
1236 if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1237 !OptForSize)
1239
1241 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1242
1243 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
1244 if (!UCE.canUnroll())
1246
1247 unsigned LoopSize = UCE.getRolledLoopSize();
1248 LLVM_DEBUG(dbgs() << "Loop Size = " << LoopSize << "\n");
1249
1250 // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1251 // later), to (fully) unroll loops, if it does not increase code size.
1252 if (OptForSize)
1253 UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1254
1255 if (UCE.NumInlineCandidates != 0) {
1257 << "Not unrolling loop with inlinable calls.\n");
1259 }
1260
1261 // Find the smallest exact trip count for any exit. This is an upper bound
1262 // on the loop trip count, but an exit at an earlier iteration is still
1263 // possible. An unroll by the smallest exact trip count guarantees that all
1264 // branches relating to at least one exit can be eliminated. This is unlike
1265 // the max trip count, which only guarantees that the backedge can be broken.
1266 unsigned TripCount = 0;
1267 unsigned TripMultiple = 1;
1268 SmallVector<BasicBlock *, 8> ExitingBlocks;
1269 L->getExitingBlocks(ExitingBlocks);
1270 for (BasicBlock *ExitingBlock : ExitingBlocks)
1271 if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1272 if (!TripCount || TC < TripCount)
1273 TripCount = TripMultiple = TC;
1274
1275 if (!TripCount) {
1276 // If no exact trip count is known, determine the trip multiple of either
1277 // the loop latch or the single exiting block.
1278 // TODO: Relax for multiple exits.
1279 BasicBlock *ExitingBlock = L->getLoopLatch();
1280 if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1281 ExitingBlock = L->getExitingBlock();
1282 if (ExitingBlock)
1283 TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1284 }
1285
1286 // If the loop contains a convergent operation, the prelude we'd add
1287 // to do the first few instructions before we hit the unrolled loop
1288 // is unsafe -- it adds a control-flow dependency to the convergent
1289 // operation. Therefore restrict remainder loop (try unrolling without).
1290 //
1291 // TODO: This is somewhat conservative; we could allow the remainder if the
1292 // trip count is uniform.
1294
1295 // Try to find the trip count upper bound if we cannot find the exact trip
1296 // count.
1297 unsigned MaxTripCount = 0;
1298 bool MaxOrZero = false;
1299 if (!TripCount) {
1300 MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1301 MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1302 }
1303
1304 // computeUnrollCount() decides whether it is beneficial to use upper bound to
1305 // fully unroll the loop.
1306 bool UseUpperBound = false;
1307 bool IsCountSetExplicitly = computeUnrollCount(
1308 L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount,
1309 MaxOrZero, TripMultiple, UCE, UP, PP, UseUpperBound);
1310 if (!UP.Count)
1312
1314
1315 if (PP.PeelCount) {
1316 assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
1317 LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1318 << " with iteration count " << PP.PeelCount << "!\n");
1319 ORE.emit([&]() {
1320 return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
1321 L->getHeader())
1322 << "peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
1323 << " iterations";
1324 });
1325
1326 ValueToValueMapTy VMap;
1327 peelLoop(L, PP.PeelCount, PP.PeelLast, LI, &SE, DT, &AC, PreserveLCSSA,
1328 VMap);
1329 simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, nullptr);
1330 // If the loop was peeled, we already "used up" the profile information
1331 // we had, so we don't want to unroll or peel again.
1333 L->setLoopAlreadyUnrolled();
1335 }
1336
1337 // Do not attempt partial/runtime unrolling in FullLoopUnrolling
1338 if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) ||
1339 UP.Count < TripCount || UP.Count < MaxTripCount)) {
1341 << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1343 }
1344
1345 // At this point, UP.Runtime indicates that run-time unrolling is allowed.
1346 // However, we only want to actually perform it if we don't know the trip
1347 // count and the unroll count doesn't divide the known trip multiple.
1348 // TODO: This decision should probably be pushed up into
1349 // computeUnrollCount().
1350 UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;
1351
1352 // Save loop properties before it is transformed.
1353 MDNode *OrigLoopID = L->getLoopID();
1354
1355 // Unroll the loop.
1356 Loop *RemainderLoop = nullptr;
1358 ULO.Count = UP.Count;
1359 ULO.Force = UP.Force;
1362 ULO.Runtime = UP.Runtime;
1363 ULO.ForgetAllSCEV = ForgetAllSCEV;
1368 LoopUnrollResult UnrollResult = UnrollLoop(
1369 L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
1370 if (UnrollResult == LoopUnrollResult::Unmodified)
1372
1373 if (RemainderLoop) {
1374 std::optional<MDNode *> RemainderLoopID =
1377 if (RemainderLoopID)
1378 RemainderLoop->setLoopID(*RemainderLoopID);
1379 }
1380
1381 if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1382 std::optional<MDNode *> NewLoopID =
1385 if (NewLoopID) {
1386 L->setLoopID(*NewLoopID);
1387
1388 // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1389 // explicitly.
1390 return UnrollResult;
1391 }
1392 }
1393
1394 // If loop has an unroll count pragma or unrolled by explicitly set count
1395 // mark loop as unrolled to prevent unrolling beyond that requested.
1396 if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
1397 L->setLoopAlreadyUnrolled();
1398
1399 return UnrollResult;
1400}
1401
1402namespace {
1403
1404class LoopUnroll : public LoopPass {
1405public:
1406 static char ID; // Pass ID, replacement for typeid
1407
1408 int OptLevel;
1409
1410 /// If false, use a cost model to determine whether unrolling of a loop is
1411 /// profitable. If true, only loops that explicitly request unrolling via
1412 /// metadata are considered. All other loops are skipped.
1413 bool OnlyWhenForced;
1414
1415 /// If false, when SCEV is invalidated, only forget everything in the
1416 /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1417 /// Otherwise, forgetAllLoops and rebuild when needed next.
1418 bool ForgetAllSCEV;
1419
1420 std::optional<unsigned> ProvidedCount;
1421 std::optional<unsigned> ProvidedThreshold;
1422 std::optional<bool> ProvidedAllowPartial;
1423 std::optional<bool> ProvidedRuntime;
1424 std::optional<bool> ProvidedUpperBound;
1425 std::optional<bool> ProvidedAllowPeeling;
1426 std::optional<bool> ProvidedAllowProfileBasedPeeling;
1427 std::optional<unsigned> ProvidedFullUnrollMaxCount;
1428
1429 LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1430 bool ForgetAllSCEV = false,
1431 std::optional<unsigned> Threshold = std::nullopt,
1432 std::optional<unsigned> Count = std::nullopt,
1433 std::optional<bool> AllowPartial = std::nullopt,
1434 std::optional<bool> Runtime = std::nullopt,
1435 std::optional<bool> UpperBound = std::nullopt,
1436 std::optional<bool> AllowPeeling = std::nullopt,
1437 std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1438 std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1439 : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1440 ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1441 ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1442 ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1443 ProvidedAllowPeeling(AllowPeeling),
1444 ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1445 ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1447 }
1448
1449 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1450 if (skipLoop(L))
1451 return false;
1452
1453 Function &F = *L->getHeader()->getParent();
1454
1455 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1456 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1457 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1458 const TargetTransformInfo &TTI =
1459 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1460 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1461 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1462 // pass. Function analyses need to be preserved across loop transformations
1463 // but ORE cannot be preserved (see comment before the pass definition).
1464 OptimizationRemarkEmitter ORE(&F);
1465 bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1466
1468 L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1469 /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1470 ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1471 ProvidedUpperBound, ProvidedAllowPeeling,
1472 ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);
1473
1474 if (Result == LoopUnrollResult::FullyUnrolled)
1475 LPM.markLoopAsDeleted(*L);
1476
1477 return Result != LoopUnrollResult::Unmodified;
1478 }
1479
1480 /// This transformation requires natural loop information & requires that
1481 /// loop preheaders be inserted into the CFG...
1482 void getAnalysisUsage(AnalysisUsage &AU) const override {
1483 AU.addRequired<AssumptionCacheTracker>();
1484 AU.addRequired<TargetTransformInfoWrapperPass>();
1485 // FIXME: Loop passes are required to preserve domtree, and for now we just
1486 // recreate dom info if anything gets unrolled.
1488 }
1489};
1490
1491} // end anonymous namespace
1492
1493char LoopUnroll::ID = 0;
1494
1495INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1499INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1500
1501Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1502 bool ForgetAllSCEV, int Threshold, int Count,
1503 int AllowPartial, int Runtime, int UpperBound,
1504 int AllowPeeling) {
1505 // TODO: It would make more sense for this function to take the optionals
1506 // directly, but that's dangerous since it would silently break out of tree
1507 // callers.
1508 return new LoopUnroll(
1509 OptLevel, OnlyWhenForced, ForgetAllSCEV,
1510 Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1511 Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1512 AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1513 Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
1514 UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1515 AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1516}
1517
1520 LPMUpdater &Updater) {
1521 // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1522 // pass. Function analyses need to be preserved across loop transformations
1523 // but ORE cannot be preserved (see comment before the pass definition).
1524 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1525
1526 // Keep track of the previous loop structure so we can identify new loops
1527 // created by unrolling.
1528 Loop *ParentL = L.getParentLoop();
1529 SmallPtrSet<Loop *, 4> OldLoops;
1530 if (ParentL)
1531 OldLoops.insert_range(*ParentL);
1532 else
1533 OldLoops.insert_range(AR.LI);
1534
1535 std::string LoopName = std::string(L.getName());
1536
1537 bool Changed =
1538 tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
1539 /*BFI*/ nullptr, /*PSI*/ nullptr,
1540 /*PreserveLCSSA*/ true, OptLevel, /*OnlyFullUnroll*/ true,
1541 OnlyWhenForced, ForgetSCEV, /*Count*/ std::nullopt,
1542 /*Threshold*/ std::nullopt, /*AllowPartial*/ false,
1543 /*Runtime*/ false, /*UpperBound*/ false,
1544 /*AllowPeeling*/ true,
1545 /*AllowProfileBasedPeeling*/ false,
1546 /*FullUnrollMaxCount*/ std::nullopt) !=
1548 if (!Changed)
1549 return PreservedAnalyses::all();
1550
1551 // The parent must not be damaged by unrolling!
1552#ifndef NDEBUG
1553 if (ParentL)
1554 ParentL->verifyLoop();
1555#endif
1556
1557 // Unrolling can do several things to introduce new loops into a loop nest:
1558 // - Full unrolling clones child loops within the current loop but then
1559 // removes the current loop making all of the children appear to be new
1560 // sibling loops.
1561 //
1562 // When a new loop appears as a sibling loop after fully unrolling,
1563 // its nesting structure has fundamentally changed and we want to revisit
1564 // it to reflect that.
1565 //
1566 // When unrolling has removed the current loop, we need to tell the
1567 // infrastructure that it is gone.
1568 //
1569 // Finally, we support a debugging/testing mode where we revisit child loops
1570 // as well. These are not expected to require further optimizations as either
1571 // they or the loop they were cloned from have been directly visited already.
1572 // But the debugging mode allows us to check this assumption.
1573 bool IsCurrentLoopValid = false;
1574 SmallVector<Loop *, 4> SibLoops;
1575 if (ParentL)
1576 SibLoops.append(ParentL->begin(), ParentL->end());
1577 else
1578 SibLoops.append(AR.LI.begin(), AR.LI.end());
1579 erase_if(SibLoops, [&](Loop *SibLoop) {
1580 if (SibLoop == &L) {
1581 IsCurrentLoopValid = true;
1582 return true;
1583 }
1584
1585 // Otherwise erase the loop from the list if it was in the old loops.
1586 return OldLoops.contains(SibLoop);
1587 });
1588 Updater.addSiblingLoops(SibLoops);
1589
1590 if (!IsCurrentLoopValid) {
1591 Updater.markLoopAsDeleted(L, LoopName);
1592 } else {
1593 // We can only walk child loops if the current loop remained valid.
1595 // Walk *all* of the child loops.
1596 SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1597 Updater.addChildLoops(ChildLoops);
1598 }
1599 }
1600
1602}
1603
1606 auto &LI = AM.getResult<LoopAnalysis>(F);
1607 // There are no loops in the function. Return before computing other expensive
1608 // analyses.
1609 if (LI.empty())
1610 return PreservedAnalyses::all();
1611 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1612 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1613 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1614 auto &AC = AM.getResult<AssumptionAnalysis>(F);
1617
1618 LoopAnalysisManager *LAM = nullptr;
1619 if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1620 LAM = &LAMProxy->getManager();
1621
1622 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1623 ProfileSummaryInfo *PSI =
1624 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1625 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1626 &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1627
1628 bool Changed = false;
1629
1630 // The unroller requires loops to be in simplified form, and also needs LCSSA.
1631 // Since simplification may add new inner loops, it has to run before the
1632 // legality and profitability checks. This means running the loop unroller
1633 // will simplify all loops, regardless of whether anything end up being
1634 // unrolled.
1635 for (const auto &L : LI) {
1636 Changed |=
1637 simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1638 Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1639 }
1640
1641 // Add the loop nests in the reverse order of LoopInfo. See method
1642 // declaration.
1644 appendLoopsToWorklist(LI, Worklist);
1645
1646 while (!Worklist.empty()) {
1647 // Because the LoopInfo stores the loops in RPO, we walk the worklist
1648 // from back to front so that we work forward across the CFG, which
1649 // for unrolling is only needed to get optimization remarks emitted in
1650 // a forward order.
1651 Loop &L = *Worklist.pop_back_val();
1652#ifndef NDEBUG
1653 Loop *ParentL = L.getParentLoop();
1654#endif
1655
1656 // Check if the profile summary indicates that the profiled application
1657 // has a huge working set size, in which case we disable peeling to avoid
1658 // bloating it further.
1659 std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1660 if (PSI && PSI->hasHugeWorkingSetSize())
1661 LocalAllowPeeling = false;
1662 std::string LoopName = std::string(L.getName());
1663 // The API here is quite complex to call and we allow to select some
1664 // flavors of unrolling during construction time (by setting UnrollOpts).
1666 &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1667 /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*OnlyFullUnroll*/ false,
1668 UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV,
1669 /*Count*/ std::nullopt,
1670 /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
1671 UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1672 UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount,
1673 &AA);
1675
1676 // The parent must not be damaged by unrolling!
1677#ifndef NDEBUG
1678 if (Result != LoopUnrollResult::Unmodified && ParentL)
1679 ParentL->verifyLoop();
1680#endif
1681
1682 // Clear any cached analysis results for L if we removed it completely.
1683 if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1684 LAM->clear(L, LoopName);
1685 }
1686
1687 if (!Changed)
1688 return PreservedAnalyses::all();
1689
1691}
1692
1694 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1695 static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
1696 OS, MapClassName2PassName);
1697 OS << '<';
1698 if (UnrollOpts.AllowPartial != std::nullopt)
1699 OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1700 if (UnrollOpts.AllowPeeling != std::nullopt)
1701 OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1702 if (UnrollOpts.AllowRuntime != std::nullopt)
1703 OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1704 if (UnrollOpts.AllowUpperBound != std::nullopt)
1705 OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1706 if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1707 OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1708 << "profile-peeling;";
1709 if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1710 OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';
1711 OS << 'O' << UnrollOpts.OptLevel;
1712 OS << '>';
1713}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
This header provides classes for managing per-loop analyses.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
static MDNode * getUnrollMetadataForLoop(const Loop *L, StringRef Name)
static cl::opt< unsigned > UnrollMaxCount("unroll-max-count", cl::Hidden, cl::desc("Set the max unroll count for partial and runtime unrolling, for" "testing purposes"))
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
static cl::opt< unsigned > UnrollThresholdDefault("unroll-threshold-default", cl::init(150), cl::Hidden, cl::desc("Default threshold (max size of unrolled " "loop), used in all but O3 optimizations"))
static cl::opt< unsigned > FlatLoopTripCountThreshold("flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, cl::desc("If the runtime tripcount for the loop is lower than the " "threshold, the loop is considered as flat and will be less " "aggressively unrolled."))
static cl::opt< unsigned > UnrollOptSizeThreshold("unroll-optsize-threshold", cl::init(0), cl::Hidden, cl::desc("The cost threshold for loop unrolling when optimizing for " "size"))
static bool hasUnrollFullPragma(const Loop *L)
static cl::opt< bool > UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled."))
static unsigned unrollCountPragmaValue(const Loop *L)
static bool hasUnrollEnablePragma(const Loop *L)
static cl::opt< unsigned > UnrollFullMaxCount("unroll-full-max-count", cl::Hidden, cl::desc("Set the max unroll count for full unrolling, for testing purposes"))
static cl::opt< unsigned > UnrollMaxUpperBound("unroll-max-upperbound", cl::init(8), cl::Hidden, cl::desc("The max of trip count upper bound that is considered in unrolling"))
static std::optional< unsigned > shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static std::optional< EstimatedUnrollCost > analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize, unsigned MaxIterationsCountToAnalyze)
Figure out if the loop is worth full unrolling.
static cl::opt< unsigned > UnrollPartialThreshold("unroll-partial-threshold", cl::Hidden, cl::desc("The cost threshold for partial loop unrolling"))
static cl::opt< bool > UnrollAllowRemainder("unroll-allow-remainder", cl::Hidden, cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop."))
static std::optional< unsigned > shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< unsigned > PragmaUnrollFullMaxIterations("pragma-unroll-full-max-iterations", cl::init(1 '000 '000), cl::Hidden, cl::desc("Maximum allowed iterations to unroll under pragma unroll full."))
static const unsigned NoThreshold
A magic value for use with the Threshold parameter to indicate that the loop unroll should be perform...
static std::optional< unsigned > shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, unsigned MaxTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< bool > UnrollRevisitChildLoops("unroll-revisit-child-loops", cl::Hidden, cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " "This shouldn't typically be needed as child loops (or their " "clones) were already visited."))
static cl::opt< unsigned > UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling"))
static cl::opt< bool > UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts"))
static LoopUnrollResult tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV, std::optional< unsigned > ProvidedCount, std::optional< unsigned > ProvidedThreshold, std::optional< bool > ProvidedAllowPartial, std::optional< bool > ProvidedRuntime, std::optional< bool > ProvidedUpperBound, std::optional< bool > ProvidedAllowPeeling, std::optional< bool > ProvidedAllowProfileBasedPeeling, std::optional< unsigned > ProvidedFullUnrollMaxCount, AAResults *AA=nullptr)
static bool hasRuntimeUnrollDisablePragma(const Loop *L)
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost)
static cl::opt< unsigned > UnrollThresholdAggressive("unroll-threshold-aggressive", cl::init(300), cl::Hidden, cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " "optimizations"))
static cl::opt< unsigned > UnrollMaxIterationsCountToAnalyze("unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of " "iterations when checking full unroll profitability"))
static cl::opt< unsigned > UnrollMaxPercentThresholdBoost("unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " "to the threshold when aggressively unrolling a loop due to the " "dynamic cost savings. If completely unrolling a loop will reduce " "the total runtime from X to Y, we boost the loop unroll " "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " "X/Y). This limit avoids excessive code bloat."))
static cl::opt< unsigned > PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 *1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma."))
static cl::opt< bool > UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Trace Metrics
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
This file contains the declarations for metadata subclasses.
LoopAnalysisManager LAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
A manager for alias analyses.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
This is an important base class in LLVM.
Definition Constant.h:43
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:174
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
void addChildLoops(ArrayRef< Loop * > NewChildLoops)
Loop passes should use this method to indicate they have added new child loops of the current loop.
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
void addSiblingLoops(ArrayRef< Loop * > NewSibLoops)
Loop passes should use this method to indicate they have added new sibling loops to the current loop.
void markLoopAsDeleted(Loop &L)
Definition LoopPass.cpp:111
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
void verifyLoop() const
Verify loop structure.
iterator end() const
iterator begin() const
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
iterator end() const
iterator begin() const
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition LoopInfo.cpp:526
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1450
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
bool empty() const
Determine if the PriorityWorklist is empty or not.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isBackedgeTakenCountMaxOrZero(const Loop *L)
Return true if the backedge taken count is either the value returned by getConstantMaxBackedgeTakenCo...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
value_type pop_back_val()
Definition SetVector.h:279
A version of PriorityWorklist that selects small size optimized data structures for the vector and ma...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
ConvergenceKind Convergence
Definition UnrollLoop.h:141
LLVM_ABI uint64_t getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite=0) const
Returns loop size estimation for unrolled loop, given the unrolling configuration specified by UP.
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
LLVM_ABI UnrollCostEstimator(const Loop *L, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &EphValues, unsigned BEInsns)
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
void visit(Iterator Start, Iterator End)
Definition InstVisitor.h:87
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:167
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
Changed
Abstract Attribute helper functions.
Definition Attributor.h:165
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
Add a small namespace to avoid name clashes with the classes used in the streaming interface.
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
LLVM_ABI void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, AAResults *AA=nullptr)
Perform some cleanup and simplifications on loops after unrolling.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
LLVM_ABI std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
LLVM_ABI char & LCSSAID
Definition LCSSA.cpp:526
LLVM_ABI Pass * createLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false, int Threshold=-1, int Count=-1, int AllowPartial=-1, int Runtime=-1, int UpperBound=-1, int AllowPeeling=-1)
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
LLVM_ABI void initializeLoopUnrollPass(PassRegistry &)
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI CallBase * getLoopConvergenceHeart(const Loop *TheLoop)
Find the convergence heart of the loop.
LLVM_ABI TransformationMode hasUnrollAndJamTransformation(const Loop *L)
cl::opt< bool > ForgetSCEVInLoopUnroll
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache *AC=nullptr, unsigned Threshold=UINT_MAX)
Definition LoopPeel.cpp:751
LLVM_TEMPLATE_ABI void appendLoopsToWorklist(RangeT &&, SmallPriorityWorklist< Loop *, 4 > &)
Utility that implements appending of loops onto a worklist given a range.
LLVM_ABI cl::opt< unsigned > SCEVCheapExpansionBudget
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI TransformationMode hasUnrollTransformation(const Loop *L)
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition UnrollLoop.h:58
@ PartiallyUnrolled
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
Definition UnrollLoop.h:65
@ Unmodified
The loop was not modified.
Definition UnrollLoop.h:60
@ FullyUnrolled
The loop was fully unrolled into straight-line code.
Definition UnrollLoop.h:69
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
void peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &VMap)
VMap is the value-map that maps instructions from the original loop to instructions in the last peele...
const char *const LLVMLoopUnrollFollowupAll
Definition UnrollLoop.h:45
TargetTransformInfo TTI
TransformationMode
The mode sets how eager a transformation should be applied.
Definition LoopUtils.h:283
@ TM_ForcedByUser
The transformation was directed by the user, e.g.
Definition LoopUtils.h:300
@ TM_Disable
The transformation should not be applied.
Definition LoopUtils.h:292
@ TM_Enable
The transformation should be applied without considering a cost model.
Definition LoopUtils.h:289
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
const char *const LLVMLoopUnrollFollowupRemainder
Definition UnrollLoop.h:48
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const char *const LLVMLoopUnrollFollowupUnrolled
Definition UnrollLoop.h:46
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2192
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI MDNode * GetUnrollMetadata(MDNode *LoopID, StringRef Name)
Given an llvm.loop loop id metadata node, returns the loop hint metadata node with the given name (fo...
LLVM_ABI LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const llvm::TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop=nullptr, AAResults *AA=nullptr)
Unroll the given loop by Count.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition CodeMetrics.h:34
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:70
bool PeelLast
Peel off the last PeelCount loop iterations.
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodied of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned MaxPercentThresholdBoost
If complete unrolling will reduce the cost of the loop, we will boost the Threshold by a certain perc...
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
bool AllowRemainder
Allow generation of a loop remainder (extra iterations after unroll).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
unsigned MaxUpperBound
Set the maximum upper bound of trip count.
const Instruction * Heart
Definition UnrollLoop.h:79