LLVM 23.0.0git
LoopUnrollPass.cpp
Go to the documentation of this file.
1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller. It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
11// counts of loops easily.
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/StringRef.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/CFG.h"
38#include "llvm/IR/Constant.h"
39#include "llvm/IR/Constants.h"
41#include "llvm/IR/Dominators.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Metadata.h"
46#include "llvm/IR/PassManager.h"
48#include "llvm/Pass.h"
51#include "llvm/Support/Debug.h"
63#include <algorithm>
64#include <cassert>
65#include <cstdint>
66#include <limits>
67#include <optional>
68#include <string>
69#include <tuple>
70#include <utility>
71
72using namespace llvm;
73
74#define DEBUG_TYPE "loop-unroll"
75
77 "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
78 cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
79 " the current top-most loop. This is sometimes preferred to reduce"
80 " compile time."));
81
83 UnrollThreshold("unroll-threshold", cl::Hidden,
84 cl::desc("The cost threshold for loop unrolling"));
85
88 "unroll-optsize-threshold", cl::init(0), cl::Hidden,
89 cl::desc("The cost threshold for loop unrolling when optimizing for "
90 "size"));
91
93 "unroll-partial-threshold", cl::Hidden,
94 cl::desc("The cost threshold for partial loop unrolling"));
95
97 "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
98 cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
99 "to the threshold when aggressively unrolling a loop due to the "
100 "dynamic cost savings. If completely unrolling a loop will reduce "
101 "the total runtime from X to Y, we boost the loop unroll "
102 "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
103 "X/Y). This limit avoids excessive code bloat."));
104
106 "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
107 cl::desc("Don't allow loop unrolling to simulate more than this number of "
108 "iterations when checking full unroll profitability"));
109
111 "unroll-count", cl::Hidden,
112 cl::desc("Use this unroll count for all loops including those with "
113 "unroll_count pragma values, for testing purposes"));
114
116 "unroll-max-count", cl::Hidden,
117 cl::desc("Set the max unroll count for partial and runtime unrolling, for"
118 "testing purposes"));
119
121 "unroll-full-max-count", cl::Hidden,
122 cl::desc(
123 "Set the max unroll count for full unrolling, for testing purposes"));
124
125static cl::opt<bool>
126 UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
127 cl::desc("Allows loops to be partially unrolled until "
128 "-unroll-threshold loop size is reached."));
129
131 "unroll-allow-remainder", cl::Hidden,
132 cl::desc("Allow generation of a loop remainder (extra iterations) "
133 "when unrolling a loop."));
134
135static cl::opt<bool>
136 UnrollRuntime("unroll-runtime", cl::Hidden,
137 cl::desc("Unroll loops with run-time trip counts"));
138
140 "unroll-max-upperbound", cl::init(8), cl::Hidden,
141 cl::desc(
142 "The max of trip count upper bound that is considered in unrolling"));
143
145 "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
146 cl::desc("Unrolled size limit for loops with unroll metadata "
147 "(full, enable, or count)."));
148
150 "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
151 cl::desc("If the runtime tripcount for the loop is lower than the "
152 "threshold, the loop is considered as flat and will be less "
153 "aggressively unrolled."));
154
156 "unroll-remainder", cl::Hidden,
157 cl::desc("Allow the loop remainder to be unrolled."));
158
159// This option isn't ever intended to be enabled, it serves to allow
160// experiments to check the assumptions about when this kind of revisit is
161// necessary.
163 "unroll-revisit-child-loops", cl::Hidden,
164 cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
165 "This shouldn't typically be needed as child loops (or their "
166 "clones) were already visited."));
167
169 "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
170 cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
171 "optimizations"));
173 UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
175 cl::desc("Default threshold (max size of unrolled "
176 "loop), used in all but O3 optimizations"));
177
179 "pragma-unroll-full-max-iterations", cl::init(1'000'000), cl::Hidden,
180 cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
181
182/// A magic value for use with the Threshold parameter to indicate
183/// that the loop unroll should be performed regardless of how much
184/// code expansion would result.
185static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
186
187/// Gather the various unrolling parameters based on the defaults, compiler
188/// flags, TTI overrides and user specified parameters.
192 OptimizationRemarkEmitter &ORE, int OptLevel,
193 std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
194 std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
195 std::optional<bool> UserUpperBound,
196 std::optional<unsigned> UserFullUnrollMaxCount) {
198
199 // Set up the defaults
200 UP.Threshold =
204 UP.PartialThreshold = 150;
206 UP.Count = 0;
208 UP.MaxCount = std::numeric_limits<unsigned>::max();
210 UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
211 UP.BEInsns = 2;
212 UP.Partial = false;
213 UP.Runtime = false;
214 UP.AllowRemainder = true;
215 UP.UnrollRemainder = false;
216 UP.AllowExpensiveTripCount = false;
217 UP.Force = false;
218 UP.UpperBound = false;
219 UP.UnrollAndJam = false;
223 UP.RuntimeUnrollMultiExit = false;
224 UP.AddAdditionalAccumulators = false;
225
226 // Override with any target specific settings
227 TTI.getUnrollingPreferences(L, SE, UP, &ORE);
228
229 // Apply size attributes
230 bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
231 // Let unroll hints / pragmas take precedence over PGSO.
233 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
235 if (OptForSize) {
239 }
240
241 // Apply any user values specified by cl::opt
242 if (UnrollThreshold.getNumOccurrences() > 0)
244 if (UnrollPartialThreshold.getNumOccurrences() > 0)
246 if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
248 if (UnrollMaxCount.getNumOccurrences() > 0)
250 if (UnrollMaxUpperBound.getNumOccurrences() > 0)
252 if (UnrollFullMaxCount.getNumOccurrences() > 0)
254 if (UnrollAllowPartial.getNumOccurrences() > 0)
256 if (UnrollAllowRemainder.getNumOccurrences() > 0)
258 if (UnrollRuntime.getNumOccurrences() > 0)
260 if (UnrollMaxUpperBound == 0)
261 UP.UpperBound = false;
262 if (UnrollUnrollRemainder.getNumOccurrences() > 0)
264 if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
266
267 // Apply user values provided by argument
268 if (UserThreshold) {
269 UP.Threshold = *UserThreshold;
270 UP.PartialThreshold = *UserThreshold;
271 }
272 if (UserCount)
273 UP.Count = *UserCount;
274 if (UserAllowPartial)
275 UP.Partial = *UserAllowPartial;
276 if (UserRuntime)
277 UP.Runtime = *UserRuntime;
278 if (UserUpperBound)
279 UP.UpperBound = *UserUpperBound;
280 if (UserFullUnrollMaxCount)
281 UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
282
283 return UP;
284}
285
286namespace {
287
288/// A struct to densely store the state of an instruction after unrolling at
289/// each iteration.
290///
291/// This is designed to work like a tuple of <Instruction *, int> for the
292/// purposes of hashing and lookup, but to be able to associate two boolean
293/// states with each key.
294struct UnrolledInstState {
295 Instruction *I;
296 int Iteration : 30;
297 unsigned IsFree : 1;
298 unsigned IsCounted : 1;
299};
300
301/// Hashing and equality testing for a set of the instruction states.
302struct UnrolledInstStateKeyInfo {
303 using PtrInfo = DenseMapInfo<Instruction *>;
304 using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
305
306 static inline UnrolledInstState getEmptyKey() {
307 return {PtrInfo::getEmptyKey(), 0, 0, 0};
308 }
309
310 static inline unsigned getHashValue(const UnrolledInstState &S) {
311 return PairInfo::getHashValue({S.I, S.Iteration});
312 }
313
314 static inline bool isEqual(const UnrolledInstState &LHS,
315 const UnrolledInstState &RHS) {
316 return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
317 }
318};
319
320struct EstimatedUnrollCost {
321 /// The estimated cost after unrolling.
322 unsigned UnrolledCost;
323
324 /// The estimated dynamic cost of executing the instructions in the
325 /// rolled form.
326 unsigned RolledDynamicCost;
327};
328
329} // end anonymous namespace
330
331/// Figure out if the loop is worth full unrolling.
332///
333/// Complete loop unrolling can make some loads constant, and we need to know
334/// if that would expose any further optimization opportunities. This routine
335/// estimates this optimization. It computes cost of unrolled loop
336/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
337/// dynamic cost we mean that we won't count costs of blocks that are known not
338/// to be executed (i.e. if we have a branch in the loop and we know that at the
339/// given iteration its condition would be resolved to true, we won't add up the
340/// cost of the 'false'-block).
341/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
342/// the analysis failed (no benefits expected from the unrolling, or the loop is
343/// too big to analyze), the returned value is std::nullopt.
344static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
345 const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
346 const SmallPtrSetImpl<const Value *> &EphValues,
347 const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
348 unsigned MaxIterationsCountToAnalyze) {
349 // We want to be able to scale offsets by the trip count and add more offsets
350 // to them without checking for overflows, and we already don't want to
351 // analyze *massive* trip counts, so we force the max to be reasonably small.
352 assert(MaxIterationsCountToAnalyze <
353 (unsigned)(std::numeric_limits<int>::max() / 2) &&
354 "The unroll iterations max is too large!");
355
356 // Only analyze inner loops. We can't properly estimate cost of nested loops
357 // and we won't visit inner loops again anyway.
358 if (!L->isInnermost()) {
360 << "Not analyzing loop cost: not an innermost loop.\n");
361 return std::nullopt;
362 }
363
364 // Don't simulate loops with a big or unknown tripcount
365 if (!TripCount || TripCount > MaxIterationsCountToAnalyze) {
367 << "Not analyzing loop cost: trip count "
368 << (TripCount ? "too large" : "unknown") << ".\n");
369 return std::nullopt;
370 }
371
374 DenseMap<Value *, Value *> SimplifiedValues;
375 SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;
376
377 // The estimated cost of the unrolled form of the loop. We try to estimate
378 // this by simplifying as much as we can while computing the estimate.
379 InstructionCost UnrolledCost = 0;
380
381 // We also track the estimated dynamic (that is, actually executed) cost in
382 // the rolled form. This helps identify cases when the savings from unrolling
383 // aren't just exposing dead control flows, but actual reduced dynamic
384 // instructions due to the simplifications which we expect to occur after
385 // unrolling.
386 InstructionCost RolledDynamicCost = 0;
387
388 // We track the simplification of each instruction in each iteration. We use
389 // this to recursively merge costs into the unrolled cost on-demand so that
390 // we don't count the cost of any dead code. This is essentially a map from
391 // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
393
394 // A small worklist used to accumulate cost of instructions from each
395 // observable and reached root in the loop.
397
398 // PHI-used worklist used between iterations while accumulating cost.
400
401 // Helper function to accumulate cost for instructions in the loop.
402 auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
403 assert(Iteration >= 0 && "Cannot have a negative iteration!");
404 assert(CostWorklist.empty() && "Must start with an empty cost list");
405 assert(PHIUsedList.empty() && "Must start with an empty phi used list");
406 CostWorklist.push_back(&RootI);
408 RootI.getFunction()->hasMinSize() ?
411 for (;; --Iteration) {
412 do {
413 Instruction *I = CostWorklist.pop_back_val();
414
415 // InstCostMap only uses I and Iteration as a key, the other two values
416 // don't matter here.
417 auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
418 if (CostIter == InstCostMap.end())
419 // If an input to a PHI node comes from a dead path through the loop
420 // we may have no cost data for it here. What that actually means is
421 // that it is free.
422 continue;
423 auto &Cost = *CostIter;
424 if (Cost.IsCounted)
425 // Already counted this instruction.
426 continue;
427
428 // Mark that we are counting the cost of this instruction now.
429 Cost.IsCounted = true;
430
431 // If this is a PHI node in the loop header, just add it to the PHI set.
432 if (auto *PhiI = dyn_cast<PHINode>(I))
433 if (PhiI->getParent() == L->getHeader()) {
434 assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
435 "inherently simplify during unrolling.");
436 if (Iteration == 0)
437 continue;
438
439 // Push the incoming value from the backedge into the PHI used list
440 // if it is an in-loop instruction. We'll use this to populate the
441 // cost worklist for the next iteration (as we count backwards).
442 if (auto *OpI = dyn_cast<Instruction>(
443 PhiI->getIncomingValueForBlock(L->getLoopLatch())))
444 if (L->contains(OpI))
445 PHIUsedList.push_back(OpI);
446 continue;
447 }
448
449 // First accumulate the cost of this instruction.
450 if (!Cost.IsFree) {
451 // Consider simplified operands in instruction cost.
453 transform(I->operands(), std::back_inserter(Operands),
454 [&](Value *Op) {
455 if (auto Res = SimplifiedValues.lookup(Op))
456 return Res;
457 return Op;
458 });
459 UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind);
461 << "Adding cost of instruction (iteration " << Iteration
462 << "): ");
463 LLVM_DEBUG(I->dump());
464 }
465
466 // We must count the cost of every operand which is not free,
467 // recursively. If we reach a loop PHI node, simply add it to the set
468 // to be considered on the next iteration (backwards!).
469 for (Value *Op : I->operands()) {
470 // Check whether this operand is free due to being a constant or
471 // outside the loop.
472 auto *OpI = dyn_cast<Instruction>(Op);
473 if (!OpI || !L->contains(OpI))
474 continue;
475
476 // Otherwise accumulate its cost.
477 CostWorklist.push_back(OpI);
478 }
479 } while (!CostWorklist.empty());
480
481 if (PHIUsedList.empty())
482 // We've exhausted the search.
483 break;
484
485 assert(Iteration > 0 &&
486 "Cannot track PHI-used values past the first iteration!");
487 CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
488 PHIUsedList.clear();
489 }
490 };
491
492 // Ensure that we don't violate the loop structure invariants relied on by
493 // this analysis.
494 assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
495 assert(L->isLCSSAForm(DT) &&
496 "Must have loops in LCSSA form to track live-out values.");
497
499 << "Starting LoopUnroll profitability analysis...\n");
500
502 L->getHeader()->getParent()->hasMinSize() ?
504 // Simulate execution of each iteration of the loop counting instructions,
505 // which would be simplified.
506 // Since the same load will take different values on different iterations,
507 // we literally have to go through all loop's iterations.
508 for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
509 LLVM_DEBUG(dbgs().indent(3) << "Analyzing iteration " << Iteration << "\n");
510
511 // Prepare for the iteration by collecting any simplified entry or backedge
512 // inputs.
513 for (Instruction &I : *L->getHeader()) {
514 auto *PHI = dyn_cast<PHINode>(&I);
515 if (!PHI)
516 break;
517
518 // The loop header PHI nodes must have exactly two input: one from the
519 // loop preheader and one from the loop latch.
520 assert(
521 PHI->getNumIncomingValues() == 2 &&
522 "Must have an incoming value only for the preheader and the latch.");
523
524 Value *V = PHI->getIncomingValueForBlock(
525 Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
526 if (Iteration != 0 && SimplifiedValues.count(V))
527 V = SimplifiedValues.lookup(V);
528 SimplifiedInputValues.push_back({PHI, V});
529 }
530
531 // Now clear and re-populate the map for the next iteration.
532 SimplifiedValues.clear();
533 while (!SimplifiedInputValues.empty())
534 SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
535
536 UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
537
538 BBWorklist.clear();
539 BBWorklist.insert(L->getHeader());
540 // Note that we *must not* cache the size, this loop grows the worklist.
541 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
542 BasicBlock *BB = BBWorklist[Idx];
543
544 // Visit all instructions in the given basic block and try to simplify
545 // it. We don't change the actual IR, just count optimization
546 // opportunities.
547 for (Instruction &I : *BB) {
548 // These won't get into the final code - don't even try calculating the
549 // cost for them.
550 if (EphValues.count(&I))
551 continue;
552
553 // Track this instruction's expected baseline cost when executing the
554 // rolled loop form.
555 RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);
556
557 // Visit the instruction to analyze its loop cost after unrolling,
558 // and if the visitor returns true, mark the instruction as free after
559 // unrolling and continue.
560 bool IsFree = Analyzer.visit(I);
561 bool Inserted = InstCostMap.insert({&I, (int)Iteration,
562 (unsigned)IsFree,
563 /*IsCounted*/ false}).second;
564 (void)Inserted;
565 assert(Inserted && "Cannot have a state for an unvisited instruction!");
566
567 if (IsFree)
568 continue;
569
570 // Can't properly model a cost of a call.
571 // FIXME: With a proper cost model we should be able to do it.
572 if (auto *CI = dyn_cast<CallInst>(&I)) {
573 const Function *Callee = CI->getCalledFunction();
574 if (!Callee || TTI.isLoweredToCall(Callee)) {
576 << "Can't analyze cost of loop with call\n");
577 return std::nullopt;
578 }
579 }
580
581 // If the instruction might have a side-effect recursively account for
582 // the cost of it and all the instructions leading up to it.
583 if (I.mayHaveSideEffects())
584 AddCostRecursively(I, Iteration);
585
586 // If unrolled body turns out to be too big, bail out.
587 if (UnrolledCost > MaxUnrolledLoopSize) {
588 LLVM_DEBUG({
589 dbgs().indent(3) << "Exceeded threshold.. exiting.\n";
590 dbgs().indent(3)
591 << "UnrolledCost: " << UnrolledCost
592 << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize << "\n";
593 });
594 return std::nullopt;
595 }
596 }
597
598 Instruction *TI = BB->getTerminator();
599
600 auto getSimplifiedConstant = [&](Value *V) -> Constant * {
601 if (SimplifiedValues.count(V))
602 V = SimplifiedValues.lookup(V);
603 return dyn_cast<Constant>(V);
604 };
605
606 // Add in the live successors by first checking whether we have terminator
607 // that may be simplified based on the values simplified by this call.
608 BasicBlock *KnownSucc = nullptr;
609 if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
610 if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
611 // Just take the first successor if condition is undef
612 if (isa<UndefValue>(SimpleCond))
613 KnownSucc = BI->getSuccessor(0);
614 else if (ConstantInt *SimpleCondVal =
615 dyn_cast<ConstantInt>(SimpleCond))
616 KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
617 }
618 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
619 if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
620 // Just take the first successor if condition is undef
621 if (isa<UndefValue>(SimpleCond))
622 KnownSucc = SI->getSuccessor(0);
623 else if (ConstantInt *SimpleCondVal =
624 dyn_cast<ConstantInt>(SimpleCond))
625 KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
626 }
627 }
628 if (KnownSucc) {
629 if (L->contains(KnownSucc))
630 BBWorklist.insert(KnownSucc);
631 else
632 ExitWorklist.insert({BB, KnownSucc});
633 continue;
634 }
635
636 // Add BB's successors to the worklist.
637 for (BasicBlock *Succ : successors(BB))
638 if (L->contains(Succ))
639 BBWorklist.insert(Succ);
640 else
641 ExitWorklist.insert({BB, Succ});
642 AddCostRecursively(*TI, Iteration);
643 }
644
645 // If we found no optimization opportunities on the first iteration, we
646 // won't find them on later ones too.
647 if (UnrolledCost == RolledDynamicCost) {
648 LLVM_DEBUG({
649 dbgs().indent(3) << "No opportunities found.. exiting.\n";
650 dbgs().indent(3) << "UnrolledCost: " << UnrolledCost << "\n";
651 });
652 return std::nullopt;
653 }
654 }
655
656 while (!ExitWorklist.empty()) {
657 BasicBlock *ExitingBB, *ExitBB;
658 std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
659
660 for (Instruction &I : *ExitBB) {
661 auto *PN = dyn_cast<PHINode>(&I);
662 if (!PN)
663 break;
664
665 Value *Op = PN->getIncomingValueForBlock(ExitingBB);
666 if (auto *OpI = dyn_cast<Instruction>(Op))
667 if (L->contains(OpI))
668 AddCostRecursively(*OpI, TripCount - 1);
669 }
670 }
671
672 assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
673 "All instructions must have a valid cost, whether the "
674 "loop is rolled or unrolled.");
675
676 LLVM_DEBUG({
677 dbgs().indent(3) << "Analysis finished:\n";
678 dbgs().indent(3) << "UnrolledCost: " << UnrolledCost
679 << ", RolledDynamicCost: " << RolledDynamicCost << "\n";
680 });
681 return {{unsigned(UnrolledCost.getValue()),
682 unsigned(RolledDynamicCost.getValue())}};
683}
684
686 const Loop *L, const TargetTransformInfo &TTI,
687 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns,
688 bool TripCountIsUniform) {
690 for (BasicBlock *BB : L->blocks())
691 Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
692 L);
693 NumInlineCandidates = Metrics.NumInlineCandidates;
694 NotDuplicatable = Metrics.notDuplicatable;
695 Convergence = Metrics.Convergence;
696 LoopSize = Metrics.NumInsts;
697 // Convergent operations make the remainder prelude unsafe by adding a
698 // control-flow dependency, unless the trip count is uniform per
699 // UniformityInfo, in which case all paths agree and the remainder is safe.
701 (Metrics.Convergence != ConvergenceKind::Uncontrolled &&
703 TripCountIsUniform;
704
705 // Don't allow an estimate of size zero. This would allows unrolling of loops
706 // with huge iteration counts, which is a compile time problem even if it's
707 // not a problem for code quality. Also, the code using this size may assume
708 // that each loop has at least three instructions (likely a conditional
709 // branch, a comparison feeding that branch, and some kind of loop increment
710 // feeding that comparison instruction).
711 if (LoopSize.isValid() && LoopSize < BEInsns + 1)
712 // This is an open coded max() on InstructionCost
713 LoopSize = BEInsns + 1;
714}
715
717 const Loop *L) const {
718 auto ReportCannotUnroll = [&](StringRef Reason) {
719 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: " << Reason << ".\n");
720 if (ORE && L)
721 ORE->emit([&]() {
722 return OptimizationRemarkMissed(DEBUG_TYPE, "CannotUnrollLoop",
723 L->getStartLoc(), L->getHeader())
724 << "unable to unroll loop: " << Reason;
725 });
726 };
727
729 ReportCannotUnroll("contains convergent operations");
730 return false;
731 }
732 if (!LoopSize.isValid()) {
733 ReportCannotUnroll("loop size could not be computed");
734 return false;
735 }
736 if (NotDuplicatable) {
737 ReportCannotUnroll("contains non-duplicatable instructions");
738 return false;
739 }
740 return true;
741}
742
745 unsigned CountOverwrite) const {
746 unsigned LS = LoopSize.getValue();
747 assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
748 if (CountOverwrite)
749 return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;
750 else
751 return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;
752}
753
754// Returns true if the loop has an unroll(full) pragma.
755static bool hasUnrollFullPragma(const Loop *L) {
756 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
757}
758
759// Returns true if the loop has an unroll(enable) pragma. This metadata is used
760// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
761static bool hasUnrollEnablePragma(const Loop *L) {
762 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
763}
764
765// Returns true if the loop has a runtime unroll(disable) pragma.
766static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
767 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
768}
769
770/// Returns true if the SCEV expression is uniform, i.e., all threads in a
771/// convergent execution agree on its value. Recursively checks operands.
772/// Returns false if the SCEV could not be computed.
773static bool isSCEVUniform(const SCEV *S, UniformityInfo &UI) {
775 return false;
776 if (isa<SCEVConstant>(S))
777 return true;
778 if (auto *U = dyn_cast<SCEVUnknown>(S))
779 return UI.isUniformAtDef(U->getValue());
780 for (const SCEV *Op : S->operands()) {
781 if (!isSCEVUniform(Op, UI))
782 return false;
783 }
784 return true;
785}
786
787// If loop has an unroll_count pragma return the (necessarily
788// positive) value from the pragma. Otherwise return 0.
789static unsigned unrollCountPragmaValue(const Loop *L) {
790 MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
791 if (MD) {
792 assert(MD->getNumOperands() == 2 &&
793 "Unroll count hint metadata should have two operands.");
794 unsigned Count =
795 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
796 assert(Count >= 1 && "Unroll count must be positive.");
797 return Count;
798 }
799 return 0;
800}
801
810
811// Computes the boosting factor for complete unrolling.
812// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
813// be beneficial to fully unroll the loop even if unrolledcost is large. We
814// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
815// the unroll threshold.
816static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
817 unsigned MaxPercentThresholdBoost) {
818 if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
819 return 100;
820 else if (Cost.UnrolledCost != 0)
821 // The boosting factor is RolledDynamicCost / UnrolledCost
822 return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
823 MaxPercentThresholdBoost);
824 else
825 return MaxPercentThresholdBoost;
826}
827
828static std::optional<unsigned>
830 const unsigned TripMultiple, const unsigned TripCount,
831 unsigned MaxTripCount, const UnrollCostEstimator UCE,
834
835 // Using unroll pragma
836 // 1st priority is unroll count set by "unroll-count" option.
837
838 if (PInfo.UserUnrollCount) {
839 if (UP.AllowRemainder &&
840 UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold) {
841 LLVM_DEBUG(dbgs().indent(2) << "Unrolling with user-specified count: "
842 << UnrollCount << ".\n");
843 return (unsigned)UnrollCount;
844 }
846 << "Not unrolling with user count " << UnrollCount << ": "
847 << (UP.AllowRemainder ? "exceeds threshold"
848 : "remainder not allowed")
849 << ".\n");
850 }
851
852 // 2nd priority is unroll count set by pragma.
853 if (PInfo.PragmaCount > 0) {
854 if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0))) {
855 LLVM_DEBUG(dbgs().indent(2) << "Unrolling with pragma count: "
856 << PInfo.PragmaCount << ".\n");
857 return PInfo.PragmaCount;
858 }
860 << "Not unrolling with pragma count " << PInfo.PragmaCount
861 << ": remainder not allowed, count does not divide trip "
862 << "multiple " << TripMultiple << ".\n");
863 ORE->emit([&]() {
864 return OptimizationRemarkAnalysis(DEBUG_TYPE, "PragmaUnrollCountRejected",
865 L->getStartLoc(), L->getHeader())
866 << "may be unable to unroll loop with count "
867 << ore::NV("PragmaCount", PInfo.PragmaCount)
868 << ": remainder loop is not allowed and count does not divide "
869 "trip multiple "
870 << ore::NV("TripMultiple", TripMultiple);
871 });
872 }
873
874 if (PInfo.PragmaFullUnroll) {
875 if (TripCount != 0) {
876 // Certain cases with UBSAN can cause trip count to be calculated as
877 // INT_MAX, Block full unrolling at a reasonable limit so that the
878 // compiler doesn't hang trying to unroll the loop. See PR77842
879 if (TripCount > PragmaUnrollFullMaxIterations) {
881 << "Won't unroll; trip count is too large.\n");
882 ORE->emit([&]() {
884 "PragmaFullUnrollTripCountTooLarge",
885 L->getStartLoc(), L->getHeader())
886 << "may be unable to fully unroll loop: trip count "
887 << ore::NV("TripCount", TripCount) << " exceeds limit "
889 });
890 return std::nullopt;
891 }
892
894 << "Fully unrolling with trip count: " << TripCount << ".\n");
895 return TripCount;
896 }
898 << "Not fully unrolling: unknown trip count.\n");
899 ORE->emit([&]() {
901 "PragmaFullUnrollUnknownTripCount",
902 L->getStartLoc(), L->getHeader())
903 << "may be unable to fully unroll loop: trip count is unknown";
904 });
905 }
906
907 if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
908 MaxTripCount <= UP.MaxUpperBound) {
910 << "Unrolling with max trip count: " << MaxTripCount << ".\n");
911 return MaxTripCount;
912 }
913
914 return std::nullopt;
915}
916
917static std::optional<unsigned> shouldFullUnroll(
920 const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
922 assert(FullUnrollTripCount && "should be non-zero!");
923
924 if (FullUnrollTripCount > UP.FullUnrollMaxCount) {
926 << "Not unrolling: trip count " << FullUnrollTripCount
927 << " exceeds max count " << UP.FullUnrollMaxCount << ".\n");
928 return std::nullopt;
929 }
930
931 // When computing the unrolled size, note that BEInsns are not replicated
932 // like the rest of the loop body.
933 uint64_t UnrolledSize = UCE.getUnrolledLoopSize(UP, FullUnrollTripCount);
934 if (UnrolledSize < UP.Threshold) {
935 LLVM_DEBUG(dbgs().indent(2) << "Unrolling: size " << UnrolledSize
936 << " < threshold " << UP.Threshold << ".\n");
937 return FullUnrollTripCount;
938 }
939
941 << "Unrolled size " << UnrolledSize << " exceeds threshold "
942 << UP.Threshold << "; checking for cost benefit.\n");
943
944 // The loop isn't that small, but we still can fully unroll it if that
945 // helps to remove a significant number of instructions.
946 // To check that, run additional analysis on the loop.
947 if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
948 L, FullUnrollTripCount, DT, SE, EphValues, TTI,
951 unsigned Boost =
953 unsigned BoostedThreshold = UP.Threshold * Boost / 100;
954 if (Cost->UnrolledCost < BoostedThreshold) {
955 LLVM_DEBUG(dbgs().indent(2) << "Profitable after cost analysis.\n");
956 return FullUnrollTripCount;
957 }
959 << "Not unrolling: cost " << Cost->UnrolledCost
960 << " >= boosted threshold " << BoostedThreshold << ".\n");
961 }
962
963 return std::nullopt;
964}
965
966static std::optional<unsigned>
967shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
968 const UnrollCostEstimator UCE,
970
971 if (!TripCount)
972 return std::nullopt;
973
974 if (!UP.Partial) {
975 LLVM_DEBUG(dbgs().indent(2) << "Will not try to unroll partially because "
976 << "-unroll-allow-partial not given\n");
977 return 0;
978 }
979 unsigned count = UP.Count;
980 if (count == 0)
981 count = TripCount;
982 if (UP.PartialThreshold != NoThreshold) {
983 // Reduce unroll count to be modulo of TripCount for partial unrolling.
984 if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) {
985 unsigned NewCount =
986 (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
987 (LoopSize - UP.BEInsns);
989 << "Unrolled size exceeds threshold; reducing count "
990 << "from " << count << " to " << NewCount << ".\n");
991 count = NewCount;
992 }
993 if (count > UP.MaxCount)
994 count = UP.MaxCount;
995 while (count != 0 && TripCount % count != 0)
996 count--;
997 if (UP.AllowRemainder && count <= 1) {
998 // If there is no Count that is modulo of TripCount, set Count to
999 // largest power-of-two factor that satisfies the threshold limit.
1000 // As we'll create fixup loop, do the type of unrolling only if
1001 // remainder loop is allowed.
1002 // Note: DefaultUnrollRuntimeCount is used as a reasonable starting point
1003 // even though this is partial unrolling (not runtime unrolling).
1005 while (count != 0 &&
1007 count >>= 1;
1008 }
1009 if (count < 2) {
1011 << "Will not partially unroll: no profitable count.\n");
1012 count = 0;
1013 }
1014 } else {
1015 count = TripCount;
1016 }
1017 if (count > UP.MaxCount)
1018 count = UP.MaxCount;
1019
1021 << "Partially unrolling with count: " << count << "\n");
1022
1023 return count;
1024}
1025// Calculates unroll count and writes it to UP.Count.
1026// Unless IgnoreUser is true, will also use metadata and command-line options
1027// that are specific to the LoopUnroll pass (which, for instance, are
1028// irrelevant for the LoopUnrollAndJam pass).
1029// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
1030// many LoopUnroll-specific options. The shared functionality should be
1031// refactored into it own function.
1033 DominatorTree &DT, LoopInfo *LI,
1035 const SmallPtrSetImpl<const Value *> &EphValues,
1037 const unsigned TripCount,
1038 const unsigned MaxTripCount, const bool MaxOrZero,
1039 const unsigned TripMultiple,
1040 const UnrollCostEstimator &UCE,
1043
1044 unsigned LoopSize = UCE.getRolledLoopSize();
1045
1046 LLVM_DEBUG(dbgs().indent(1) << "Computing unroll count: TripCount="
1047 << TripCount << ", MaxTripCount=" << MaxTripCount
1048 << (MaxOrZero ? " (MaxOrZero)" : "")
1049 << ", TripMultiple=" << TripMultiple << "\n");
1050
1051 UnrollPragmaInfo PInfo(L);
1052 LLVM_DEBUG({
1053 if (PInfo.ExplicitUnroll) {
1054 dbgs().indent(1) << "Explicit unroll requested:";
1055 if (PInfo.UserUnrollCount)
1056 dbgs() << " user-count";
1057 if (PInfo.PragmaFullUnroll)
1058 dbgs() << " pragma-full";
1059 if (PInfo.PragmaCount > 0)
1060 dbgs() << " pragma-count(" << PInfo.PragmaCount << ")";
1061 if (PInfo.PragmaEnableUnroll)
1062 dbgs() << " pragma-enable";
1063 dbgs() << "\n";
1064 }
1065 });
1066
1067 // Use an explicit peel count that has been specified for testing. In this
1068 // case it's not permitted to also specify an explicit unroll count.
1069 if (PP.PeelCount) {
1070 if (UnrollCount.getNumOccurrences() > 0) {
1071 reportFatalUsageError("Cannot specify both explicit peel count and "
1072 "explicit unroll count");
1073 }
1075 << "Using explicit peel count: " << PP.PeelCount << ".\n");
1076 UP.Count = 1;
1077 UP.Runtime = false;
1078 return;
1079 }
1080
1081 // If a user provided an explicit unroll pragma (with or without count),
1082 // enable runtime unrolling and override expensive trip count checks.
1083 if (PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0) {
1084 UP.AllowExpensiveTripCount = true;
1085 UP.Runtime = true;
1086 }
1087
1088 // Check for explicit Count.
1089 // 1st priority is unroll count set by "unroll-count" option.
1090 // 2nd priority is unroll count set by pragma.
1091 LLVM_DEBUG(dbgs().indent(1) << "Trying pragma unroll...\n");
1092 if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
1093 MaxTripCount, UCE, UP, ORE)) {
1094 UP.Count = *UnrollFactor;
1095
1096 if (PInfo.UserUnrollCount || (PInfo.PragmaCount > 0)) {
1097 UP.AllowExpensiveTripCount = true;
1098 UP.Force = true;
1099 }
1100 return;
1101 } else {
1102 if (PInfo.ExplicitUnroll && TripCount != 0) {
1103 // If the loop has an unrolling pragma, we want to be more aggressive with
1104 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
1105 // value which is larger than the default limits.
1106 UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
1107 UP.PartialThreshold =
1108 std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
1109 }
1110 }
1111
1112 // 3rd priority is exact full unrolling. This will eliminate all copies
1113 // of some exit test.
1114 LLVM_DEBUG(dbgs().indent(1) << "Trying full unroll...\n");
1115 assert(UP.Count == 0);
1116 if (TripCount) {
1117 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1118 TripCount, UCE, UP)) {
1119 UP.Count = *UnrollFactor;
1120 return;
1121 }
1122 }
1123
1124 // 4th priority is bounded unrolling.
1125 // We can unroll by the upper bound amount if it's generally allowed or if
1126 // we know that the loop is executed either the upper bound or zero times.
1127 // (MaxOrZero unrolling keeps only the first loop test, so the number of
1128 // loop tests remains the same compared to the non-unrolled version, whereas
1129 // the generic upper bound unrolling keeps all but the last loop test so the
1130 // number of loop tests goes up which may end up being worse on targets with
1131 // constrained branch predictor resources so is controlled by an option.)
1132 // In addition we only unroll small upper bounds.
1133 // Note that the cost of bounded unrolling is always strictly greater than
1134 // cost of exact full unrolling. As such, if we have an exact count and
1135 // found it unprofitable, we'll never chose to bounded unroll.
1136 LLVM_DEBUG(dbgs().indent(1) << "Trying upper-bound unroll...\n");
1137 if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
1138 MaxTripCount <= UP.MaxUpperBound) {
1139 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1140 MaxTripCount, UCE, UP)) {
1141 UP.Count = *UnrollFactor;
1142 return;
1143 }
1144 }
1145
1146 // 5th priority is loop peeling.
1147 LLVM_DEBUG(dbgs().indent(1) << "Trying loop peeling...\n");
1148 computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, UP.Threshold);
1149 if (PP.PeelCount) {
1151 << "Peeling with count: " << PP.PeelCount << ".\n");
1152 UP.Runtime = false;
1153 UP.Count = 1;
1154 return;
1155 }
1156
1157 // Before starting partial unrolling, set up.partial to true,
1158 // if user explicitly asked for unrolling
1159 if (TripCount)
1160 UP.Partial |= PInfo.ExplicitUnroll;
1161
1162 // 6th priority is partial unrolling.
1163 // Try partial unroll only when TripCount could be statically calculated.
1164 LLVM_DEBUG(dbgs().indent(1) << "Trying partial unroll...\n");
1165 if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
1166 UP.Count = *UnrollFactor;
1167 return;
1168 }
1169 assert(TripCount == 0 &&
1170 "All cases when TripCount is constant should be covered here.");
1171
1172 // 7th priority is runtime unrolling.
1173 LLVM_DEBUG(dbgs().indent(1) << "Trying runtime unroll...\n");
1174 // Don't unroll a runtime trip count loop when it is disabled.
1175 if (PInfo.PragmaRuntimeUnrollDisable) {
1177 << "Not runtime unrolling: disabled by pragma.\n");
1178 return;
1179 }
1180
1181 // Don't unroll a small upper bound loop unless user or TTI asked to do so.
1182 if (MaxTripCount && !UP.Force && MaxTripCount <= UP.MaxUpperBound) {
1183 LLVM_DEBUG(dbgs().indent(2) << "Not runtime unrolling: max trip count "
1184 << MaxTripCount << " is small (<= "
1185 << UP.MaxUpperBound << ") and not forced.\n");
1186 return;
1187 }
1188
1189 // Check if the runtime trip count is too small when profile is available.
1190 if (L->getHeader()->getParent()->hasProfileData()) {
1191 if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1192 if (*ProfileTripCount < FlatLoopTripCountThreshold)
1193 return;
1194 else
1195 UP.AllowExpensiveTripCount = true;
1196 }
1197 }
1198 if (!UP.Runtime) {
1200 << "Will not try to unroll loop with runtime trip count "
1201 << "because -unroll-runtime not given\n");
1202 return;
1203 }
1204
1205 assert(UP.Count == 0);
1207
1208 // Reduce unroll count to be the largest power-of-two factor of
1209 // the original count which satisfies the threshold limit.
1210 while (UP.Count != 0 &&
1212 UP.Count >>= 1;
1213
1214#ifndef NDEBUG
1215 unsigned OrigCount = UP.Count;
1216#endif
1217
1218 if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1219 while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1220 UP.Count >>= 1;
1222 << "Remainder loop is restricted (that could be architecture "
1223 "specific or because the loop contains a convergent "
1224 "instruction), so unroll count must divide the trip "
1225 "multiple, "
1226 << TripMultiple << ". Reducing unroll count from " << OrigCount
1227 << " to " << UP.Count << ".\n");
1228 }
1229
1230 if (UP.Count > UP.MaxCount)
1231 UP.Count = UP.MaxCount;
1232
1233 if (MaxTripCount && UP.Count > MaxTripCount)
1234 UP.Count = MaxTripCount;
1235
1236 if (UP.Count < 2)
1237 UP.Count = 0;
1238 else
1240 << "Runtime unrolling with count: " << UP.Count << "\n");
1241 return;
1242}
1243
1244static LoopUnrollResult
1248 ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1249 bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,
1250 std::optional<unsigned> ProvidedCount,
1251 std::optional<unsigned> ProvidedThreshold,
1252 std::optional<bool> ProvidedAllowPartial,
1253 std::optional<bool> ProvidedRuntime,
1254 std::optional<bool> ProvidedUpperBound,
1255 std::optional<bool> ProvidedAllowPeeling,
1256 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1257 std::optional<unsigned> ProvidedFullUnrollMaxCount,
1258 UniformityInfo *UI = nullptr, AAResults *AA = nullptr) {
1259
1260 LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1261 << L->getHeader()->getParent()->getName() << "] Loop %"
1262 << L->getHeader()->getName()
1263 << " (depth=" << L->getLoopDepth() << ")\n");
1265 if (TM & TM_Disable) {
1266 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: transformation disabled by "
1267 << "metadata.\n");
1269 }
1270
1271 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1272 // parent loop has an explicit unroll-and-jam pragma. This is to prevent
1273 // automatic unrolling from interfering with the user requested
1274 // transformation.
1275 Loop *ParentL = L->getParentLoop();
1276 if (ParentL != nullptr &&
1279 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling loop since parent loop has"
1280 << " llvm.loop.unroll_and_jam.\n");
1282 }
1283
1284 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1285 // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1286 // unrolling from interfering with the user requested transformation.
1289 LLVM_DEBUG(
1290 dbgs().indent(1)
1291 << "Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1293 }
1294
1295 if (!L->isLoopSimplifyForm()) {
1297 << "Not unrolling loop which is not in loop-simplify form.\n");
1298 if (TM & TM_ForcedByUser) {
1299 ORE.emit([&]() {
1300 return OptimizationRemarkMissed(DEBUG_TYPE, "NotInLoopSimplifyForm",
1301 L->getStartLoc(), L->getHeader())
1302 << "unable to unroll loop: not in loop-simplify form";
1303 });
1304 }
1306 }
1307
1308 // When automatic unrolling is disabled, do not unroll unless overridden for
1309 // this loop.
1310 if (OnlyWhenForced && !(TM & TM_Enable)) {
1311 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: automatic unrolling "
1312 << "disabled and loop not explicitly "
1313 << "enabled.\n");
1315 }
1316
1317 bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1319 L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
1320 ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1321 ProvidedFullUnrollMaxCount);
1323 L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
1324
1325 // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1326 // as threshold later on.
1327 if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1328 !OptForSize) {
1329 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: all thresholds are zero.\n");
1330 if (TM & TM_ForcedByUser) {
1331 ORE.emit([&]() {
1332 return OptimizationRemarkMissed(DEBUG_TYPE, "UnrollThresholdsZero",
1333 L->getStartLoc(), L->getHeader())
1334 << "unable to unroll loop: unroll threshold is zero";
1335 });
1336 }
1338 }
1339
1341 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1342
1343 // Check if the backedge-taken count is uniform before constructing UCE.
1344 // This is used to allow runtime unrolling with a remainder for convergent
1345 // loops when all threads agree on the trip count.
1346 const SCEV *BTC = SE.getBackedgeTakenCount(L);
1347 bool TripCountIsUniform = UI && isSCEVUniform(BTC, *UI);
1348 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns, TripCountIsUniform);
1349 if (!UCE.canUnroll((TM & TM_ForcedByUser) ? &ORE : nullptr, L))
1351
1352 unsigned LoopSize = UCE.getRolledLoopSize();
1353 LLVM_DEBUG(dbgs() << "Loop Size = " << LoopSize << "\n");
1354
1355 // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1356 // later), to (fully) unroll loops, if it does not increase code size.
1357 if (OptForSize)
1358 UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1359
1360 if (UCE.NumInlineCandidates != 0) {
1362 << "Not unrolling loop with inlinable calls.\n");
1363 if (TM & TM_ForcedByUser) {
1364 ORE.emit([&]() {
1366 "InlineCandidatesPreventUnroll",
1367 L->getStartLoc(), L->getHeader())
1368 << "unable to unroll loop: contains inlinable calls";
1369 });
1370 }
1372 }
1373
1374 // Find the smallest exact trip count for any exit. This is an upper bound
1375 // on the loop trip count, but an exit at an earlier iteration is still
1376 // possible. An unroll by the smallest exact trip count guarantees that all
1377 // branches relating to at least one exit can be eliminated. This is unlike
1378 // the max trip count, which only guarantees that the backedge can be broken.
1379 unsigned TripCount = 0;
1380 unsigned TripMultiple = 1;
1381 SmallVector<BasicBlock *, 8> ExitingBlocks;
1382 L->getExitingBlocks(ExitingBlocks);
1383 for (BasicBlock *ExitingBlock : ExitingBlocks)
1384 if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1385 if (!TripCount || TC < TripCount)
1386 TripCount = TripMultiple = TC;
1387
1388 if (!TripCount) {
1389 // If no exact trip count is known, determine the trip multiple of either
1390 // the loop latch or the single exiting block.
1391 // TODO: Relax for multiple exits.
1392 BasicBlock *ExitingBlock = L->getLoopLatch();
1393 if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1394 ExitingBlock = L->getExitingBlock();
1395 if (ExitingBlock)
1396 TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1397 }
1398
1399 // If the loop contains a convergent operation, the prelude we'd add
1400 // to do the first few instructions before we hit the unrolled loop
1401 // is unsafe -- it adds a control-flow dependency to the convergent
1402 // operation. Therefore restrict remainder loop (try unrolling without).
1404
1405 // Try to find the trip count upper bound if we cannot find the exact trip
1406 // count.
1407 unsigned MaxTripCount = 0;
1408 bool MaxOrZero = false;
1409 if (!TripCount) {
1410 MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1411 MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1412 }
1413
1414 // computeUnrollCount() decides whether it is beneficial to use upper bound to
1415 // fully unroll the loop.
1416 computeUnrollCount(L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount,
1417 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
1418 if (!UP.Count) {
1420 << "Not unrolling: no viable strategy found.\n");
1421 if (TM & TM_ForcedByUser) {
1422 ORE.emit([&]() {
1423 return OptimizationRemarkMissed(DEBUG_TYPE, "NoUnrollStrategy",
1424 L->getStartLoc(), L->getHeader())
1425 << "unable to unroll loop: no viable unroll count found";
1426 });
1427 }
1429 }
1430
1432
1433 if (PP.PeelCount) {
1434 assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
1435 LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1436 << " with iteration count " << PP.PeelCount << "!\n");
1437 ORE.emit([&]() {
1438 return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
1439 L->getHeader())
1440 << "peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
1441 << " iterations";
1442 });
1443
1444 ValueToValueMapTy VMap;
1445 peelLoop(L, PP.PeelCount, PP.PeelLast, LI, &SE, DT, &AC, PreserveLCSSA,
1446 VMap);
1447 simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI, L->getBlocks(),
1448 nullptr);
1449 // If the loop was peeled, we already "used up" the profile information
1450 // we had, so we don't want to unroll or peel again.
1452 L->setLoopAlreadyUnrolled();
1454 }
1455
1456 // Do not attempt partial/runtime unrolling in FullLoopUnrolling
1457 if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) ||
1458 UP.Count < TripCount || UP.Count < MaxTripCount)) {
1460 << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1462 }
1463
1464 // At this point, UP.Runtime indicates that run-time unrolling is allowed.
1465 // However, we only want to actually perform it if we don't know the trip
1466 // count and the unroll count doesn't divide the known trip multiple.
1467 // TODO: This decision should probably be pushed up into
1468 // computeUnrollCount().
1469 UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;
1470
1471 // Save loop properties before it is transformed.
1472 MDNode *OrigLoopID = L->getLoopID();
1473 UnrollPragmaInfo PInfo(L);
1474 DebugLoc LoopStartLoc = L->getStartLoc();
1475 BasicBlock *LoopHeader = L->getHeader();
1476
1477 // Unroll the loop.
1478 Loop *RemainderLoop = nullptr;
1480 ULO.Count = UP.Count;
1481 ULO.Force = UP.Force;
1484 ULO.Runtime = UP.Runtime;
1485 ULO.ForgetAllSCEV = ForgetAllSCEV;
1490 LoopUnrollResult UnrollResult = UnrollLoop(
1491 L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
1492 if (UnrollResult == LoopUnrollResult::Unmodified) {
1493 if (PInfo.ExplicitUnroll) {
1495 << "Failed to unroll loop as explicitly requested.\n");
1496 ORE.emit([&]() {
1497 return OptimizationRemarkMissed(DEBUG_TYPE, "FailedToUnrollAsRequested",
1498 LoopStartLoc, LoopHeader)
1499 << "failed to unroll loop as explicitly requested";
1500 });
1501 }
1503 }
1504
1505 if (PInfo.PragmaFullUnroll && ULO.Count != TripCount) {
1506 ORE.emit([&]() {
1507 return OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedFailed",
1508 LoopStartLoc, LoopHeader)
1509 << "unable to fully unroll loop as directed; "
1510 << "unrolled by factor " << ore::NV("UnrollCount", ULO.Count);
1511 });
1512 }
1513 if (PInfo.PragmaCount > 0 && ULO.Count != PInfo.PragmaCount) {
1514 ORE.emit([&]() {
1515 return OptimizationRemarkMissed(DEBUG_TYPE, "UnrollCountDiffers",
1516 LoopStartLoc, LoopHeader)
1517 << "unable to unroll loop with requested count "
1518 << ore::NV("RequestedCount", PInfo.PragmaCount)
1519 << "; unrolled by factor " << ore::NV("UnrollCount", ULO.Count);
1520 });
1521 }
1522
1523 if (RemainderLoop) {
1524 std::optional<MDNode *> RemainderLoopID =
1527 if (RemainderLoopID)
1528 RemainderLoop->setLoopID(*RemainderLoopID);
1529 }
1530
1531 if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1532 std::optional<MDNode *> NewLoopID =
1535 if (NewLoopID) {
1536 L->setLoopID(*NewLoopID);
1537
1538 // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1539 // explicitly.
1540 return UnrollResult;
1541 }
1542 }
1543
1544 // If loop has an unroll count pragma or unrolled by explicitly set count
1545 // mark loop as unrolled to prevent unrolling beyond that requested.
1546 if (UnrollResult != LoopUnrollResult::FullyUnrolled && PInfo.ExplicitUnroll)
1547 L->setLoopAlreadyUnrolled();
1548
1549 return UnrollResult;
1550}
1551
1552namespace {
1553
1554class LoopUnroll : public LoopPass {
1555public:
1556 static char ID; // Pass ID, replacement for typeid
1557
1558 int OptLevel;
1559
1560 /// If false, use a cost model to determine whether unrolling of a loop is
1561 /// profitable. If true, only loops that explicitly request unrolling via
1562 /// metadata are considered. All other loops are skipped.
1563 bool OnlyWhenForced;
1564
1565 /// If false, when SCEV is invalidated, only forget everything in the
1566 /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1567 /// Otherwise, forgetAllLoops and rebuild when needed next.
1568 bool ForgetAllSCEV;
1569
1570 std::optional<unsigned> ProvidedCount;
1571 std::optional<unsigned> ProvidedThreshold;
1572 std::optional<bool> ProvidedAllowPartial;
1573 std::optional<bool> ProvidedRuntime;
1574 std::optional<bool> ProvidedUpperBound;
1575 std::optional<bool> ProvidedAllowPeeling;
1576 std::optional<bool> ProvidedAllowProfileBasedPeeling;
1577 std::optional<unsigned> ProvidedFullUnrollMaxCount;
1578
1579 LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1580 bool ForgetAllSCEV = false,
1581 std::optional<unsigned> Threshold = std::nullopt,
1582 std::optional<unsigned> Count = std::nullopt,
1583 std::optional<bool> AllowPartial = std::nullopt,
1584 std::optional<bool> Runtime = std::nullopt,
1585 std::optional<bool> UpperBound = std::nullopt,
1586 std::optional<bool> AllowPeeling = std::nullopt,
1587 std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1588 std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1589 : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1590 ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1591 ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1592 ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1593 ProvidedAllowPeeling(AllowPeeling),
1594 ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1595 ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1597 }
1598
1599 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1600 if (skipLoop(L))
1601 return false;
1602
1603 Function &F = *L->getHeader()->getParent();
1604
1605 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1606 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1607 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1608 const TargetTransformInfo &TTI =
1609 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1610 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1611 UniformityInfo *UI =
1613 ? &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo()
1614 : nullptr;
1615 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1616 // pass. Function analyses need to be preserved across loop transformations
1617 // but ORE cannot be preserved (see comment before the pass definition).
1618 OptimizationRemarkEmitter ORE(&F);
1619 bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1620
1622 L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1623 /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1624 ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1625 ProvidedUpperBound, ProvidedAllowPeeling,
1626 ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount, UI);
1627
1628 if (Result == LoopUnrollResult::FullyUnrolled)
1629 LPM.markLoopAsDeleted(*L);
1630
1631 return Result != LoopUnrollResult::Unmodified;
1632 }
1633
1634 /// This transformation requires natural loop information & requires that
1635 /// loop preheaders be inserted into the CFG...
1636 void getAnalysisUsage(AnalysisUsage &AU) const override {
1637 AU.addRequired<AssumptionCacheTracker>();
1638 AU.addRequired<TargetTransformInfoWrapperPass>();
1639 AU.addRequired<UniformityInfoWrapperPass>();
1640 // FIXME: Loop passes are required to preserve domtree, and for now we just
1641 // recreate dom info if anything gets unrolled.
1643 }
1644};
1645
1646} // end anonymous namespace
1647
1648char LoopUnroll::ID = 0;
1649
1650INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1655INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1656
1657Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1658 bool ForgetAllSCEV, int Threshold, int Count,
1659 int AllowPartial, int Runtime, int UpperBound,
1660 int AllowPeeling) {
1661 // TODO: It would make more sense for this function to take the optionals
1662 // directly, but that's dangerous since it would silently break out of tree
1663 // callers.
1664 return new LoopUnroll(
1665 OptLevel, OnlyWhenForced, ForgetAllSCEV,
1666 Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1667 Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1668 AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1669 Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
1670 UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1671 AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1672}
1673
1676 LPMUpdater &Updater) {
1677 // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1678 // pass. Function analyses need to be preserved across loop transformations
1679 // but ORE cannot be preserved (see comment before the pass definition).
1680 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1681
1682 // Keep track of the previous loop structure so we can identify new loops
1683 // created by unrolling.
1684 Loop *ParentL = L.getParentLoop();
1685 SmallPtrSet<Loop *, 4> OldLoops;
1686 if (ParentL)
1687 OldLoops.insert_range(*ParentL);
1688 else
1689 OldLoops.insert_range(AR.LI);
1690
1691 std::string LoopName = std::string(L.getName());
1692
1693 bool Changed =
1694 tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
1695 /*BFI*/ nullptr, /*PSI*/ nullptr,
1696 /*PreserveLCSSA*/ true, OptLevel, /*OnlyFullUnroll*/ true,
1697 OnlyWhenForced, ForgetSCEV, /*Count*/ std::nullopt,
1698 /*Threshold*/ std::nullopt, /*AllowPartial*/ false,
1699 /*Runtime*/ false, /*UpperBound*/ false,
1700 /*AllowPeeling*/ true,
1701 /*AllowProfileBasedPeeling*/ false,
1702 /*FullUnrollMaxCount*/ std::nullopt) !=
1704 if (!Changed)
1705 return PreservedAnalyses::all();
1706
1707 // The parent must not be damaged by unrolling!
1708#ifndef NDEBUG
1709 if (ParentL)
1710 ParentL->verifyLoop();
1711#endif
1712
1713 // Unrolling can do several things to introduce new loops into a loop nest:
1714 // - Full unrolling clones child loops within the current loop but then
1715 // removes the current loop making all of the children appear to be new
1716 // sibling loops.
1717 //
1718 // When a new loop appears as a sibling loop after fully unrolling,
1719 // its nesting structure has fundamentally changed and we want to revisit
1720 // it to reflect that.
1721 //
1722 // When unrolling has removed the current loop, we need to tell the
1723 // infrastructure that it is gone.
1724 //
1725 // Finally, we support a debugging/testing mode where we revisit child loops
1726 // as well. These are not expected to require further optimizations as either
1727 // they or the loop they were cloned from have been directly visited already.
1728 // But the debugging mode allows us to check this assumption.
1729 bool IsCurrentLoopValid = false;
1730 SmallVector<Loop *, 4> SibLoops;
1731 if (ParentL)
1732 SibLoops.append(ParentL->begin(), ParentL->end());
1733 else
1734 SibLoops.append(AR.LI.begin(), AR.LI.end());
1735 erase_if(SibLoops, [&](Loop *SibLoop) {
1736 if (SibLoop == &L) {
1737 IsCurrentLoopValid = true;
1738 return true;
1739 }
1740
1741 // Otherwise erase the loop from the list if it was in the old loops.
1742 return OldLoops.contains(SibLoop);
1743 });
1744 Updater.addSiblingLoops(SibLoops);
1745
1746 if (!IsCurrentLoopValid) {
1747 Updater.markLoopAsDeleted(L, LoopName);
1748 } else {
1749 // We can only walk child loops if the current loop remained valid.
1751 // Walk *all* of the child loops.
1752 SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1753 Updater.addChildLoops(ChildLoops);
1754 }
1755 }
1756
1758}
1759
1762 auto &LI = AM.getResult<LoopAnalysis>(F);
1763 // There are no loops in the function. Return before computing other expensive
1764 // analyses.
1765 if (LI.empty())
1766 return PreservedAnalyses::all();
1767 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1768 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1769 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1770 auto &AC = AM.getResult<AssumptionAnalysis>(F);
1773
1774 UniformityInfo *UI = TTI.hasBranchDivergence(&F)
1776 : nullptr;
1777
1778 LoopAnalysisManager *LAM = nullptr;
1779 if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1780 LAM = &LAMProxy->getManager();
1781
1782 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1783 ProfileSummaryInfo *PSI =
1784 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1785 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1786 &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1787
1788 bool Changed = false;
1789
1790 // The unroller requires loops to be in simplified form, and also needs LCSSA.
1791 // Since simplification may add new inner loops, it has to run before the
1792 // legality and profitability checks. This means running the loop unroller
1793 // will simplify all loops, regardless of whether anything end up being
1794 // unrolled.
1795 for (const auto &L : LI) {
1796 Changed |=
1797 simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1798 Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1799 }
1800
1801 // Add the loop nests in the reverse order of LoopInfo. See method
1802 // declaration.
1804 appendLoopsToWorklist(LI, Worklist);
1805
1806 while (!Worklist.empty()) {
1807 // Because the LoopInfo stores the loops in RPO, we walk the worklist
1808 // from back to front so that we work forward across the CFG, which
1809 // for unrolling is only needed to get optimization remarks emitted in
1810 // a forward order.
1811 Loop &L = *Worklist.pop_back_val();
1812#ifndef NDEBUG
1813 Loop *ParentL = L.getParentLoop();
1814#endif
1815
1816 // Check if the profile summary indicates that the profiled application
1817 // has a huge working set size, in which case we disable peeling to avoid
1818 // bloating it further.
1819 std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1820 if (PSI && PSI->hasHugeWorkingSetSize())
1821 LocalAllowPeeling = false;
1822 std::string LoopName = std::string(L.getName());
1823 // The API here is quite complex to call and we allow to select some
1824 // flavors of unrolling during construction time (by setting UnrollOpts).
1826 &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1827 /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*OnlyFullUnroll*/ false,
1828 UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV,
1829 /*Count*/ std::nullopt,
1830 /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
1831 UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1832 UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount, UI,
1833 &AA);
1835
1836 // The parent must not be damaged by unrolling!
1837#ifndef NDEBUG
1838 if (Result != LoopUnrollResult::Unmodified && ParentL)
1839 ParentL->verifyLoop();
1840#endif
1841
1842 // Clear any cached analysis results for L if we removed it completely.
1843 if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1844 LAM->clear(L, LoopName);
1845 }
1846
1847 if (!Changed)
1848 return PreservedAnalyses::all();
1849
1851}
1852
1854 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1855 static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
1856 OS, MapClassName2PassName);
1857 OS << '<';
1858 if (UnrollOpts.AllowPartial != std::nullopt)
1859 OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1860 if (UnrollOpts.AllowPeeling != std::nullopt)
1861 OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1862 if (UnrollOpts.AllowRuntime != std::nullopt)
1863 OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1864 if (UnrollOpts.AllowUpperBound != std::nullopt)
1865 OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1866 if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1867 OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1868 << "profile-peeling;";
1869 if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1870 OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';
1871 OS << 'O' << UnrollOpts.OptLevel;
1872 OS << '>';
1873}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
#define DEBUG_TYPE
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
This header provides classes for managing per-loop analyses.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
static cl::opt< unsigned > UnrollMaxCount("unroll-max-count", cl::Hidden, cl::desc("Set the max unroll count for partial and runtime unrolling, for" "testing purposes"))
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
static cl::opt< unsigned > UnrollThresholdDefault("unroll-threshold-default", cl::init(150), cl::Hidden, cl::desc("Default threshold (max size of unrolled " "loop), used in all but O3 optimizations"))
static cl::opt< unsigned > FlatLoopTripCountThreshold("flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, cl::desc("If the runtime tripcount for the loop is lower than the " "threshold, the loop is considered as flat and will be less " "aggressively unrolled."))
static cl::opt< unsigned > UnrollOptSizeThreshold("unroll-optsize-threshold", cl::init(0), cl::Hidden, cl::desc("The cost threshold for loop unrolling when optimizing for " "size"))
static bool hasUnrollFullPragma(const Loop *L)
static bool isSCEVUniform(const SCEV *S, UniformityInfo &UI)
Returns true if the SCEV expression is uniform, i.e., all threads in a convergent execution agree on ...
static cl::opt< bool > UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled."))
static unsigned unrollCountPragmaValue(const Loop *L)
static bool hasUnrollEnablePragma(const Loop *L)
static cl::opt< unsigned > PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 *1024), cl::Hidden, cl::desc("Unrolled size limit for loops with unroll metadata " "(full, enable, or count)."))
static cl::opt< unsigned > UnrollFullMaxCount("unroll-full-max-count", cl::Hidden, cl::desc("Set the max unroll count for full unrolling, for testing purposes"))
static cl::opt< unsigned > UnrollMaxUpperBound("unroll-max-upperbound", cl::init(8), cl::Hidden, cl::desc("The max of trip count upper bound that is considered in unrolling"))
static std::optional< unsigned > shouldPragmaUnroll(Loop *L, const UnrollPragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, unsigned MaxTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
static std::optional< unsigned > shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static std::optional< EstimatedUnrollCost > analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize, unsigned MaxIterationsCountToAnalyze)
Figure out if the loop is worth full unrolling.
static LoopUnrollResult tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV, std::optional< unsigned > ProvidedCount, std::optional< unsigned > ProvidedThreshold, std::optional< bool > ProvidedAllowPartial, std::optional< bool > ProvidedRuntime, std::optional< bool > ProvidedUpperBound, std::optional< bool > ProvidedAllowPeeling, std::optional< bool > ProvidedAllowProfileBasedPeeling, std::optional< unsigned > ProvidedFullUnrollMaxCount, UniformityInfo *UI=nullptr, AAResults *AA=nullptr)
static cl::opt< unsigned > UnrollPartialThreshold("unroll-partial-threshold", cl::Hidden, cl::desc("The cost threshold for partial loop unrolling"))
static cl::opt< bool > UnrollAllowRemainder("unroll-allow-remainder", cl::Hidden, cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop."))
static std::optional< unsigned > shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< unsigned > PragmaUnrollFullMaxIterations("pragma-unroll-full-max-iterations", cl::init(1 '000 '000), cl::Hidden, cl::desc("Maximum allowed iterations to unroll under pragma unroll full."))
static const unsigned NoThreshold
A magic value for use with the Threshold parameter to indicate that the loop unroll should be perform...
static cl::opt< bool > UnrollRevisitChildLoops("unroll-revisit-child-loops", cl::Hidden, cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " "This shouldn't typically be needed as child loops (or their " "clones) were already visited."))
static cl::opt< unsigned > UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling"))
static cl::opt< bool > UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts"))
static bool hasRuntimeUnrollDisablePragma(const Loop *L)
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost)
static cl::opt< unsigned > UnrollThresholdAggressive("unroll-threshold-aggressive", cl::init(300), cl::Hidden, cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " "optimizations"))
static cl::opt< unsigned > UnrollMaxIterationsCountToAnalyze("unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of " "iterations when checking full unroll profitability"))
static cl::opt< unsigned > UnrollMaxPercentThresholdBoost("unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " "to the threshold when aggressively unrolling a loop due to the " "dynamic cost savings. If completely unrolling a loop will reduce " "the total runtime from X to Y, we boost the loop unroll " "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " "X/Y). This limit avoids excessive code bloat."))
static cl::opt< bool > UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Trace Metrics
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
This file contains the declarations for metadata subclasses.
LoopAnalysisManager LAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
This pass exposes codegen information to IR-level passes.
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
A manager for alias analyses.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addRequired()
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
This is an important base class in LLVM.
Definition Constant.h:43
A debug info location.
Definition DebugLoc.h:124
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:221
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
Analysis pass which computes a DominatorTree.
Definition Dominators.h:274
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:155
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
void addChildLoops(ArrayRef< Loop * > NewChildLoops)
Loop passes should use this method to indicate they have added new child loops of the current loop.
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
void addSiblingLoops(ArrayRef< Loop * > NewSibLoops)
Loop passes should use this method to indicate they have added new sibling loops to the current loop.
void markLoopAsDeleted(Loop &L)
Definition LoopPass.cpp:111
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
void verifyLoop() const
Verify loop structure.
iterator end() const
iterator begin() const
LLVM_ABI PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
iterator end() const
iterator begin() const
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
LLVM_ABI void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition LoopInfo.cpp:547
Metadata node.
Definition Metadata.h:1075
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1439
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1445
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
bool empty() const
Determine if the PriorityWorklist is empty or not.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
This class represents an analyzed expression in the program.
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isBackedgeTakenCountMaxOrZero(const Loop *L)
Return true if the backedge taken count is either the value returned by getConstantMaxBackedgeTakenCo...
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
value_type pop_back_val()
Definition SetVector.h:279
A version of PriorityWorklist that selects small size optimized data structures for the vector and ma...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Analysis pass which computes UniformityInfo.
Legacy analysis pass which computes a CycleInfo.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
ConvergenceKind Convergence
Definition UnrollLoop.h:157
LLVM_ABI uint64_t getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite=0) const
Returns loop size estimation for unrolled loop, given the unrolling configuration specified by UP.
LLVM_ABI bool canUnroll(OptimizationRemarkEmitter *ORE=nullptr, const Loop *L=nullptr) const
Whether it is legal to unroll this loop.
LLVM_ABI UnrollCostEstimator(const Loop *L, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &EphValues, unsigned BEInsns, bool TripCountIsUniform=false)
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:173
void visit(Iterator Start, Iterator End)
Definition InstVisitor.h:87
LLVM Value Representation.
Definition Value.h:75
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
iterator find(const_arg_type_t< ValueT > V)
Definition DenseSet.h:177
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
Changed
Abstract Attribute helper functions.
Definition Attributor.h:165
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
GenericUniformityInfo< SSAContext > UniformityInfo
LLVM_ABI std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Return either:
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
OuterAnalysisManagerProxy< ModuleAnalysisManager, Function > ModuleAnalysisManagerFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
LLVM_ABI bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition LCSSA.cpp:449
LLVM_ABI std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
LLVM_ABI bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
LLVM_ABI char & LCSSAID
Definition LCSSA.cpp:526
LLVM_ABI void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI, ArrayRef< BasicBlock * > Blocks, AAResults *AA=nullptr)
Perform some cleanup and simplifications on loops after unrolling.
LLVM_ABI Pass * createLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false, int Threshold=-1, int Count=-1, int AllowPartial=-1, int Runtime=-1, int UpperBound=-1, int AllowPeeling=-1)
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2025
LLVM_ABI void initializeLoopUnrollPass(PassRegistry &)
LLVM_ABI TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI CallBase * getLoopConvergenceHeart(const Loop *TheLoop)
Find the convergence heart of the loop.
LLVM_ABI TransformationMode hasUnrollAndJamTransformation(const Loop *L)
LLVM_ABI cl::opt< bool > ForgetSCEVInLoopUnroll
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache *AC=nullptr, unsigned Threshold=UINT_MAX)
Definition LoopPeel.cpp:753
LLVM_TEMPLATE_ABI void appendLoopsToWorklist(RangeT &&, SmallPriorityWorklist< Loop *, 4 > &)
Utility that implements appending of loops onto a worklist given a range.
LLVM_ABI cl::opt< unsigned > SCEVCheapExpansionBudget
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI TransformationMode hasUnrollTransformation(const Loop *L)
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition UnrollLoop.h:58
@ PartiallyUnrolled
The loop was partially unrolled – we still have a loop, but with a smaller trip count.
Definition UnrollLoop.h:65
@ Unmodified
The loop was not modified.
Definition UnrollLoop.h:60
@ FullyUnrolled
The loop was fully unrolled into straight-line code.
Definition UnrollLoop.h:69
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
LLVM_ABI void peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &VMap)
VMap is the value-map that maps instructions from the original loop to instructions in the last peele...
const char *const LLVMLoopUnrollFollowupAll
Definition UnrollLoop.h:45
TargetTransformInfo TTI
TransformationMode
The mode sets how eager a transformation should be applied.
Definition LoopUtils.h:283
@ TM_ForcedByUser
The transformation was directed by the user, e.g.
Definition LoopUtils.h:300
@ TM_Disable
The transformation should not be applied.
Definition LoopUtils.h:292
@ TM_Enable
The transformation should be applied without considering a cost model.
Definition LoopUtils.h:289
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
LLVM_ABI MDNode * getUnrollMetadataForLoop(const Loop *L, StringRef Name)
DWARFExpression::Operation Op
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1916
const char *const LLVMLoopUnrollFollowupRemainder
Definition UnrollLoop.h:48
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const char *const LLVMLoopUnrollFollowupUnrolled
Definition UnrollLoop.h:46
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const llvm::TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop=nullptr, AAResults *AA=nullptr)
Unroll the given loop by Count.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition CodeMetrics.h:34
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition PassManager.h:89
bool PeelLast
Peel off the last PeelCount loop iterations.
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodied of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned MaxPercentThresholdBoost
If complete unrolling will reduce the cost of the loop, we will boost the Threshold by a certain perc...
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
bool AllowRemainder
Allow generation of a loop remainder (extra iterations after unroll).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
unsigned MaxUpperBound
Set the maximum upper bound of trip count.
const Instruction * Heart
Definition UnrollLoop.h:79
const bool PragmaFullUnroll
Definition UnrollLoop.h:131
LLVM_ABI UnrollPragmaInfo(const Loop *L)
const unsigned PragmaCount
Definition UnrollLoop.h:132
const bool ExplicitUnroll
Definition UnrollLoop.h:135
const bool PragmaRuntimeUnrollDisable
Definition UnrollLoop.h:134
const bool UserUnrollCount
Definition UnrollLoop.h:130
const bool PragmaEnableUnroll
Definition UnrollLoop.h:133