LLVM 17.0.0git
LoopUnrollPass.cpp
Go to the documentation of this file.
1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller. It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
11// counts of loops easily.
12//===----------------------------------------------------------------------===//
13
15#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/SetVector.h"
22#include "llvm/ADT/StringRef.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/CFG.h"
36#include "llvm/IR/Constant.h"
37#include "llvm/IR/Constants.h"
39#include "llvm/IR/Dominators.h"
40#include "llvm/IR/Function.h"
41#include "llvm/IR/Instruction.h"
44#include "llvm/IR/Metadata.h"
45#include "llvm/IR/PassManager.h"
47#include "llvm/Pass.h"
50#include "llvm/Support/Debug.h"
61#include <algorithm>
62#include <cassert>
63#include <cstdint>
64#include <limits>
65#include <optional>
66#include <string>
67#include <tuple>
68#include <utility>
69
70using namespace llvm;
71
72#define DEBUG_TYPE "loop-unroll"
73
75 "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
76 cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
77 " the current top-most loop. This is sometimes preferred to reduce"
78 " compile time."));
79
81 UnrollThreshold("unroll-threshold", cl::Hidden,
82 cl::desc("The cost threshold for loop unrolling"));
83
86 "unroll-optsize-threshold", cl::init(0), cl::Hidden,
87 cl::desc("The cost threshold for loop unrolling when optimizing for "
88 "size"));
89
91 "unroll-partial-threshold", cl::Hidden,
92 cl::desc("The cost threshold for partial loop unrolling"));
93
95 "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
96 cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
97 "to the threshold when aggressively unrolling a loop due to the "
98 "dynamic cost savings. If completely unrolling a loop will reduce "
99 "the total runtime from X to Y, we boost the loop unroll "
100 "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
101 "X/Y). This limit avoids excessive code bloat."));
102
104 "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
105 cl::desc("Don't allow loop unrolling to simulate more than this number of"
106 "iterations when checking full unroll profitability"));
107
109 "unroll-count", cl::Hidden,
110 cl::desc("Use this unroll count for all loops including those with "
111 "unroll_count pragma values, for testing purposes"));
112
114 "unroll-max-count", cl::Hidden,
115 cl::desc("Set the max unroll count for partial and runtime unrolling, for"
116 "testing purposes"));
117
119 "unroll-full-max-count", cl::Hidden,
120 cl::desc(
121 "Set the max unroll count for full unrolling, for testing purposes"));
122
123static cl::opt<bool>
124 UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
125 cl::desc("Allows loops to be partially unrolled until "
126 "-unroll-threshold loop size is reached."));
127
129 "unroll-allow-remainder", cl::Hidden,
130 cl::desc("Allow generation of a loop remainder (extra iterations) "
131 "when unrolling a loop."));
132
133static cl::opt<bool>
134 UnrollRuntime("unroll-runtime", cl::Hidden,
135 cl::desc("Unroll loops with run-time trip counts"));
136
138 "unroll-max-upperbound", cl::init(8), cl::Hidden,
139 cl::desc(
140 "The max of trip count upper bound that is considered in unrolling"));
141
143 "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
144 cl::desc("Unrolled size limit for loops with an unroll(full) or "
145 "unroll_count pragma."));
146
148 "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
149 cl::desc("If the runtime tripcount for the loop is lower than the "
150 "threshold, the loop is considered as flat and will be less "
151 "aggressively unrolled."));
152
154 "unroll-remainder", cl::Hidden,
155 cl::desc("Allow the loop remainder to be unrolled."));
156
157// This option isn't ever intended to be enabled, it serves to allow
158// experiments to check the assumptions about when this kind of revisit is
159// necessary.
161 "unroll-revisit-child-loops", cl::Hidden,
162 cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
163 "This shouldn't typically be needed as child loops (or their "
164 "clones) were already visited."));
165
167 "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
168 cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
169 "optimizations"));
171 UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
173 cl::desc("Default threshold (max size of unrolled "
174 "loop), used in all but O3 optimizations"));
175
176/// A magic value for use with the Threshold parameter to indicate
177/// that the loop unroll should be performed regardless of how much
178/// code expansion would result.
179static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
180
181/// Gather the various unrolling parameters based on the defaults, compiler
182/// flags, TTI overrides and user specified parameters.
186 OptimizationRemarkEmitter &ORE, int OptLevel,
187 std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
188 std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
189 std::optional<bool> UserUpperBound,
190 std::optional<unsigned> UserFullUnrollMaxCount) {
192
193 // Set up the defaults
194 UP.Threshold =
198 UP.PartialThreshold = 150;
200 UP.Count = 0;
202 UP.MaxCount = std::numeric_limits<unsigned>::max();
203 UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
204 UP.BEInsns = 2;
205 UP.Partial = false;
206 UP.Runtime = false;
207 UP.AllowRemainder = true;
208 UP.UnrollRemainder = false;
209 UP.AllowExpensiveTripCount = false;
210 UP.Force = false;
211 UP.UpperBound = false;
212 UP.UnrollAndJam = false;
215
216 // Override with any target specific settings
217 TTI.getUnrollingPreferences(L, SE, UP, &ORE);
218
219 // Apply size attributes
220 bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
221 // Let unroll hints / pragmas take precedence over PGSO.
223 llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
224 PGSOQueryType::IRPass));
225 if (OptForSize) {
229 }
230
231 // Apply any user values specified by cl::opt
232 if (UnrollThreshold.getNumOccurrences() > 0)
234 if (UnrollPartialThreshold.getNumOccurrences() > 0)
236 if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
238 if (UnrollMaxCount.getNumOccurrences() > 0)
240 if (UnrollFullMaxCount.getNumOccurrences() > 0)
242 if (UnrollAllowPartial.getNumOccurrences() > 0)
244 if (UnrollAllowRemainder.getNumOccurrences() > 0)
246 if (UnrollRuntime.getNumOccurrences() > 0)
249 UP.UpperBound = false;
250 if (UnrollUnrollRemainder.getNumOccurrences() > 0)
252 if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
254
255 // Apply user values provided by argument
256 if (UserThreshold) {
257 UP.Threshold = *UserThreshold;
258 UP.PartialThreshold = *UserThreshold;
259 }
260 if (UserCount)
261 UP.Count = *UserCount;
262 if (UserAllowPartial)
263 UP.Partial = *UserAllowPartial;
264 if (UserRuntime)
265 UP.Runtime = *UserRuntime;
266 if (UserUpperBound)
267 UP.UpperBound = *UserUpperBound;
268 if (UserFullUnrollMaxCount)
269 UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
270
271 return UP;
272}
273
274namespace {
275
276/// A struct to densely store the state of an instruction after unrolling at
277/// each iteration.
278///
279/// This is designed to work like a tuple of <Instruction *, int> for the
280/// purposes of hashing and lookup, but to be able to associate two boolean
281/// states with each key.
282struct UnrolledInstState {
283 Instruction *I;
284 int Iteration : 30;
285 unsigned IsFree : 1;
286 unsigned IsCounted : 1;
287};
288
289/// Hashing and equality testing for a set of the instruction states.
290struct UnrolledInstStateKeyInfo {
291 using PtrInfo = DenseMapInfo<Instruction *>;
293
294 static inline UnrolledInstState getEmptyKey() {
295 return {PtrInfo::getEmptyKey(), 0, 0, 0};
296 }
297
298 static inline UnrolledInstState getTombstoneKey() {
299 return {PtrInfo::getTombstoneKey(), 0, 0, 0};
300 }
301
302 static inline unsigned getHashValue(const UnrolledInstState &S) {
303 return PairInfo::getHashValue({S.I, S.Iteration});
304 }
305
306 static inline bool isEqual(const UnrolledInstState &LHS,
307 const UnrolledInstState &RHS) {
308 return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
309 }
310};
311
312struct EstimatedUnrollCost {
313 /// The estimated cost after unrolling.
314 unsigned UnrolledCost;
315
316 /// The estimated dynamic cost of executing the instructions in the
317 /// rolled form.
318 unsigned RolledDynamicCost;
319};
320
321struct PragmaInfo {
322 PragmaInfo(bool UUC, bool PFU, unsigned PC, bool PEU)
323 : UserUnrollCount(UUC), PragmaFullUnroll(PFU), PragmaCount(PC),
324 PragmaEnableUnroll(PEU) {}
325 const bool UserUnrollCount;
326 const bool PragmaFullUnroll;
327 const unsigned PragmaCount;
328 const bool PragmaEnableUnroll;
329};
330
331} // end anonymous namespace
332
333/// Figure out if the loop is worth full unrolling.
334///
335/// Complete loop unrolling can make some loads constant, and we need to know
336/// if that would expose any further optimization opportunities. This routine
337/// estimates this optimization. It computes cost of unrolled loop
338/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
339/// dynamic cost we mean that we won't count costs of blocks that are known not
340/// to be executed (i.e. if we have a branch in the loop and we know that at the
341/// given iteration its condition would be resolved to true, we won't add up the
342/// cost of the 'false'-block).
343/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
344/// the analysis failed (no benefits expected from the unrolling, or the loop is
345/// too big to analyze), the returned value is std::nullopt.
346static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
347 const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
348 const SmallPtrSetImpl<const Value *> &EphValues,
349 const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
350 unsigned MaxIterationsCountToAnalyze) {
351 // We want to be able to scale offsets by the trip count and add more offsets
352 // to them without checking for overflows, and we already don't want to
353 // analyze *massive* trip counts, so we force the max to be reasonably small.
354 assert(MaxIterationsCountToAnalyze <
355 (unsigned)(std::numeric_limits<int>::max() / 2) &&
356 "The unroll iterations max is too large!");
357
358 // Only analyze inner loops. We can't properly estimate cost of nested loops
359 // and we won't visit inner loops again anyway.
360 if (!L->isInnermost())
361 return std::nullopt;
362
363 // Don't simulate loops with a big or unknown tripcount
364 if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
365 return std::nullopt;
366
369 DenseMap<Value *, Value *> SimplifiedValues;
370 SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;
371
372 // The estimated cost of the unrolled form of the loop. We try to estimate
373 // this by simplifying as much as we can while computing the estimate.
374 InstructionCost UnrolledCost = 0;
375
376 // We also track the estimated dynamic (that is, actually executed) cost in
377 // the rolled form. This helps identify cases when the savings from unrolling
378 // aren't just exposing dead control flows, but actual reduced dynamic
379 // instructions due to the simplifications which we expect to occur after
380 // unrolling.
381 InstructionCost RolledDynamicCost = 0;
382
383 // We track the simplification of each instruction in each iteration. We use
384 // this to recursively merge costs into the unrolled cost on-demand so that
385 // we don't count the cost of any dead code. This is essentially a map from
386 // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
388
389 // A small worklist used to accumulate cost of instructions from each
390 // observable and reached root in the loop.
392
393 // PHI-used worklist used between iterations while accumulating cost.
395
396 // Helper function to accumulate cost for instructions in the loop.
397 auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
398 assert(Iteration >= 0 && "Cannot have a negative iteration!");
399 assert(CostWorklist.empty() && "Must start with an empty cost list");
400 assert(PHIUsedList.empty() && "Must start with an empty phi used list");
401 CostWorklist.push_back(&RootI);
403 RootI.getFunction()->hasMinSize() ?
406 for (;; --Iteration) {
407 do {
408 Instruction *I = CostWorklist.pop_back_val();
409
410 // InstCostMap only uses I and Iteration as a key, the other two values
411 // don't matter here.
412 auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
413 if (CostIter == InstCostMap.end())
414 // If an input to a PHI node comes from a dead path through the loop
415 // we may have no cost data for it here. What that actually means is
416 // that it is free.
417 continue;
418 auto &Cost = *CostIter;
419 if (Cost.IsCounted)
420 // Already counted this instruction.
421 continue;
422
423 // Mark that we are counting the cost of this instruction now.
424 Cost.IsCounted = true;
425
426 // If this is a PHI node in the loop header, just add it to the PHI set.
427 if (auto *PhiI = dyn_cast<PHINode>(I))
428 if (PhiI->getParent() == L->getHeader()) {
429 assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
430 "inherently simplify during unrolling.");
431 if (Iteration == 0)
432 continue;
433
434 // Push the incoming value from the backedge into the PHI used list
435 // if it is an in-loop instruction. We'll use this to populate the
436 // cost worklist for the next iteration (as we count backwards).
437 if (auto *OpI = dyn_cast<Instruction>(
438 PhiI->getIncomingValueForBlock(L->getLoopLatch())))
439 if (L->contains(OpI))
440 PHIUsedList.push_back(OpI);
441 continue;
442 }
443
444 // First accumulate the cost of this instruction.
445 if (!Cost.IsFree) {
446 UnrolledCost += TTI.getInstructionCost(I, CostKind);
447 LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
448 << Iteration << "): ");
449 LLVM_DEBUG(I->dump());
450 }
451
452 // We must count the cost of every operand which is not free,
453 // recursively. If we reach a loop PHI node, simply add it to the set
454 // to be considered on the next iteration (backwards!).
455 for (Value *Op : I->operands()) {
456 // Check whether this operand is free due to being a constant or
457 // outside the loop.
458 auto *OpI = dyn_cast<Instruction>(Op);
459 if (!OpI || !L->contains(OpI))
460 continue;
461
462 // Otherwise accumulate its cost.
463 CostWorklist.push_back(OpI);
464 }
465 } while (!CostWorklist.empty());
466
467 if (PHIUsedList.empty())
468 // We've exhausted the search.
469 break;
470
471 assert(Iteration > 0 &&
472 "Cannot track PHI-used values past the first iteration!");
473 CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
474 PHIUsedList.clear();
475 }
476 };
477
478 // Ensure that we don't violate the loop structure invariants relied on by
479 // this analysis.
480 assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
481 assert(L->isLCSSAForm(DT) &&
482 "Must have loops in LCSSA form to track live-out values.");
483
484 LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
485
487 L->getHeader()->getParent()->hasMinSize() ?
489 // Simulate execution of each iteration of the loop counting instructions,
490 // which would be simplified.
491 // Since the same load will take different values on different iterations,
492 // we literally have to go through all loop's iterations.
493 for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
494 LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
495
496 // Prepare for the iteration by collecting any simplified entry or backedge
497 // inputs.
498 for (Instruction &I : *L->getHeader()) {
499 auto *PHI = dyn_cast<PHINode>(&I);
500 if (!PHI)
501 break;
502
503 // The loop header PHI nodes must have exactly two input: one from the
504 // loop preheader and one from the loop latch.
505 assert(
506 PHI->getNumIncomingValues() == 2 &&
507 "Must have an incoming value only for the preheader and the latch.");
508
509 Value *V = PHI->getIncomingValueForBlock(
510 Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
511 if (Iteration != 0 && SimplifiedValues.count(V))
512 V = SimplifiedValues.lookup(V);
513 SimplifiedInputValues.push_back({PHI, V});
514 }
515
516 // Now clear and re-populate the map for the next iteration.
517 SimplifiedValues.clear();
518 while (!SimplifiedInputValues.empty())
519 SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
520
521 UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
522
523 BBWorklist.clear();
524 BBWorklist.insert(L->getHeader());
525 // Note that we *must not* cache the size, this loop grows the worklist.
526 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
527 BasicBlock *BB = BBWorklist[Idx];
528
529 // Visit all instructions in the given basic block and try to simplify
530 // it. We don't change the actual IR, just count optimization
531 // opportunities.
532 for (Instruction &I : *BB) {
533 // These won't get into the final code - don't even try calculating the
534 // cost for them.
535 if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
536 continue;
537
538 // Track this instruction's expected baseline cost when executing the
539 // rolled loop form.
540 RolledDynamicCost += TTI.getInstructionCost(&I, CostKind);
541
542 // Visit the instruction to analyze its loop cost after unrolling,
543 // and if the visitor returns true, mark the instruction as free after
544 // unrolling and continue.
545 bool IsFree = Analyzer.visit(I);
546 bool Inserted = InstCostMap.insert({&I, (int)Iteration,
547 (unsigned)IsFree,
548 /*IsCounted*/ false}).second;
549 (void)Inserted;
550 assert(Inserted && "Cannot have a state for an unvisited instruction!");
551
552 if (IsFree)
553 continue;
554
555 // Can't properly model a cost of a call.
556 // FIXME: With a proper cost model we should be able to do it.
557 if (auto *CI = dyn_cast<CallInst>(&I)) {
558 const Function *Callee = CI->getCalledFunction();
559 if (!Callee || TTI.isLoweredToCall(Callee)) {
560 LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
561 return std::nullopt;
562 }
563 }
564
565 // If the instruction might have a side-effect recursively account for
566 // the cost of it and all the instructions leading up to it.
567 if (I.mayHaveSideEffects())
568 AddCostRecursively(I, Iteration);
569
570 // If unrolled body turns out to be too big, bail out.
571 if (UnrolledCost > MaxUnrolledLoopSize) {
572 LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
573 << " UnrolledCost: " << UnrolledCost
574 << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
575 << "\n");
576 return std::nullopt;
577 }
578 }
579
580 Instruction *TI = BB->getTerminator();
581
582 auto getSimplifiedConstant = [&](Value *V) -> Constant * {
583 if (SimplifiedValues.count(V))
584 V = SimplifiedValues.lookup(V);
585 return dyn_cast<Constant>(V);
586 };
587
588 // Add in the live successors by first checking whether we have terminator
589 // that may be simplified based on the values simplified by this call.
590 BasicBlock *KnownSucc = nullptr;
591 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
592 if (BI->isConditional()) {
593 if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
594 // Just take the first successor if condition is undef
595 if (isa<UndefValue>(SimpleCond))
596 KnownSucc = BI->getSuccessor(0);
597 else if (ConstantInt *SimpleCondVal =
598 dyn_cast<ConstantInt>(SimpleCond))
599 KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
600 }
601 }
602 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
603 if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
604 // Just take the first successor if condition is undef
605 if (isa<UndefValue>(SimpleCond))
606 KnownSucc = SI->getSuccessor(0);
607 else if (ConstantInt *SimpleCondVal =
608 dyn_cast<ConstantInt>(SimpleCond))
609 KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
610 }
611 }
612 if (KnownSucc) {
613 if (L->contains(KnownSucc))
614 BBWorklist.insert(KnownSucc);
615 else
616 ExitWorklist.insert({BB, KnownSucc});
617 continue;
618 }
619
620 // Add BB's successors to the worklist.
621 for (BasicBlock *Succ : successors(BB))
622 if (L->contains(Succ))
623 BBWorklist.insert(Succ);
624 else
625 ExitWorklist.insert({BB, Succ});
626 AddCostRecursively(*TI, Iteration);
627 }
628
629 // If we found no optimization opportunities on the first iteration, we
630 // won't find them on later ones too.
631 if (UnrolledCost == RolledDynamicCost) {
632 LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
633 << " UnrolledCost: " << UnrolledCost << "\n");
634 return std::nullopt;
635 }
636 }
637
638 while (!ExitWorklist.empty()) {
639 BasicBlock *ExitingBB, *ExitBB;
640 std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
641
642 for (Instruction &I : *ExitBB) {
643 auto *PN = dyn_cast<PHINode>(&I);
644 if (!PN)
645 break;
646
647 Value *Op = PN->getIncomingValueForBlock(ExitingBB);
648 if (auto *OpI = dyn_cast<Instruction>(Op))
649 if (L->contains(OpI))
650 AddCostRecursively(*OpI, TripCount - 1);
651 }
652 }
653
654 assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
655 "All instructions must have a valid cost, whether the "
656 "loop is rolled or unrolled.");
657
658 LLVM_DEBUG(dbgs() << "Analysis finished:\n"
659 << "UnrolledCost: " << UnrolledCost << ", "
660 << "RolledDynamicCost: " << RolledDynamicCost << "\n");
661 return {{unsigned(*UnrolledCost.getValue()),
662 unsigned(*RolledDynamicCost.getValue())}};
663}
664
665/// ApproximateLoopSize - Approximate the size of the loop.
667 const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
669 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
671 for (BasicBlock *BB : L->blocks())
672 Metrics.analyzeBasicBlock(BB, TTI, EphValues);
673 NumCalls = Metrics.NumInlineCandidates;
674 NotDuplicatable = Metrics.notDuplicatable;
675 Convergent = Metrics.convergent;
676
677 InstructionCost LoopSize = Metrics.NumInsts;
678
679 // Don't allow an estimate of size zero. This would allows unrolling of loops
680 // with huge iteration counts, which is a compile time problem even if it's
681 // not a problem for code quality. Also, the code using this size may assume
682 // that each loop has at least three instructions (likely a conditional
683 // branch, a comparison feeding that branch, and some kind of loop increment
684 // feeding that comparison instruction).
685 if (LoopSize.isValid() && LoopSize < BEInsns + 1)
686 // This is an open coded max() on InstructionCost
687 LoopSize = BEInsns + 1;
688
689 return LoopSize;
690}
691
692// Returns the loop hint metadata node with the given name (for example,
693// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
694// returned.
696 if (MDNode *LoopID = L->getLoopID())
697 return GetUnrollMetadata(LoopID, Name);
698 return nullptr;
699}
700
701// Returns true if the loop has an unroll(full) pragma.
702static bool hasUnrollFullPragma(const Loop *L) {
703 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
704}
705
706// Returns true if the loop has an unroll(enable) pragma. This metadata is used
707// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
708static bool hasUnrollEnablePragma(const Loop *L) {
709 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
710}
711
712// Returns true if the loop has an runtime unroll(disable) pragma.
713static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
714 return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
715}
716
717// If loop has an unroll_count pragma return the (necessarily
718// positive) value from the pragma. Otherwise return 0.
719static unsigned unrollCountPragmaValue(const Loop *L) {
720 MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
721 if (MD) {
722 assert(MD->getNumOperands() == 2 &&
723 "Unroll count hint metadata should have two operands.");
724 unsigned Count =
725 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
726 assert(Count >= 1 && "Unroll count must be positive.");
727 return Count;
728 }
729 return 0;
730}
731
732// Computes the boosting factor for complete unrolling.
733// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
734// be beneficial to fully unroll the loop even if unrolledcost is large. We
735// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
736// the unroll threshold.
737static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
738 unsigned MaxPercentThresholdBoost) {
739 if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
740 return 100;
741 else if (Cost.UnrolledCost != 0)
742 // The boosting factor is RolledDynamicCost / UnrolledCost
743 return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
744 MaxPercentThresholdBoost);
745 else
746 return MaxPercentThresholdBoost;
747}
748
749// Produce an estimate of the unrolled cost of the specified loop. This
750// is used to a) produce a cost estimate for partial unrolling and b) to
751// cheaply estimate cost for full unrolling when we don't want to symbolically
752// evaluate all iterations.
754 const unsigned LoopSize;
755
756public:
757 UnrollCostEstimator(Loop &L, unsigned LoopSize) : LoopSize(LoopSize) {}
758
759 // Returns loop size estimation for unrolled loop, given the unrolling
760 // configuration specified by UP.
763 const unsigned CountOverwrite = 0) const {
764 assert(LoopSize >= UP.BEInsns &&
765 "LoopSize should not be less than BEInsns!");
766 if (CountOverwrite)
767 return static_cast<uint64_t>(LoopSize - UP.BEInsns) * CountOverwrite +
768 UP.BEInsns;
769 else
770 return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count +
771 UP.BEInsns;
772 }
773};
774
775static std::optional<unsigned>
776shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
777 const unsigned TripMultiple, const unsigned TripCount,
778 const UnrollCostEstimator UCE,
780
781 // Using unroll pragma
782 // 1st priority is unroll count set by "unroll-count" option.
783
784 if (PInfo.UserUnrollCount) {
785 if (UP.AllowRemainder &&
786 UCE.getUnrolledLoopSize(UP, (unsigned)UnrollCount) < UP.Threshold)
787 return (unsigned)UnrollCount;
788 }
789
790 // 2nd priority is unroll count set by pragma.
791 if (PInfo.PragmaCount > 0) {
792 if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0)))
793 return PInfo.PragmaCount;
794 }
795
796 if (PInfo.PragmaFullUnroll && TripCount != 0)
797 return TripCount;
798
799 // if didn't return until here, should continue to other priorties
800 return std::nullopt;
801}
802
803static std::optional<unsigned> shouldFullUnroll(
806 const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
808 assert(FullUnrollTripCount && "should be non-zero!");
809
810 if (FullUnrollTripCount > UP.FullUnrollMaxCount)
811 return std::nullopt;
812
813 // When computing the unrolled size, note that BEInsns are not replicated
814 // like the rest of the loop body.
815 if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
816 return FullUnrollTripCount;
817
818 // The loop isn't that small, but we still can fully unroll it if that
819 // helps to remove a significant number of instructions.
820 // To check that, run additional analysis on the loop.
821 if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
822 L, FullUnrollTripCount, DT, SE, EphValues, TTI,
825 unsigned Boost =
827 if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
828 return FullUnrollTripCount;
829 }
830 return std::nullopt;
831}
832
833static std::optional<unsigned>
834shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
835 const UnrollCostEstimator UCE,
837
838 if (!TripCount)
839 return std::nullopt;
840
841 if (!UP.Partial) {
842 LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
843 << "-unroll-allow-partial not given\n");
844 return 0;
845 }
846 unsigned count = UP.Count;
847 if (count == 0)
848 count = TripCount;
849 if (UP.PartialThreshold != NoThreshold) {
850 // Reduce unroll count to be modulo of TripCount for partial unrolling.
852 count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
853 (LoopSize - UP.BEInsns);
854 if (count > UP.MaxCount)
855 count = UP.MaxCount;
856 while (count != 0 && TripCount % count != 0)
857 count--;
858 if (UP.AllowRemainder && count <= 1) {
859 // If there is no Count that is modulo of TripCount, set Count to
860 // largest power-of-two factor that satisfies the threshold limit.
861 // As we'll create fixup loop, do the type of unrolling only if
862 // remainder loop is allowed.
864 while (count != 0 &&
866 count >>= 1;
867 }
868 if (count < 2) {
869 count = 0;
870 }
871 } else {
872 count = TripCount;
873 }
874 if (count > UP.MaxCount)
875 count = UP.MaxCount;
876
877 LLVM_DEBUG(dbgs() << " partially unrolling with count: " << count << "\n");
878
879 return count;
880}
881// Returns true if unroll count was set explicitly.
882// Calculates unroll count and writes it to UP.Count.
883// Unless IgnoreUser is true, will also use metadata and command-line options
884// that are specific to to the LoopUnroll pass (which, for instance, are
885// irrelevant for the LoopUnrollAndJam pass).
886// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
887// many LoopUnroll-specific options. The shared functionality should be
888// refactored into it own function.
891 AssumptionCache *AC,
893 OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount,
894 bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize,
896 TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
897
898 UnrollCostEstimator UCE(*L, LoopSize);
899
900 const bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
901 const bool PragmaFullUnroll = hasUnrollFullPragma(L);
902 const unsigned PragmaCount = unrollCountPragmaValue(L);
903 const bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
904
905 const bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
906 PragmaEnableUnroll || UserUnrollCount;
907
908 PragmaInfo PInfo(UserUnrollCount, PragmaFullUnroll, PragmaCount,
909 PragmaEnableUnroll);
910 // Use an explicit peel count that has been specified for testing. In this
911 // case it's not permitted to also specify an explicit unroll count.
912 if (PP.PeelCount) {
913 if (UnrollCount.getNumOccurrences() > 0) {
914 report_fatal_error("Cannot specify both explicit peel count and "
915 "explicit unroll count", /*GenCrashDiag=*/false);
916 }
917 UP.Count = 1;
918 UP.Runtime = false;
919 return true;
920 }
921 // Check for explicit Count.
922 // 1st priority is unroll count set by "unroll-count" option.
923 // 2nd priority is unroll count set by pragma.
924 if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
925 UCE, UP)) {
926 UP.Count = *UnrollFactor;
927
928 if (UserUnrollCount || (PragmaCount > 0)) {
929 UP.AllowExpensiveTripCount = true;
930 UP.Force = true;
931 }
932 UP.Runtime |= (PragmaCount > 0);
933 return ExplicitUnroll;
934 } else {
935 if (ExplicitUnroll && TripCount != 0) {
936 // If the loop has an unrolling pragma, we want to be more aggressive with
937 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
938 // value which is larger than the default limits.
939 UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
941 std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
942 }
943 }
944
945 // 3rd priority is exact full unrolling. This will eliminate all copies
946 // of some exit test.
947 UP.Count = 0;
948 if (TripCount) {
949 UP.Count = TripCount;
950 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
951 TripCount, UCE, UP)) {
952 UP.Count = *UnrollFactor;
953 UseUpperBound = false;
954 return ExplicitUnroll;
955 }
956 }
957
958 // 4th priority is bounded unrolling.
959 // We can unroll by the upper bound amount if it's generally allowed or if
960 // we know that the loop is executed either the upper bound or zero times.
961 // (MaxOrZero unrolling keeps only the first loop test, so the number of
962 // loop tests remains the same compared to the non-unrolled version, whereas
963 // the generic upper bound unrolling keeps all but the last loop test so the
964 // number of loop tests goes up which may end up being worse on targets with
965 // constrained branch predictor resources so is controlled by an option.)
966 // In addition we only unroll small upper bounds.
967 // Note that the cost of bounded unrolling is always strictly greater than
968 // cost of exact full unrolling. As such, if we have an exact count and
969 // found it unprofitable, we'll never chose to bounded unroll.
970 if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
971 MaxTripCount <= UnrollMaxUpperBound) {
972 UP.Count = MaxTripCount;
973 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
974 MaxTripCount, UCE, UP)) {
975 UP.Count = *UnrollFactor;
976 UseUpperBound = true;
977 return ExplicitUnroll;
978 }
979 }
980
981 // 5th priority is loop peeling.
982 computePeelCount(L, LoopSize, PP, TripCount, DT, SE, AC, UP.Threshold);
983 if (PP.PeelCount) {
984 UP.Runtime = false;
985 UP.Count = 1;
986 return ExplicitUnroll;
987 }
988
989 // Before starting partial unrolling, set up.partial to true,
990 // if user explicitly asked for unrolling
991 if (TripCount)
992 UP.Partial |= ExplicitUnroll;
993
994 // 6th priority is partial unrolling.
995 // Try partial unroll only when TripCount could be statically calculated.
996 if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
997 UP.Count = *UnrollFactor;
998
999 if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
1000 UP.Count != TripCount)
1001 ORE->emit([&]() {
1003 "FullUnrollAsDirectedTooLarge",
1004 L->getStartLoc(), L->getHeader())
1005 << "Unable to fully unroll loop as directed by unroll pragma "
1006 "because "
1007 "unrolled size is too large.";
1008 });
1009
1010 if (UP.PartialThreshold != NoThreshold) {
1011 if (UP.Count == 0) {
1012 if (PragmaEnableUnroll)
1013 ORE->emit([&]() {
1015 "UnrollAsDirectedTooLarge",
1016 L->getStartLoc(), L->getHeader())
1017 << "Unable to unroll loop as directed by unroll(enable) "
1018 "pragma "
1019 "because unrolled size is too large.";
1020 });
1021 }
1022 }
1023 return ExplicitUnroll;
1024 }
1025 assert(TripCount == 0 &&
1026 "All cases when TripCount is constant should be covered here.");
1027 if (PragmaFullUnroll)
1028 ORE->emit([&]() {
1030 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
1031 L->getStartLoc(), L->getHeader())
1032 << "Unable to fully unroll loop as directed by unroll(full) "
1033 "pragma "
1034 "because loop has a runtime trip count.";
1035 });
1036
1037 // 7th priority is runtime unrolling.
1038 // Don't unroll a runtime trip count loop when it is disabled.
1040 UP.Count = 0;
1041 return false;
1042 }
1043
1044 // Don't unroll a small upper bound loop unless user or TTI asked to do so.
1045 if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
1046 UP.Count = 0;
1047 return false;
1048 }
1049
1050 // Check if the runtime trip count is too small when profile is available.
1051 if (L->getHeader()->getParent()->hasProfileData()) {
1052 if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1053 if (*ProfileTripCount < FlatLoopTripCountThreshold)
1054 return false;
1055 else
1056 UP.AllowExpensiveTripCount = true;
1057 }
1058 }
1059 UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
1060 if (!UP.Runtime) {
1061 LLVM_DEBUG(
1062 dbgs() << " will not try to unroll loop with runtime trip count "
1063 << "-unroll-runtime not given\n");
1064 UP.Count = 0;
1065 return false;
1066 }
1067 if (UP.Count == 0)
1069
1070 // Reduce unroll count to be the largest power-of-two factor of
1071 // the original count which satisfies the threshold limit.
1072 while (UP.Count != 0 &&
1074 UP.Count >>= 1;
1075
1076#ifndef NDEBUG
1077 unsigned OrigCount = UP.Count;
1078#endif
1079
1080 if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1081 while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1082 UP.Count >>= 1;
1083 LLVM_DEBUG(
1084 dbgs() << "Remainder loop is restricted (that could architecture "
1085 "specific or because the loop contains a convergent "
1086 "instruction), so unroll count must divide the trip "
1087 "multiple, "
1088 << TripMultiple << ". Reducing unroll count from " << OrigCount
1089 << " to " << UP.Count << ".\n");
1090
1091 using namespace ore;
1092
1093 if (unrollCountPragmaValue(L) > 0 && !UP.AllowRemainder)
1094 ORE->emit([&]() {
1096 "DifferentUnrollCountFromDirected",
1097 L->getStartLoc(), L->getHeader())
1098 << "Unable to unroll loop the number of times directed by "
1099 "unroll_count pragma because remainder loop is restricted "
1100 "(that could architecture specific or because the loop "
1101 "contains a convergent instruction) and so must have an "
1102 "unroll "
1103 "count that divides the loop trip multiple of "
1104 << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
1105 << NV("UnrollCount", UP.Count) << " time(s).";
1106 });
1107 }
1108
1109 if (UP.Count > UP.MaxCount)
1110 UP.Count = UP.MaxCount;
1111
1112 if (MaxTripCount && UP.Count > MaxTripCount)
1113 UP.Count = MaxTripCount;
1114
1115 LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
1116 << "\n");
1117 if (UP.Count < 2)
1118 UP.Count = 0;
1119 return ExplicitUnroll;
1120}
1121
1122static LoopUnrollResult
1126 ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1127 bool OnlyWhenForced, bool ForgetAllSCEV,
1128 std::optional<unsigned> ProvidedCount,
1129 std::optional<unsigned> ProvidedThreshold,
1130 std::optional<bool> ProvidedAllowPartial,
1131 std::optional<bool> ProvidedRuntime,
1132 std::optional<bool> ProvidedUpperBound,
1133 std::optional<bool> ProvidedAllowPeeling,
1134 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1135 std::optional<unsigned> ProvidedFullUnrollMaxCount) {
1136 LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1137 << L->getHeader()->getParent()->getName() << "] Loop %"
1138 << L->getHeader()->getName() << "\n");
1140 if (TM & TM_Disable)
1141 return LoopUnrollResult::Unmodified;
1142
1143 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1144 // parent loop has an explicit unroll-and-jam pragma. This is to prevent
1145 // automatic unrolling from interfering with the user requested
1146 // transformation.
1147 Loop *ParentL = L->getParentLoop();
1148 if (ParentL != nullptr &&
1151 LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
1152 << " llvm.loop.unroll_and_jam.\n");
1153 return LoopUnrollResult::Unmodified;
1154 }
1155
1156 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1157 // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1158 // unrolling from interfering with the user requested transformation.
1161 LLVM_DEBUG(
1162 dbgs()
1163 << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1164 return LoopUnrollResult::Unmodified;
1165 }
1166
1167 if (!L->isLoopSimplifyForm()) {
1168 LLVM_DEBUG(
1169 dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
1170 return LoopUnrollResult::Unmodified;
1171 }
1172
1173 // When automatic unrolling is disabled, do not unroll unless overridden for
1174 // this loop.
1175 if (OnlyWhenForced && !(TM & TM_Enable))
1176 return LoopUnrollResult::Unmodified;
1177
1178 bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1179 unsigned NumInlineCandidates;
1180 bool NotDuplicatable;
1181 bool Convergent;
1183 L, SE, TTI, BFI, PSI, ORE, OptLevel, ProvidedThreshold, ProvidedCount,
1184 ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1185 ProvidedFullUnrollMaxCount);
1187 L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
1188
1189 // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1190 // as threshold later on.
1191 if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1192 !OptForSize)
1193 return LoopUnrollResult::Unmodified;
1194
1196 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
1197
1198 InstructionCost LoopSizeIC =
1199 ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
1200 TTI, EphValues, UP.BEInsns);
1201 LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSizeIC << "\n");
1202
1203 if (!LoopSizeIC.isValid()) {
1204 LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
1205 << " with invalid cost.\n");
1206 return LoopUnrollResult::Unmodified;
1207 }
1208 unsigned LoopSize = *LoopSizeIC.getValue();
1209
1210 if (NotDuplicatable) {
1211 LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
1212 << " instructions.\n");
1213 return LoopUnrollResult::Unmodified;
1214 }
1215
1216 // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1217 // later), to (fully) unroll loops, if it does not increase code size.
1218 if (OptForSize)
1219 UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
1220
1221 if (NumInlineCandidates != 0) {
1222 LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
1223 return LoopUnrollResult::Unmodified;
1224 }
1225
1226 // Find the smallest exact trip count for any exit. This is an upper bound
1227 // on the loop trip count, but an exit at an earlier iteration is still
1228 // possible. An unroll by the smallest exact trip count guarantees that all
1229 // branches relating to at least one exit can be eliminated. This is unlike
1230 // the max trip count, which only guarantees that the backedge can be broken.
1231 unsigned TripCount = 0;
1232 unsigned TripMultiple = 1;
1233 SmallVector<BasicBlock *, 8> ExitingBlocks;
1234 L->getExitingBlocks(ExitingBlocks);
1235 for (BasicBlock *ExitingBlock : ExitingBlocks)
1236 if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1237 if (!TripCount || TC < TripCount)
1238 TripCount = TripMultiple = TC;
1239
1240 if (!TripCount) {
1241 // If no exact trip count is known, determine the trip multiple of either
1242 // the loop latch or the single exiting block.
1243 // TODO: Relax for multiple exits.
1244 BasicBlock *ExitingBlock = L->getLoopLatch();
1245 if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
1246 ExitingBlock = L->getExitingBlock();
1247 if (ExitingBlock)
1248 TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1249 }
1250
1251 // If the loop contains a convergent operation, the prelude we'd add
1252 // to do the first few instructions before we hit the unrolled loop
1253 // is unsafe -- it adds a control-flow dependency to the convergent
1254 // operation. Therefore restrict remainder loop (try unrolling without).
1255 //
1256 // TODO: This is quite conservative. In practice, convergent_op()
1257 // is likely to be called unconditionally in the loop. In this
1258 // case, the program would be ill-formed (on most architectures)
1259 // unless n were the same on all threads in a thread group.
1260 // Assuming n is the same on all threads, any kind of unrolling is
1261 // safe. But currently llvm's notion of convergence isn't powerful
1262 // enough to express this.
1263 if (Convergent)
1264 UP.AllowRemainder = false;
1265
1266 // Try to find the trip count upper bound if we cannot find the exact trip
1267 // count.
1268 unsigned MaxTripCount = 0;
1269 bool MaxOrZero = false;
1270 if (!TripCount) {
1271 MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1272 MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1273 }
1274
1275 // computeUnrollCount() decides whether it is beneficial to use upper bound to
1276 // fully unroll the loop.
1277 bool UseUpperBound = false;
1278 bool IsCountSetExplicitly = computeUnrollCount(
1279 L, TTI, DT, LI, &AC, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
1280 TripMultiple, LoopSize, UP, PP, UseUpperBound);
1281 if (!UP.Count)
1282 return LoopUnrollResult::Unmodified;
1283
1284 if (PP.PeelCount) {
1285 assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
1286 LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1287 << " with iteration count " << PP.PeelCount << "!\n");
1288 ORE.emit([&]() {
1289 return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
1290 L->getHeader())
1291 << " peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
1292 << " iterations";
1293 });
1294
1295 ValueToValueMapTy VMap;
1296 if (peelLoop(L, PP.PeelCount, LI, &SE, DT, &AC, PreserveLCSSA, VMap)) {
1297 simplifyLoopAfterUnroll(L, true, LI, &SE, &DT, &AC, &TTI);
1298 // If the loop was peeled, we already "used up" the profile information
1299 // we had, so we don't want to unroll or peel again.
1301 L->setLoopAlreadyUnrolled();
1302 return LoopUnrollResult::PartiallyUnrolled;
1303 }
1304 return LoopUnrollResult::Unmodified;
1305 }
1306
1307 // At this point, UP.Runtime indicates that run-time unrolling is allowed.
1308 // However, we only want to actually perform it if we don't know the trip
1309 // count and the unroll count doesn't divide the known trip multiple.
1310 // TODO: This decision should probably be pushed up into
1311 // computeUnrollCount().
1312 UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;
1313
1314 // Save loop properties before it is transformed.
1315 MDNode *OrigLoopID = L->getLoopID();
1316
1317 // Unroll the loop.
1318 Loop *RemainderLoop = nullptr;
1319 LoopUnrollResult UnrollResult = UnrollLoop(
1320 L,
1322 UP.UnrollRemainder, ForgetAllSCEV},
1323 LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
1324 if (UnrollResult == LoopUnrollResult::Unmodified)
1325 return LoopUnrollResult::Unmodified;
1326
1327 if (RemainderLoop) {
1328 std::optional<MDNode *> RemainderLoopID =
1331 if (RemainderLoopID)
1332 RemainderLoop->setLoopID(*RemainderLoopID);
1333 }
1334
1335 if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1336 std::optional<MDNode *> NewLoopID =
1339 if (NewLoopID) {
1340 L->setLoopID(*NewLoopID);
1341
1342 // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1343 // explicitly.
1344 return UnrollResult;
1345 }
1346 }
1347
1348 // If loop has an unroll count pragma or unrolled by explicitly set count
1349 // mark loop as unrolled to prevent unrolling beyond that requested.
1350 if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
1351 L->setLoopAlreadyUnrolled();
1352
1353 return UnrollResult;
1354}
1355
1356namespace {
1357
1358class LoopUnroll : public LoopPass {
1359public:
1360 static char ID; // Pass ID, replacement for typeid
1361
1362 int OptLevel;
1363
1364 /// If false, use a cost model to determine whether unrolling of a loop is
1365 /// profitable. If true, only loops that explicitly request unrolling via
1366 /// metadata are considered. All other loops are skipped.
1367 bool OnlyWhenForced;
1368
1369 /// If false, when SCEV is invalidated, only forget everything in the
1370 /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1371 /// Otherwise, forgetAllLoops and rebuild when needed next.
1372 bool ForgetAllSCEV;
1373
1374 std::optional<unsigned> ProvidedCount;
1375 std::optional<unsigned> ProvidedThreshold;
1376 std::optional<bool> ProvidedAllowPartial;
1377 std::optional<bool> ProvidedRuntime;
1378 std::optional<bool> ProvidedUpperBound;
1379 std::optional<bool> ProvidedAllowPeeling;
1380 std::optional<bool> ProvidedAllowProfileBasedPeeling;
1381 std::optional<unsigned> ProvidedFullUnrollMaxCount;
1382
1383 LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1384 bool ForgetAllSCEV = false,
1385 std::optional<unsigned> Threshold = std::nullopt,
1386 std::optional<unsigned> Count = std::nullopt,
1387 std::optional<bool> AllowPartial = std::nullopt,
1388 std::optional<bool> Runtime = std::nullopt,
1389 std::optional<bool> UpperBound = std::nullopt,
1390 std::optional<bool> AllowPeeling = std::nullopt,
1391 std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1392 std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1393 : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1394 ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1395 ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1396 ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1397 ProvidedAllowPeeling(AllowPeeling),
1398 ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1399 ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1401 }
1402
1403 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1404 if (skipLoop(L))
1405 return false;
1406
1407 Function &F = *L->getHeader()->getParent();
1408
1409 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1410 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1411 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1412 const TargetTransformInfo &TTI =
1413 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1414 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1415 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1416 // pass. Function analyses need to be preserved across loop transformations
1417 // but ORE cannot be preserved (see comment before the pass definition).
1419 bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
1420
1422 L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
1423 OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
1424 ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
1425 ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
1426 ProvidedFullUnrollMaxCount);
1427
1428 if (Result == LoopUnrollResult::FullyUnrolled)
1429 LPM.markLoopAsDeleted(*L);
1430
1432 }
1433
1434 /// This transformation requires natural loop information & requires that
1435 /// loop preheaders be inserted into the CFG...
1436 void getAnalysisUsage(AnalysisUsage &AU) const override {
1439 // FIXME: Loop passes are required to preserve domtree, and for now we just
1440 // recreate dom info if anything gets unrolled.
1442 }
1443};
1444
1445} // end anonymous namespace
1446
1447char LoopUnroll::ID = 0;
1448
1449INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1453INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1454
1455Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1456 bool ForgetAllSCEV, int Threshold, int Count,
1457 int AllowPartial, int Runtime, int UpperBound,
1458 int AllowPeeling) {
1459 // TODO: It would make more sense for this function to take the optionals
1460 // directly, but that's dangerous since it would silently break out of tree
1461 // callers.
1462 return new LoopUnroll(
1463 OptLevel, OnlyWhenForced, ForgetAllSCEV,
1464 Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1465 Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1466 AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1467 Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
1468 UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1469 AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1470}
1471
1472Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1473 bool ForgetAllSCEV) {
1474 return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
1475 0, 0, 0, 1);
1476}
1477
1480 LPMUpdater &Updater) {
1481 // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1482 // pass. Function analyses need to be preserved across loop transformations
1483 // but ORE cannot be preserved (see comment before the pass definition).
1484 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1485
1486 // Keep track of the previous loop structure so we can identify new loops
1487 // created by unrolling.
1488 Loop *ParentL = L.getParentLoop();
1489 SmallPtrSet<Loop *, 4> OldLoops;
1490 if (ParentL)
1491 OldLoops.insert(ParentL->begin(), ParentL->end());
1492 else
1493 OldLoops.insert(AR.LI.begin(), AR.LI.end());
1494
1495 std::string LoopName = std::string(L.getName());
1496
1497 bool Changed =
1498 tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
1499 /*BFI*/ nullptr, /*PSI*/ nullptr,
1500 /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
1501 ForgetSCEV, /*Count*/ std::nullopt,
1502 /*Threshold*/ std::nullopt, /*AllowPartial*/ false,
1503 /*Runtime*/ false, /*UpperBound*/ false,
1504 /*AllowPeeling*/ true,
1505 /*AllowProfileBasedPeeling*/ false,
1506 /*FullUnrollMaxCount*/ std::nullopt) !=
1508 if (!Changed)
1509 return PreservedAnalyses::all();
1510
1511 // The parent must not be damaged by unrolling!
1512#ifndef NDEBUG
1513 if (ParentL)
1514 ParentL->verifyLoop();
1515#endif
1516
1517 // Unrolling can do several things to introduce new loops into a loop nest:
1518 // - Full unrolling clones child loops within the current loop but then
1519 // removes the current loop making all of the children appear to be new
1520 // sibling loops.
1521 //
1522 // When a new loop appears as a sibling loop after fully unrolling,
1523 // its nesting structure has fundamentally changed and we want to revisit
1524 // it to reflect that.
1525 //
1526 // When unrolling has removed the current loop, we need to tell the
1527 // infrastructure that it is gone.
1528 //
1529 // Finally, we support a debugging/testing mode where we revisit child loops
1530 // as well. These are not expected to require further optimizations as either
1531 // they or the loop they were cloned from have been directly visited already.
1532 // But the debugging mode allows us to check this assumption.
1533 bool IsCurrentLoopValid = false;
1534 SmallVector<Loop *, 4> SibLoops;
1535 if (ParentL)
1536 SibLoops.append(ParentL->begin(), ParentL->end());
1537 else
1538 SibLoops.append(AR.LI.begin(), AR.LI.end());
1539 erase_if(SibLoops, [&](Loop *SibLoop) {
1540 if (SibLoop == &L) {
1541 IsCurrentLoopValid = true;
1542 return true;
1543 }
1544
1545 // Otherwise erase the loop from the list if it was in the old loops.
1546 return OldLoops.contains(SibLoop);
1547 });
1548 Updater.addSiblingLoops(SibLoops);
1549
1550 if (!IsCurrentLoopValid) {
1551 Updater.markLoopAsDeleted(L, LoopName);
1552 } else {
1553 // We can only walk child loops if the current loop remained valid.
1555 // Walk *all* of the child loops.
1556 SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1557 Updater.addChildLoops(ChildLoops);
1558 }
1559 }
1560
1562}
1563
1566 auto &LI = AM.getResult<LoopAnalysis>(F);
1567 // There are no loops in the function. Return before computing other expensive
1568 // analyses.
1569 if (LI.empty())
1570 return PreservedAnalyses::all();
1571 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
1572 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
1573 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
1574 auto &AC = AM.getResult<AssumptionAnalysis>(F);
1576
1577 LoopAnalysisManager *LAM = nullptr;
1578 if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
1579 LAM = &LAMProxy->getManager();
1580
1581 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
1582 ProfileSummaryInfo *PSI =
1583 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
1584 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1585 &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
1586
1587 bool Changed = false;
1588
1589 // The unroller requires loops to be in simplified form, and also needs LCSSA.
1590 // Since simplification may add new inner loops, it has to run before the
1591 // legality and profitability checks. This means running the loop unroller
1592 // will simplify all loops, regardless of whether anything end up being
1593 // unrolled.
1594 for (const auto &L : LI) {
1595 Changed |=
1596 simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
1597 Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
1598 }
1599
1600 // Add the loop nests in the reverse order of LoopInfo. See method
1601 // declaration.
1603 appendLoopsToWorklist(LI, Worklist);
1604
1605 while (!Worklist.empty()) {
1606 // Because the LoopInfo stores the loops in RPO, we walk the worklist
1607 // from back to front so that we work forward across the CFG, which
1608 // for unrolling is only needed to get optimization remarks emitted in
1609 // a forward order.
1610 Loop &L = *Worklist.pop_back_val();
1611#ifndef NDEBUG
1612 Loop *ParentL = L.getParentLoop();
1613#endif
1614
1615 // Check if the profile summary indicates that the profiled application
1616 // has a huge working set size, in which case we disable peeling to avoid
1617 // bloating it further.
1618 std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1619 if (PSI && PSI->hasHugeWorkingSetSize())
1620 LocalAllowPeeling = false;
1621 std::string LoopName = std::string(L.getName());
1622 // The API here is quite complex to call and we allow to select some
1623 // flavors of unrolling during construction time (by setting UnrollOpts).
1625 &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
1626 /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
1627 UnrollOpts.ForgetSCEV, /*Count*/ std::nullopt,
1628 /*Threshold*/ std::nullopt, UnrollOpts.AllowPartial,
1629 UnrollOpts.AllowRuntime, UnrollOpts.AllowUpperBound, LocalAllowPeeling,
1630 UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
1631 Changed |= Result != LoopUnrollResult::Unmodified;
1632
1633 // The parent must not be damaged by unrolling!
1634#ifndef NDEBUG
1635 if (Result != LoopUnrollResult::Unmodified && ParentL)
1636 ParentL->verifyLoop();
1637#endif
1638
1639 // Clear any cached analysis results for L if we removed it completely.
1640 if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1641 LAM->clear(L, LoopName);
1642 }
1643
1644 if (!Changed)
1645 return PreservedAnalyses::all();
1646
1648}
1649
1651 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1652 static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
1653 OS, MapClassName2PassName);
1654 OS << '<';
1655 if (UnrollOpts.AllowPartial != std::nullopt)
1656 OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1657 if (UnrollOpts.AllowPeeling != std::nullopt)
1658 OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1659 if (UnrollOpts.AllowRuntime != std::nullopt)
1660 OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1661 if (UnrollOpts.AllowUpperBound != std::nullopt)
1662 OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1663 if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1664 OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1665 << "profile-peeling;";
1666 if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1667 OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';
1668 OS << 'O' << UnrollOpts.OptLevel;
1669 OS << '>';
1670}
amdgpu Simplify well known AMD library false FunctionCallee Callee
Rewrite undef for PHI
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header provides classes for managing per-loop analyses.
loops
Definition: LoopInfo.cpp:1177
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
static cl::opt< unsigned > UnrollMaxCount("unroll-max-count", cl::Hidden, cl::desc("Set the max unroll count for partial and runtime unrolling, for" "testing purposes"))
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
static LoopUnrollResult tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel, bool OnlyWhenForced, bool ForgetAllSCEV, std::optional< unsigned > ProvidedCount, std::optional< unsigned > ProvidedThreshold, std::optional< bool > ProvidedAllowPartial, std::optional< bool > ProvidedRuntime, std::optional< bool > ProvidedUpperBound, std::optional< bool > ProvidedAllowPeeling, std::optional< bool > ProvidedAllowProfileBasedPeeling, std::optional< unsigned > ProvidedFullUnrollMaxCount)
static cl::opt< unsigned > UnrollThresholdDefault("unroll-threshold-default", cl::init(150), cl::Hidden, cl::desc("Default threshold (max size of unrolled " "loop), used in all but O3 optimizations"))
static cl::opt< unsigned > FlatLoopTripCountThreshold("flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, cl::desc("If the runtime tripcount for the loop is lower than the " "threshold, the loop is considered as flat and will be less " "aggressively unrolled."))
static cl::opt< unsigned > UnrollMaxIterationsCountToAnalyze("unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, cl::desc("Don't allow loop unrolling to simulate more than this number of" "iterations when checking full unroll profitability"))
static MDNode * getUnrollMetadataForLoop(const Loop *L, StringRef Name)
static cl::opt< unsigned > UnrollOptSizeThreshold("unroll-optsize-threshold", cl::init(0), cl::Hidden, cl::desc("The cost threshold for loop unrolling when optimizing for " "size"))
static bool hasUnrollFullPragma(const Loop *L)
static cl::opt< bool > UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled."))
static unsigned unrollCountPragmaValue(const Loop *L)
static bool hasUnrollEnablePragma(const Loop *L)
static std::optional< unsigned > shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static cl::opt< unsigned > UnrollFullMaxCount("unroll-full-max-count", cl::Hidden, cl::desc("Set the max unroll count for full unrolling, for testing purposes"))
static cl::opt< unsigned > UnrollMaxUpperBound("unroll-max-upperbound", cl::init(8), cl::Hidden, cl::desc("The max of trip count upper bound that is considered in unrolling"))
static std::optional< unsigned > shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static std::optional< EstimatedUnrollCost > analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize, unsigned MaxIterationsCountToAnalyze)
Figure out if the loop is worth full unrolling.
static cl::opt< unsigned > UnrollPartialThreshold("unroll-partial-threshold", cl::Hidden, cl::desc("The cost threshold for partial loop unrolling"))
static cl::opt< bool > UnrollAllowRemainder("unroll-allow-remainder", cl::Hidden, cl::desc("Allow generation of a loop remainder (extra iterations) " "when unrolling a loop."))
static std::optional< unsigned > shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP)
static const unsigned NoThreshold
A magic value for use with the Threshold parameter to indicate that the loop unroll should be perform...
static cl::opt< bool > UnrollRevisitChildLoops("unroll-revisit-child-loops", cl::Hidden, cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " "This shouldn't typically be needed as child loops (or their " "clones) were already visited."))
static cl::opt< unsigned > UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling"))
static cl::opt< bool > UnrollRuntime("unroll-runtime", cl::Hidden, cl::desc("Unroll loops with run-time trip counts"))
static bool hasRuntimeUnrollDisablePragma(const Loop *L)
static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, unsigned MaxPercentThresholdBoost)
static cl::opt< unsigned > UnrollThresholdAggressive("unroll-threshold-aggressive", cl::init(300), cl::Hidden, cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " "optimizations"))
#define DEBUG_TYPE
static cl::opt< unsigned > UnrollMaxPercentThresholdBoost("unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " "to the threshold when aggressively unrolling a loop due to the " "dynamic cost savings. If completely unrolling a loop will reduce " "the total runtime from X to Y, we boost the loop unroll " "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, " "X/Y). This limit avoids excessive code bloat."))
static cl::opt< unsigned > PragmaUnrollThreshold("pragma-unroll-threshold", cl::init(16 *1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma."))
static cl::opt< bool > UnrollAllowPartial("unroll-allow-partial", cl::Hidden, cl::desc("Allows loops to be partially unrolled until " "-unroll-threshold loop size is reached."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Machine Trace Metrics
This file contains the declarations for metadata subclasses.
if(VerifyEach)
LoopAnalysisManager LAM
const char LLVMTargetMachineRef TM
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
@ SI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
UnrollCostEstimator(Loop &L, unsigned LoopSize)
uint64_t getUnrolledLoopSize(const TargetTransformInfo::UnrollingPreferences &UP, const unsigned CountOverwrite=0) const
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
void clear(IRUnitT &IR, llvm::StringRef Name)
Clear any cached analysis results for a single unit of IR.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:793
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:774
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:127
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
This is an important base class in LLVM.
Definition: Constant.h:41
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:646
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
Definition: PassManager.h:933
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:74
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
void addChildLoops(ArrayRef< Loop * > NewChildLoops)
Loop passes should use this method to indicate they have added new child loops of the current loop.
void markLoopAsDeleted(Loop &L, llvm::StringRef Name)
Loop passes should use this method to indicate they have deleted a loop from the nest.
void addSiblingLoops(ArrayRef< Loop * > NewSibLoops)
Loop passes should use this method to indicate they have added new sibling loops to the current loop.
void markLoopAsDeleted(Loop &L)
Definition: LoopPass.cpp:110
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:1268
void verifyLoop() const
Verify loop structure.
Definition: LoopInfoImpl.h:302
iterator end() const
Definition: LoopInfo.h:172
iterator begin() const
Definition: LoopInfo.h:171
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
iterator end() const
Definition: LoopInfo.h:968
bool empty() const
Definition: LoopInfo.h:971
iterator begin() const
Definition: LoopInfo.h:967
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
bool skipLoop(const Loop *L) const
Optional passes call this function to check whether the pass should be skipped.
Definition: LoopPass.cpp:370
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
Definition: LoopInfo.cpp:525
Metadata node.
Definition: Metadata.h:943
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1291
unsigned getNumOperands() const
Return number of MDNode operands.
Definition: Metadata.h:1297
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over a "outer" IR uni...
Definition: PassManager.h:1058
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
bool mustPreserveAnalysisID(char &AID) const
mustPreserveAnalysisID - This method serves the same function as getAnalysisIfAvailable,...
Definition: Pass.cpp:69
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:152
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:158
bool empty() const
Determine if the PriorityWorklist is empty or not.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isBackedgeTakenCountMaxOrZero(const Loop *L)
Return true if the backedge taken count is either the value returned by getConstantMaxBackedgeTakenCo...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
void clear()
Completely clear the SetVector.
Definition: SetVector.h:213
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:72
A version of PriorityWorklist that selects small size optimized data structures for the vector and ma...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:344
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:389
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:301
bool empty() const
Definition: SmallVector.h:94
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:687
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:179
An efficient, type-erasing, non-owning reference to a callable.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:824
void initializeLoopUnrollPass(PassRegistry &)
void computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, AssumptionCache *AC=nullptr, unsigned Threshold=UINT_MAX)
Definition: LoopPeel.cpp:472
auto successors(const MachineBasicBlock *BB)
@ Runtime
Detect stack use after return if not disabled runtime with (ASAN_OPTIONS=detect_stack_use_after_retur...
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:410
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
Definition: LoopUtils.cpp:263
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
char & LCSSAID
Definition: LCSSA.cpp:492
Pass * createLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false, int Threshold=-1, int Count=-1, int AllowPartial=-1, int Runtime=-1, int UpperBound=-1, int AllowPeeling=-1)
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:811
TransformationMode hasUnrollAndJamTransformation(const Loop *L)
Definition: LoopUtils.cpp:373
cl::opt< bool > ForgetSCEVInLoopUnroll
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:145
TransformationMode hasUnrollTransformation(const Loop *L)
Definition: LoopUtils.cpp:352
LoopUnrollResult
Represents the result of a UnrollLoop invocation.
Definition: UnrollLoop.h:54
@ Unmodified
The loop was not modified.
@ FullyUnrolled
The loop was fully unrolled into straight-line code.
void getLoopAnalysisUsage(AnalysisUsage &AU)
Helper to consistently add the set of standard passes to a loop pass's AnalysisUsage.
Definition: LoopUtils.cpp:141
const char *const LLVMLoopUnrollFollowupAll
Definition: UnrollLoop.h:42
TargetTransformInfo TTI
TransformationMode
The mode sets how eager a transformation should be applied.
Definition: LoopUtils.h:271
@ TM_ForcedByUser
The transformation was directed by the user, e.g.
Definition: LoopUtils.h:288
@ TM_Disable
The transformation should not be applied.
Definition: LoopUtils.h:280
@ TM_Enable
The transformation should be applied without considering a cost model.
Definition: LoopUtils.h:277
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:2011
void appendLoopsToWorklist(RangeT &&, SmallPriorityWorklist< Loop *, 4 > &)
Utility that implements appending of loops onto a worklist given a range.
Definition: LoopUtils.cpp:1537
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1946
const char *const LLVMLoopUnrollFollowupRemainder
Definition: UnrollLoop.h:45
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
const char *const LLVMLoopUnrollFollowupUnrolled
Definition: UnrollLoop.h:43
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2113
InstructionCost ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent, const TargetTransformInfo &TTI, const SmallPtrSetImpl< const Value * > &EphValues, unsigned BEInsns)
ApproximateLoopSize - Approximate the size of the loop.
LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const llvm::TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop=nullptr)
Unroll the given loop by Count.
Definition: LoopUnroll.cpp:269
void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, const TargetTransformInfo *TTI)
Perform some cleanup and simplifications on loops after unrolling.
Definition: LoopUnroll.cpp:215
MDNode * GetUnrollMetadata(MDNode *LoopID, StringRef Name)
Given an llvm.loop loop id metadata node, returns the loop hint metadata node with the given name (fo...
Definition: LoopUnroll.cpp:901
Pass * createSimpleLoopUnrollPass(int OptLevel=2, bool OnlyWhenForced=false, bool ForgetAllSCEV=false)
bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE, DominatorTree &DT, AssumptionCache *AC, bool PreserveLCSSA, ValueToValueMapTy &VMap)
VMap is the value-map that maps instructions from the original loop to instructions in the last peele...
Definition: LoopPeel.cpp:855
Definition: BitVector.h:858
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition: CodeMetrics.h:31
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
An information struct used to provide DenseMap with the various necessary components for a given valu...
Definition: DenseMapInfo.h:51
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
bool OnlyWhenForced
If false, use a cost model to determine whether unrolling of a loop is profitable.
const bool ForgetSCEV
If true, forget all loops when unrolling.
std::optional< unsigned > FullUnrollMaxCount
std::optional< bool > AllowPartial
std::optional< bool > AllowRuntime
std::optional< bool > AllowProfileBasedPeeling
std::optional< bool > AllowPeeling
std::optional< bool > AllowUpperBound
A CRTP mix-in to automatically provide informational APIs needed for passes.
Definition: PassManager.h:371
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodied of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned MaxPercentThresholdBoost
If complete unrolling will reduce the cost of the loop, we will boost the Threshold by a certain perc...
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
bool AllowRemainder
Allow generation of a loop remainder (extra iterations after unroll).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...