LLVM 19.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/PassManager.h"
100#include "llvm/IR/Type.h"
101#include "llvm/IR/Use.h"
102#include "llvm/IR/User.h"
103#include "llvm/IR/Value.h"
104#include "llvm/IR/ValueHandle.h"
106#include "llvm/Pass.h"
107#include "llvm/Support/Casting.h"
110#include "llvm/Support/Debug.h"
120#include <algorithm>
121#include <cassert>
122#include <cstddef>
123#include <cstdint>
124#include <iterator>
125#include <limits>
126#include <map>
127#include <numeric>
128#include <optional>
129#include <utility>
130
131using namespace llvm;
132
133#define DEBUG_TYPE "loop-reduce"
134
135/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
136/// bail out. This threshold is far beyond the number of users that LSR can
137/// conceivably solve, so it should not affect generated code, but catches the
138/// worst cases before LSR burns too much compile time and stack space.
139static const unsigned MaxIVUsers = 200;
140
141/// Limit the size of expression that SCEV-based salvaging will attempt to
142/// translate into a DIExpression.
143/// Choose a maximum size such that debuginfo is not excessively increased and
144/// the salvaging is not too expensive for the compiler.
145static const unsigned MaxSCEVSalvageExpressionSize = 64;
146
147// Cleanup congruent phis after LSR phi expansion.
149 "enable-lsr-phielim", cl::Hidden, cl::init(true),
150 cl::desc("Enable LSR phi elimination"));
151
152// The flag adds instruction count to solutions cost comparison.
154 "lsr-insns-cost", cl::Hidden, cl::init(true),
155 cl::desc("Add instruction count to a LSR cost model"));
156
157// Flag to choose how to narrow complex lsr solution
159 "lsr-exp-narrow", cl::Hidden, cl::init(false),
160 cl::desc("Narrow LSR complex solution using"
161 " expectation of registers number"));
162
163// Flag to narrow search space by filtering non-optimal formulae with
164// the same ScaledReg and Scale.
166 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168 " with the same ScaledReg and Scale"));
169
171 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172 cl::desc("A flag that overrides the target's preferred addressing mode."),
174 "none",
175 "Don't prefer any addressing mode"),
177 "preindexed",
178 "Prefer pre-indexed addressing mode"),
180 "postindexed",
181 "Prefer post-indexed addressing mode")));
182
184 "lsr-complexity-limit", cl::Hidden,
185 cl::init(std::numeric_limits<uint16_t>::max()),
186 cl::desc("LSR search space complexity limit"));
187
189 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190 cl::desc("The limit on recursion depth for LSRs setup cost"));
191
193 "lsr-term-fold", cl::Hidden,
194 cl::desc("Attempt to replace primary IV with other IV."));
195
197 "lsr-drop-solution", cl::Hidden,
198 cl::desc("Attempt to drop solution if it is less profitable"));
199
200STATISTIC(NumTermFold,
201 "Number of terminating condition fold recognized and performed");
202
203#ifndef NDEBUG
204// Stress test IV chain generation.
206 "stress-ivchain", cl::Hidden, cl::init(false),
207 cl::desc("Stress test LSR IV chains"));
208#else
209static bool StressIVChain = false;
210#endif
211
212namespace {
213
214struct MemAccessTy {
215 /// Used in situations where the accessed memory type is unknown.
216 static const unsigned UnknownAddressSpace =
217 std::numeric_limits<unsigned>::max();
218
219 Type *MemTy = nullptr;
220 unsigned AddrSpace = UnknownAddressSpace;
221
222 MemAccessTy() = default;
223 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
224
225 bool operator==(MemAccessTy Other) const {
226 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
227 }
228
229 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
230
231 static MemAccessTy getUnknown(LLVMContext &Ctx,
232 unsigned AS = UnknownAddressSpace) {
233 return MemAccessTy(Type::getVoidTy(Ctx), AS);
234 }
235
236 Type *getType() { return MemTy; }
237};
238
239/// This class holds data which is used to order reuse candidates.
240class RegSortData {
241public:
242 /// This represents the set of LSRUse indices which reference
243 /// a particular register.
244 SmallBitVector UsedByIndices;
245
246 void print(raw_ostream &OS) const;
247 void dump() const;
248};
249
250} // end anonymous namespace
251
252#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
253void RegSortData::print(raw_ostream &OS) const {
254 OS << "[NumUses=" << UsedByIndices.count() << ']';
255}
256
257LLVM_DUMP_METHOD void RegSortData::dump() const {
258 print(errs()); errs() << '\n';
259}
260#endif
261
262namespace {
263
264/// Map register candidates to information about how they are used.
265class RegUseTracker {
266 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
267
268 RegUsesTy RegUsesMap;
270
271public:
272 void countRegister(const SCEV *Reg, size_t LUIdx);
273 void dropRegister(const SCEV *Reg, size_t LUIdx);
274 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
275
276 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
277
278 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
279
280 void clear();
281
284
285 iterator begin() { return RegSequence.begin(); }
286 iterator end() { return RegSequence.end(); }
287 const_iterator begin() const { return RegSequence.begin(); }
288 const_iterator end() const { return RegSequence.end(); }
289};
290
291} // end anonymous namespace
292
293void
294RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
295 std::pair<RegUsesTy::iterator, bool> Pair =
296 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
297 RegSortData &RSD = Pair.first->second;
298 if (Pair.second)
299 RegSequence.push_back(Reg);
300 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
301 RSD.UsedByIndices.set(LUIdx);
302}
303
304void
305RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
306 RegUsesTy::iterator It = RegUsesMap.find(Reg);
307 assert(It != RegUsesMap.end());
308 RegSortData &RSD = It->second;
309 assert(RSD.UsedByIndices.size() > LUIdx);
310 RSD.UsedByIndices.reset(LUIdx);
311}
312
313void
314RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
315 assert(LUIdx <= LastLUIdx);
316
317 // Update RegUses. The data structure is not optimized for this purpose;
318 // we must iterate through it and update each of the bit vectors.
319 for (auto &Pair : RegUsesMap) {
320 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
321 if (LUIdx < UsedByIndices.size())
322 UsedByIndices[LUIdx] =
323 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
324 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
325 }
326}
327
328bool
329RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
330 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
331 if (I == RegUsesMap.end())
332 return false;
333 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
334 int i = UsedByIndices.find_first();
335 if (i == -1) return false;
336 if ((size_t)i != LUIdx) return true;
337 return UsedByIndices.find_next(i) != -1;
338}
339
340const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
341 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
342 assert(I != RegUsesMap.end() && "Unknown register!");
343 return I->second.UsedByIndices;
344}
345
346void RegUseTracker::clear() {
347 RegUsesMap.clear();
348 RegSequence.clear();
349}
350
351namespace {
352
353/// This class holds information that describes a formula for computing
354/// satisfying a use. It may include broken-out immediates and scaled registers.
355struct Formula {
356 /// Global base address used for complex addressing.
357 GlobalValue *BaseGV = nullptr;
358
359 /// Base offset for complex addressing.
360 int64_t BaseOffset = 0;
361
362 /// Whether any complex addressing has a base register.
363 bool HasBaseReg = false;
364
365 /// The scale of any complex addressing.
366 int64_t Scale = 0;
367
368 /// The list of "base" registers for this use. When this is non-empty. The
369 /// canonical representation of a formula is
370 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
371 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
372 /// 3. The reg containing recurrent expr related with currect loop in the
373 /// formula should be put in the ScaledReg.
374 /// #1 enforces that the scaled register is always used when at least two
375 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
376 /// #2 enforces that 1 * reg is reg.
377 /// #3 ensures invariant regs with respect to current loop can be combined
378 /// together in LSR codegen.
379 /// This invariant can be temporarily broken while building a formula.
380 /// However, every formula inserted into the LSRInstance must be in canonical
381 /// form.
383
384 /// The 'scaled' register for this use. This should be non-null when Scale is
385 /// not zero.
386 const SCEV *ScaledReg = nullptr;
387
388 /// An additional constant offset which added near the use. This requires a
389 /// temporary register, but the offset itself can live in an add immediate
390 /// field rather than a register.
391 int64_t UnfoldedOffset = 0;
392
393 Formula() = default;
394
395 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
396
397 bool isCanonical(const Loop &L) const;
398
399 void canonicalize(const Loop &L);
400
401 bool unscale();
402
403 bool hasZeroEnd() const;
404
405 size_t getNumRegs() const;
406 Type *getType() const;
407
408 void deleteBaseReg(const SCEV *&S);
409
410 bool referencesReg(const SCEV *S) const;
411 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
412 const RegUseTracker &RegUses) const;
413
414 void print(raw_ostream &OS) const;
415 void dump() const;
416};
417
418} // end anonymous namespace
419
420/// Recursion helper for initialMatch.
421static void DoInitialMatch(const SCEV *S, Loop *L,
424 ScalarEvolution &SE) {
425 // Collect expressions which properly dominate the loop header.
426 if (SE.properlyDominates(S, L->getHeader())) {
427 Good.push_back(S);
428 return;
429 }
430
431 // Look at add operands.
432 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
433 for (const SCEV *S : Add->operands())
434 DoInitialMatch(S, L, Good, Bad, SE);
435 return;
436 }
437
438 // Look at addrec operands.
439 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
440 if (!AR->getStart()->isZero() && AR->isAffine()) {
441 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
442 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
443 AR->getStepRecurrence(SE),
444 // FIXME: AR->getNoWrapFlags()
445 AR->getLoop(), SCEV::FlagAnyWrap),
446 L, Good, Bad, SE);
447 return;
448 }
449
450 // Handle a multiplication by -1 (negation) if it didn't fold.
451 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
452 if (Mul->getOperand(0)->isAllOnesValue()) {
454 const SCEV *NewMul = SE.getMulExpr(Ops);
455
458 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
459 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
460 SE.getEffectiveSCEVType(NewMul->getType())));
461 for (const SCEV *S : MyGood)
462 Good.push_back(SE.getMulExpr(NegOne, S));
463 for (const SCEV *S : MyBad)
464 Bad.push_back(SE.getMulExpr(NegOne, S));
465 return;
466 }
467
468 // Ok, we can't do anything interesting. Just stuff the whole thing into a
469 // register and hope for the best.
470 Bad.push_back(S);
471}
472
473/// Incorporate loop-variant parts of S into this Formula, attempting to keep
474/// all loop-invariant and loop-computable values in a single base register.
475void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
478 DoInitialMatch(S, L, Good, Bad, SE);
479 if (!Good.empty()) {
480 const SCEV *Sum = SE.getAddExpr(Good);
481 if (!Sum->isZero())
482 BaseRegs.push_back(Sum);
483 HasBaseReg = true;
484 }
485 if (!Bad.empty()) {
486 const SCEV *Sum = SE.getAddExpr(Bad);
487 if (!Sum->isZero())
488 BaseRegs.push_back(Sum);
489 HasBaseReg = true;
490 }
491 canonicalize(*L);
492}
493
494static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
495 return SCEVExprContains(S, [&L](const SCEV *S) {
496 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
497 });
498}
499
500/// Check whether or not this formula satisfies the canonical
501/// representation.
502/// \see Formula::BaseRegs.
503bool Formula::isCanonical(const Loop &L) const {
504 if (!ScaledReg)
505 return BaseRegs.size() <= 1;
506
507 if (Scale != 1)
508 return true;
509
510 if (Scale == 1 && BaseRegs.empty())
511 return false;
512
513 if (containsAddRecDependentOnLoop(ScaledReg, L))
514 return true;
515
516 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
517 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
518 // loop, we want to swap the reg in BaseRegs with ScaledReg.
519 return none_of(BaseRegs, [&L](const SCEV *S) {
521 });
522}
523
524/// Helper method to morph a formula into its canonical representation.
525/// \see Formula::BaseRegs.
526/// Every formula having more than one base register, must use the ScaledReg
527/// field. Otherwise, we would have to do special cases everywhere in LSR
528/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
529/// On the other hand, 1*reg should be canonicalized into reg.
530void Formula::canonicalize(const Loop &L) {
531 if (isCanonical(L))
532 return;
533
534 if (BaseRegs.empty()) {
535 // No base reg? Use scale reg with scale = 1 as such.
536 assert(ScaledReg && "Expected 1*reg => reg");
537 assert(Scale == 1 && "Expected 1*reg => reg");
538 BaseRegs.push_back(ScaledReg);
539 Scale = 0;
540 ScaledReg = nullptr;
541 return;
542 }
543
544 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
545 if (!ScaledReg) {
546 ScaledReg = BaseRegs.pop_back_val();
547 Scale = 1;
548 }
549
550 // If ScaledReg is an invariant with respect to L, find the reg from
551 // BaseRegs containing the recurrent expr related with Loop L. Swap the
552 // reg with ScaledReg.
553 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
554 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
556 });
557 if (I != BaseRegs.end())
558 std::swap(ScaledReg, *I);
559 }
560 assert(isCanonical(L) && "Failed to canonicalize?");
561}
562
563/// Get rid of the scale in the formula.
564/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
565/// \return true if it was possible to get rid of the scale, false otherwise.
566/// \note After this operation the formula may not be in the canonical form.
567bool Formula::unscale() {
568 if (Scale != 1)
569 return false;
570 Scale = 0;
571 BaseRegs.push_back(ScaledReg);
572 ScaledReg = nullptr;
573 return true;
574}
575
576bool Formula::hasZeroEnd() const {
577 if (UnfoldedOffset || BaseOffset)
578 return false;
579 if (BaseRegs.size() != 1 || ScaledReg)
580 return false;
581 return true;
582}
583
584/// Return the total number of register operands used by this formula. This does
585/// not include register uses implied by non-constant addrec strides.
586size_t Formula::getNumRegs() const {
587 return !!ScaledReg + BaseRegs.size();
588}
589
590/// Return the type of this formula, if it has one, or null otherwise. This type
591/// is meaningless except for the bit size.
592Type *Formula::getType() const {
593 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
594 ScaledReg ? ScaledReg->getType() :
595 BaseGV ? BaseGV->getType() :
596 nullptr;
597}
598
599/// Delete the given base reg from the BaseRegs list.
600void Formula::deleteBaseReg(const SCEV *&S) {
601 if (&S != &BaseRegs.back())
602 std::swap(S, BaseRegs.back());
603 BaseRegs.pop_back();
604}
605
606/// Test if this formula references the given register.
607bool Formula::referencesReg(const SCEV *S) const {
608 return S == ScaledReg || is_contained(BaseRegs, S);
609}
610
611/// Test whether this formula uses registers which are used by uses other than
612/// the use with the given index.
613bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
614 const RegUseTracker &RegUses) const {
615 if (ScaledReg)
616 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
617 return true;
618 for (const SCEV *BaseReg : BaseRegs)
619 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
620 return true;
621 return false;
622}
623
624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
625void Formula::print(raw_ostream &OS) const {
626 bool First = true;
627 if (BaseGV) {
628 if (!First) OS << " + "; else First = false;
629 BaseGV->printAsOperand(OS, /*PrintType=*/false);
630 }
631 if (BaseOffset != 0) {
632 if (!First) OS << " + "; else First = false;
633 OS << BaseOffset;
634 }
635 for (const SCEV *BaseReg : BaseRegs) {
636 if (!First) OS << " + "; else First = false;
637 OS << "reg(" << *BaseReg << ')';
638 }
639 if (HasBaseReg && BaseRegs.empty()) {
640 if (!First) OS << " + "; else First = false;
641 OS << "**error: HasBaseReg**";
642 } else if (!HasBaseReg && !BaseRegs.empty()) {
643 if (!First) OS << " + "; else First = false;
644 OS << "**error: !HasBaseReg**";
645 }
646 if (Scale != 0) {
647 if (!First) OS << " + "; else First = false;
648 OS << Scale << "*reg(";
649 if (ScaledReg)
650 OS << *ScaledReg;
651 else
652 OS << "<unknown>";
653 OS << ')';
654 }
655 if (UnfoldedOffset != 0) {
656 if (!First) OS << " + ";
657 OS << "imm(" << UnfoldedOffset << ')';
658 }
659}
660
661LLVM_DUMP_METHOD void Formula::dump() const {
662 print(errs()); errs() << '\n';
663}
664#endif
665
666/// Return true if the given addrec can be sign-extended without changing its
667/// value.
669 Type *WideTy =
671 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
672}
673
674/// Return true if the given add can be sign-extended without changing its
675/// value.
676static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
677 Type *WideTy =
678 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
679 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
680}
681
682/// Return true if the given mul can be sign-extended without changing its
683/// value.
684static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
685 Type *WideTy =
687 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
688 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
689}
690
691/// Return an expression for LHS /s RHS, if it can be determined and if the
692/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
693/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
694/// the multiplication may overflow, which is useful when the result will be
695/// used in a context where the most significant bits are ignored.
696static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
697 ScalarEvolution &SE,
698 bool IgnoreSignificantBits = false) {
699 // Handle the trivial case, which works for any SCEV type.
700 if (LHS == RHS)
701 return SE.getConstant(LHS->getType(), 1);
702
703 // Handle a few RHS special cases.
704 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
705 if (RC) {
706 const APInt &RA = RC->getAPInt();
707 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
708 // some folding.
709 if (RA.isAllOnes()) {
710 if (LHS->getType()->isPointerTy())
711 return nullptr;
712 return SE.getMulExpr(LHS, RC);
713 }
714 // Handle x /s 1 as x.
715 if (RA == 1)
716 return LHS;
717 }
718
719 // Check for a division of a constant by a constant.
720 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
721 if (!RC)
722 return nullptr;
723 const APInt &LA = C->getAPInt();
724 const APInt &RA = RC->getAPInt();
725 if (LA.srem(RA) != 0)
726 return nullptr;
727 return SE.getConstant(LA.sdiv(RA));
728 }
729
730 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
731 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
732 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
733 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
734 IgnoreSignificantBits);
735 if (!Step) return nullptr;
736 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
737 IgnoreSignificantBits);
738 if (!Start) return nullptr;
739 // FlagNW is independent of the start value, step direction, and is
740 // preserved with smaller magnitude steps.
741 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
742 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
743 }
744 return nullptr;
745 }
746
747 // Distribute the sdiv over add operands, if the add doesn't overflow.
748 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
749 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
751 for (const SCEV *S : Add->operands()) {
752 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
753 if (!Op) return nullptr;
754 Ops.push_back(Op);
755 }
756 return SE.getAddExpr(Ops);
757 }
758 return nullptr;
759 }
760
761 // Check for a multiply operand that we can pull RHS out of.
762 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
763 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
764 // Handle special case C1*X*Y /s C2*X*Y.
765 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
766 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
767 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
768 const SCEVConstant *RC =
769 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
770 if (LC && RC) {
772 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
773 if (LOps == ROps)
774 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
775 }
776 }
777 }
778
780 bool Found = false;
781 for (const SCEV *S : Mul->operands()) {
782 if (!Found)
783 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
784 IgnoreSignificantBits)) {
785 S = Q;
786 Found = true;
787 }
788 Ops.push_back(S);
789 }
790 return Found ? SE.getMulExpr(Ops) : nullptr;
791 }
792 return nullptr;
793 }
794
795 // Otherwise we don't know.
796 return nullptr;
797}
798
799/// If S involves the addition of a constant integer value, return that integer
800/// value, and mutate S to point to a new SCEV with that value excluded.
801static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
802 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
803 if (C->getAPInt().getSignificantBits() <= 64) {
804 S = SE.getConstant(C->getType(), 0);
805 return C->getValue()->getSExtValue();
806 }
807 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
808 SmallVector<const SCEV *, 8> NewOps(Add->operands());
809 int64_t Result = ExtractImmediate(NewOps.front(), SE);
810 if (Result != 0)
811 S = SE.getAddExpr(NewOps);
812 return Result;
813 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
814 SmallVector<const SCEV *, 8> NewOps(AR->operands());
815 int64_t Result = ExtractImmediate(NewOps.front(), SE);
816 if (Result != 0)
817 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
818 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
820 return Result;
821 }
822 return 0;
823}
824
825/// If S involves the addition of a GlobalValue address, return that symbol, and
826/// mutate S to point to a new SCEV with that value excluded.
828 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
829 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
830 S = SE.getConstant(GV->getType(), 0);
831 return GV;
832 }
833 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
834 SmallVector<const SCEV *, 8> NewOps(Add->operands());
835 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
836 if (Result)
837 S = SE.getAddExpr(NewOps);
838 return Result;
839 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
840 SmallVector<const SCEV *, 8> NewOps(AR->operands());
841 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
842 if (Result)
843 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
844 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
846 return Result;
847 }
848 return nullptr;
849}
850
851/// Returns true if the specified instruction is using the specified value as an
852/// address.
854 Instruction *Inst, Value *OperandVal) {
855 bool isAddress = isa<LoadInst>(Inst);
856 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
857 if (SI->getPointerOperand() == OperandVal)
858 isAddress = true;
859 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
860 // Addressing modes can also be folded into prefetches and a variety
861 // of intrinsics.
862 switch (II->getIntrinsicID()) {
863 case Intrinsic::memset:
864 case Intrinsic::prefetch:
865 case Intrinsic::masked_load:
866 if (II->getArgOperand(0) == OperandVal)
867 isAddress = true;
868 break;
869 case Intrinsic::masked_store:
870 if (II->getArgOperand(1) == OperandVal)
871 isAddress = true;
872 break;
873 case Intrinsic::memmove:
874 case Intrinsic::memcpy:
875 if (II->getArgOperand(0) == OperandVal ||
876 II->getArgOperand(1) == OperandVal)
877 isAddress = true;
878 break;
879 default: {
880 MemIntrinsicInfo IntrInfo;
881 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
882 if (IntrInfo.PtrVal == OperandVal)
883 isAddress = true;
884 }
885 }
886 }
887 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
888 if (RMW->getPointerOperand() == OperandVal)
889 isAddress = true;
890 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
891 if (CmpX->getPointerOperand() == OperandVal)
892 isAddress = true;
893 }
894 return isAddress;
895}
896
897/// Return the type of the memory being accessed.
898static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
899 Instruction *Inst, Value *OperandVal) {
900 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
901
902 // First get the type of memory being accessed.
903 if (Type *Ty = Inst->getAccessType())
904 AccessTy.MemTy = Ty;
905
906 // Then get the pointer address space.
907 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
908 AccessTy.AddrSpace = SI->getPointerAddressSpace();
909 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
910 AccessTy.AddrSpace = LI->getPointerAddressSpace();
911 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
912 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
913 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
914 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
915 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
916 switch (II->getIntrinsicID()) {
917 case Intrinsic::prefetch:
918 case Intrinsic::memset:
919 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
920 AccessTy.MemTy = OperandVal->getType();
921 break;
922 case Intrinsic::memmove:
923 case Intrinsic::memcpy:
924 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
925 AccessTy.MemTy = OperandVal->getType();
926 break;
927 case Intrinsic::masked_load:
928 AccessTy.AddrSpace =
929 II->getArgOperand(0)->getType()->getPointerAddressSpace();
930 break;
931 case Intrinsic::masked_store:
932 AccessTy.AddrSpace =
933 II->getArgOperand(1)->getType()->getPointerAddressSpace();
934 break;
935 default: {
936 MemIntrinsicInfo IntrInfo;
937 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
938 AccessTy.AddrSpace
939 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
940 }
941
942 break;
943 }
944 }
945 }
946
947 return AccessTy;
948}
949
950/// Return true if this AddRec is already a phi in its loop.
951static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
952 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
953 if (SE.isSCEVable(PN.getType()) &&
954 (SE.getEffectiveSCEVType(PN.getType()) ==
955 SE.getEffectiveSCEVType(AR->getType())) &&
956 SE.getSCEV(&PN) == AR)
957 return true;
958 }
959 return false;
960}
961
962/// Check if expanding this expression is likely to incur significant cost. This
963/// is tricky because SCEV doesn't track which expressions are actually computed
964/// by the current IR.
965///
966/// We currently allow expansion of IV increments that involve adds,
967/// multiplication by constants, and AddRecs from existing phis.
968///
969/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
970/// obvious multiple of the UDivExpr.
971static bool isHighCostExpansion(const SCEV *S,
973 ScalarEvolution &SE) {
974 // Zero/One operand expressions
975 switch (S->getSCEVType()) {
976 case scUnknown:
977 case scConstant:
978 case scVScale:
979 return false;
980 case scTruncate:
981 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
982 Processed, SE);
983 case scZeroExtend:
984 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
985 Processed, SE);
986 case scSignExtend:
987 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
988 Processed, SE);
989 default:
990 break;
991 }
992
993 if (!Processed.insert(S).second)
994 return false;
995
996 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
997 for (const SCEV *S : Add->operands()) {
998 if (isHighCostExpansion(S, Processed, SE))
999 return true;
1000 }
1001 return false;
1002 }
1003
1004 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1005 if (Mul->getNumOperands() == 2) {
1006 // Multiplication by a constant is ok
1007 if (isa<SCEVConstant>(Mul->getOperand(0)))
1008 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1009
1010 // If we have the value of one operand, check if an existing
1011 // multiplication already generates this expression.
1012 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1013 Value *UVal = U->getValue();
1014 for (User *UR : UVal->users()) {
1015 // If U is a constant, it may be used by a ConstantExpr.
1016 Instruction *UI = dyn_cast<Instruction>(UR);
1017 if (UI && UI->getOpcode() == Instruction::Mul &&
1018 SE.isSCEVable(UI->getType())) {
1019 return SE.getSCEV(UI) == Mul;
1020 }
1021 }
1022 }
1023 }
1024 }
1025
1026 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1027 if (isExistingPhi(AR, SE))
1028 return false;
1029 }
1030
1031 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1032 return true;
1033}
1034
1035namespace {
1036
1037class LSRUse;
1038
1039} // end anonymous namespace
1040
1041/// Check if the addressing mode defined by \p F is completely
1042/// folded in \p LU at isel time.
1043/// This includes address-mode folding and special icmp tricks.
1044/// This function returns true if \p LU can accommodate what \p F
1045/// defines and up to 1 base + 1 scaled + offset.
1046/// In other words, if \p F has several base registers, this function may
1047/// still return true. Therefore, users still need to account for
1048/// additional base registers and/or unfolded offsets to derive an
1049/// accurate cost model.
1051 const LSRUse &LU, const Formula &F);
1052
1053// Get the cost of the scaling factor used in F for LU.
1055 const LSRUse &LU, const Formula &F,
1056 const Loop &L);
1057
1058namespace {
1059
1060/// This class is used to measure and compare candidate formulae.
1061class Cost {
1062 const Loop *L = nullptr;
1063 ScalarEvolution *SE = nullptr;
1064 const TargetTransformInfo *TTI = nullptr;
1067
1068public:
1069 Cost() = delete;
1070 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1072 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1073 C.Insns = 0;
1074 C.NumRegs = 0;
1075 C.AddRecCost = 0;
1076 C.NumIVMuls = 0;
1077 C.NumBaseAdds = 0;
1078 C.ImmCost = 0;
1079 C.SetupCost = 0;
1080 C.ScaleCost = 0;
1081 }
1082
1083 bool isLess(const Cost &Other) const;
1084
1085 void Lose();
1086
1087#ifndef NDEBUG
1088 // Once any of the metrics loses, they must all remain losers.
1089 bool isValid() {
1090 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1091 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1092 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1093 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1094 }
1095#endif
1096
1097 bool isLoser() {
1098 assert(isValid() && "invalid cost");
1099 return C.NumRegs == ~0u;
1100 }
1101
1102 void RateFormula(const Formula &F,
1104 const DenseSet<const SCEV *> &VisitedRegs,
1105 const LSRUse &LU,
1106 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1107
1108 void print(raw_ostream &OS) const;
1109 void dump() const;
1110
1111private:
1112 void RateRegister(const Formula &F, const SCEV *Reg,
1114 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1117};
1118
1119/// An operand value in an instruction which is to be replaced with some
1120/// equivalent, possibly strength-reduced, replacement.
1121struct LSRFixup {
1122 /// The instruction which will be updated.
1123 Instruction *UserInst = nullptr;
1124
1125 /// The operand of the instruction which will be replaced. The operand may be
1126 /// used more than once; every instance will be replaced.
1127 Value *OperandValToReplace = nullptr;
1128
1129 /// If this user is to use the post-incremented value of an induction
1130 /// variable, this set is non-empty and holds the loops associated with the
1131 /// induction variable.
1132 PostIncLoopSet PostIncLoops;
1133
1134 /// A constant offset to be added to the LSRUse expression. This allows
1135 /// multiple fixups to share the same LSRUse with different offsets, for
1136 /// example in an unrolled loop.
1137 int64_t Offset = 0;
1138
1139 LSRFixup() = default;
1140
1141 bool isUseFullyOutsideLoop(const Loop *L) const;
1142
1143 void print(raw_ostream &OS) const;
1144 void dump() const;
1145};
1146
1147/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1148/// SmallVectors of const SCEV*.
1149struct UniquifierDenseMapInfo {
1150 static SmallVector<const SCEV *, 4> getEmptyKey() {
1152 V.push_back(reinterpret_cast<const SCEV *>(-1));
1153 return V;
1154 }
1155
1156 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1158 V.push_back(reinterpret_cast<const SCEV *>(-2));
1159 return V;
1160 }
1161
1162 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1163 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1164 }
1165
1166 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1168 return LHS == RHS;
1169 }
1170};
1171
1172/// This class holds the state that LSR keeps for each use in IVUsers, as well
1173/// as uses invented by LSR itself. It includes information about what kinds of
1174/// things can be folded into the user, information about the user itself, and
1175/// information about how the use may be satisfied. TODO: Represent multiple
1176/// users of the same expression in common?
1177class LSRUse {
1178 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1179
1180public:
1181 /// An enum for a kind of use, indicating what types of scaled and immediate
1182 /// operands it might support.
1183 enum KindType {
1184 Basic, ///< A normal use, with no folding.
1185 Special, ///< A special case of basic, allowing -1 scales.
1186 Address, ///< An address use; folding according to TargetLowering
1187 ICmpZero ///< An equality icmp with both operands folded into one.
1188 // TODO: Add a generic icmp too?
1189 };
1190
1191 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1192
1193 KindType Kind;
1194 MemAccessTy AccessTy;
1195
1196 /// The list of operands which are to be replaced.
1198
1199 /// Keep track of the min and max offsets of the fixups.
1200 int64_t MinOffset = std::numeric_limits<int64_t>::max();
1201 int64_t MaxOffset = std::numeric_limits<int64_t>::min();
1202
1203 /// This records whether all of the fixups using this LSRUse are outside of
1204 /// the loop, in which case some special-case heuristics may be used.
1205 bool AllFixupsOutsideLoop = true;
1206
1207 /// RigidFormula is set to true to guarantee that this use will be associated
1208 /// with a single formula--the one that initially matched. Some SCEV
1209 /// expressions cannot be expanded. This allows LSR to consider the registers
1210 /// used by those expressions without the need to expand them later after
1211 /// changing the formula.
1212 bool RigidFormula = false;
1213
1214 /// This records the widest use type for any fixup using this
1215 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1216 /// fixup widths to be equivalent, because the narrower one may be relying on
1217 /// the implicit truncation to truncate away bogus bits.
1218 Type *WidestFixupType = nullptr;
1219
1220 /// A list of ways to build a value that can satisfy this user. After the
1221 /// list is populated, one of these is selected heuristically and used to
1222 /// formulate a replacement for OperandValToReplace in UserInst.
1223 SmallVector<Formula, 12> Formulae;
1224
1225 /// The set of register candidates used by all formulae in this LSRUse.
1227
1228 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1229
1230 LSRFixup &getNewFixup() {
1231 Fixups.push_back(LSRFixup());
1232 return Fixups.back();
1233 }
1234
1235 void pushFixup(LSRFixup &f) {
1236 Fixups.push_back(f);
1237 if (f.Offset > MaxOffset)
1238 MaxOffset = f.Offset;
1239 if (f.Offset < MinOffset)
1240 MinOffset = f.Offset;
1241 }
1242
1243 bool HasFormulaWithSameRegs(const Formula &F) const;
1244 float getNotSelectedProbability(const SCEV *Reg) const;
1245 bool InsertFormula(const Formula &F, const Loop &L);
1246 void DeleteFormula(Formula &F);
1247 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1248
1249 void print(raw_ostream &OS) const;
1250 void dump() const;
1251};
1252
1253} // end anonymous namespace
1254
1256 LSRUse::KindType Kind, MemAccessTy AccessTy,
1257 GlobalValue *BaseGV, int64_t BaseOffset,
1258 bool HasBaseReg, int64_t Scale,
1259 Instruction *Fixup = nullptr,
1260 int64_t ScalableOffset = 0);
1261
1262static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1263 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1264 return 1;
1265 if (Depth == 0)
1266 return 0;
1267 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1268 return getSetupCost(S->getStart(), Depth - 1);
1269 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1270 return getSetupCost(S->getOperand(), Depth - 1);
1271 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1272 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1273 [&](unsigned i, const SCEV *Reg) {
1274 return i + getSetupCost(Reg, Depth - 1);
1275 });
1276 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1277 return getSetupCost(S->getLHS(), Depth - 1) +
1278 getSetupCost(S->getRHS(), Depth - 1);
1279 return 0;
1280}
1281
1282/// Tally up interesting quantities from the given register.
1283void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1285 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1286 // If this is an addrec for another loop, it should be an invariant
1287 // with respect to L since L is the innermost loop (at least
1288 // for now LSR only handles innermost loops).
1289 if (AR->getLoop() != L) {
1290 // If the AddRec exists, consider it's register free and leave it alone.
1291 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1292 return;
1293
1294 // It is bad to allow LSR for current loop to add induction variables
1295 // for its sibling loops.
1296 if (!AR->getLoop()->contains(L)) {
1297 Lose();
1298 return;
1299 }
1300
1301 // Otherwise, it will be an invariant with respect to Loop L.
1302 ++C.NumRegs;
1303 return;
1304 }
1305
1306 unsigned LoopCost = 1;
1307 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1308 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1309
1310 // If the step size matches the base offset, we could use pre-indexed
1311 // addressing.
1312 if (AMK == TTI::AMK_PreIndexed) {
1313 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1314 if (Step->getAPInt() == F.BaseOffset)
1315 LoopCost = 0;
1316 } else if (AMK == TTI::AMK_PostIndexed) {
1317 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1318 if (isa<SCEVConstant>(LoopStep)) {
1319 const SCEV *LoopStart = AR->getStart();
1320 if (!isa<SCEVConstant>(LoopStart) &&
1321 SE->isLoopInvariant(LoopStart, L))
1322 LoopCost = 0;
1323 }
1324 }
1325 }
1326 C.AddRecCost += LoopCost;
1327
1328 // Add the step value register, if it needs one.
1329 // TODO: The non-affine case isn't precisely modeled here.
1330 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1331 if (!Regs.count(AR->getOperand(1))) {
1332 RateRegister(F, AR->getOperand(1), Regs);
1333 if (isLoser())
1334 return;
1335 }
1336 }
1337 }
1338 ++C.NumRegs;
1339
1340 // Rough heuristic; favor registers which don't require extra setup
1341 // instructions in the preheader.
1342 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1343 // Ensure we don't, even with the recusion limit, produce invalid costs.
1344 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1345
1346 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1347 SE->hasComputableLoopEvolution(Reg, L);
1348}
1349
1350/// Record this register in the set. If we haven't seen it before, rate
1351/// it. Optional LoserRegs provides a way to declare any formula that refers to
1352/// one of those regs an instant loser.
1353void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1355 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1356 if (LoserRegs && LoserRegs->count(Reg)) {
1357 Lose();
1358 return;
1359 }
1360 if (Regs.insert(Reg).second) {
1361 RateRegister(F, Reg, Regs);
1362 if (LoserRegs && isLoser())
1363 LoserRegs->insert(Reg);
1364 }
1365}
1366
1367void Cost::RateFormula(const Formula &F,
1369 const DenseSet<const SCEV *> &VisitedRegs,
1370 const LSRUse &LU,
1371 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1372 if (isLoser())
1373 return;
1374 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1375 // Tally up the registers.
1376 unsigned PrevAddRecCost = C.AddRecCost;
1377 unsigned PrevNumRegs = C.NumRegs;
1378 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1379 if (const SCEV *ScaledReg = F.ScaledReg) {
1380 if (VisitedRegs.count(ScaledReg)) {
1381 Lose();
1382 return;
1383 }
1384 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1385 if (isLoser())
1386 return;
1387 }
1388 for (const SCEV *BaseReg : F.BaseRegs) {
1389 if (VisitedRegs.count(BaseReg)) {
1390 Lose();
1391 return;
1392 }
1393 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1394 if (isLoser())
1395 return;
1396 }
1397
1398 // Determine how many (unfolded) adds we'll need inside the loop.
1399 size_t NumBaseParts = F.getNumRegs();
1400 if (NumBaseParts > 1)
1401 // Do not count the base and a possible second register if the target
1402 // allows to fold 2 registers.
1403 C.NumBaseAdds +=
1404 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1405 C.NumBaseAdds += (F.UnfoldedOffset != 0);
1406
1407 // Accumulate non-free scaling amounts.
1408 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1409
1410 // Tally up the non-zero immediates.
1411 for (const LSRFixup &Fixup : LU.Fixups) {
1412 int64_t O = Fixup.Offset;
1413 int64_t Offset = (uint64_t)O + F.BaseOffset;
1414 if (F.BaseGV)
1415 C.ImmCost += 64; // Handle symbolic values conservatively.
1416 // TODO: This should probably be the pointer size.
1417 else if (Offset != 0)
1418 C.ImmCost += APInt(64, Offset, true).getSignificantBits();
1419
1420 // Check with target if this offset with this instruction is
1421 // specifically not supported.
1422 if (LU.Kind == LSRUse::Address && Offset != 0 &&
1423 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1424 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1425 C.NumBaseAdds++;
1426 }
1427
1428 // If we don't count instruction cost exit here.
1429 if (!InsnsCost) {
1430 assert(isValid() && "invalid cost");
1431 return;
1432 }
1433
1434 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1435 // additional instruction (at least fill).
1436 // TODO: Need distinguish register class?
1437 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1438 TTI->getRegisterClassForType(false, F.getType())) - 1;
1439 if (C.NumRegs > TTIRegNum) {
1440 // Cost already exceeded TTIRegNum, then only newly added register can add
1441 // new instructions.
1442 if (PrevNumRegs > TTIRegNum)
1443 C.Insns += (C.NumRegs - PrevNumRegs);
1444 else
1445 C.Insns += (C.NumRegs - TTIRegNum);
1446 }
1447
1448 // If ICmpZero formula ends with not 0, it could not be replaced by
1449 // just add or sub. We'll need to compare final result of AddRec.
1450 // That means we'll need an additional instruction. But if the target can
1451 // macro-fuse a compare with a branch, don't count this extra instruction.
1452 // For -10 + {0, +, 1}:
1453 // i = i + 1;
1454 // cmp i, 10
1455 //
1456 // For {-10, +, 1}:
1457 // i = i + 1;
1458 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1459 !TTI->canMacroFuseCmp())
1460 C.Insns++;
1461 // Each new AddRec adds 1 instruction to calculation.
1462 C.Insns += (C.AddRecCost - PrevAddRecCost);
1463
1464 // BaseAdds adds instructions for unfolded registers.
1465 if (LU.Kind != LSRUse::ICmpZero)
1466 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1467 assert(isValid() && "invalid cost");
1468}
1469
1470/// Set this cost to a losing value.
1471void Cost::Lose() {
1472 C.Insns = std::numeric_limits<unsigned>::max();
1473 C.NumRegs = std::numeric_limits<unsigned>::max();
1474 C.AddRecCost = std::numeric_limits<unsigned>::max();
1475 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1476 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1477 C.ImmCost = std::numeric_limits<unsigned>::max();
1478 C.SetupCost = std::numeric_limits<unsigned>::max();
1479 C.ScaleCost = std::numeric_limits<unsigned>::max();
1480}
1481
1482/// Choose the lower cost.
1483bool Cost::isLess(const Cost &Other) const {
1484 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1485 C.Insns != Other.C.Insns)
1486 return C.Insns < Other.C.Insns;
1487 return TTI->isLSRCostLess(C, Other.C);
1488}
1489
1490#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1491void Cost::print(raw_ostream &OS) const {
1492 if (InsnsCost)
1493 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1494 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1495 if (C.AddRecCost != 0)
1496 OS << ", with addrec cost " << C.AddRecCost;
1497 if (C.NumIVMuls != 0)
1498 OS << ", plus " << C.NumIVMuls << " IV mul"
1499 << (C.NumIVMuls == 1 ? "" : "s");
1500 if (C.NumBaseAdds != 0)
1501 OS << ", plus " << C.NumBaseAdds << " base add"
1502 << (C.NumBaseAdds == 1 ? "" : "s");
1503 if (C.ScaleCost != 0)
1504 OS << ", plus " << C.ScaleCost << " scale cost";
1505 if (C.ImmCost != 0)
1506 OS << ", plus " << C.ImmCost << " imm cost";
1507 if (C.SetupCost != 0)
1508 OS << ", plus " << C.SetupCost << " setup cost";
1509}
1510
1511LLVM_DUMP_METHOD void Cost::dump() const {
1512 print(errs()); errs() << '\n';
1513}
1514#endif
1515
1516/// Test whether this fixup always uses its value outside of the given loop.
1517bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1518 // PHI nodes use their value in their incoming blocks.
1519 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1520 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1521 if (PN->getIncomingValue(i) == OperandValToReplace &&
1522 L->contains(PN->getIncomingBlock(i)))
1523 return false;
1524 return true;
1525 }
1526
1527 return !L->contains(UserInst);
1528}
1529
1530#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1531void LSRFixup::print(raw_ostream &OS) const {
1532 OS << "UserInst=";
1533 // Store is common and interesting enough to be worth special-casing.
1534 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1535 OS << "store ";
1536 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1537 } else if (UserInst->getType()->isVoidTy())
1538 OS << UserInst->getOpcodeName();
1539 else
1540 UserInst->printAsOperand(OS, /*PrintType=*/false);
1541
1542 OS << ", OperandValToReplace=";
1543 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1544
1545 for (const Loop *PIL : PostIncLoops) {
1546 OS << ", PostIncLoop=";
1547 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1548 }
1549
1550 if (Offset != 0)
1551 OS << ", Offset=" << Offset;
1552}
1553
1554LLVM_DUMP_METHOD void LSRFixup::dump() const {
1555 print(errs()); errs() << '\n';
1556}
1557#endif
1558
1559/// Test whether this use as a formula which has the same registers as the given
1560/// formula.
1561bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1563 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1564 // Unstable sort by host order ok, because this is only used for uniquifying.
1565 llvm::sort(Key);
1566 return Uniquifier.count(Key);
1567}
1568
1569/// The function returns a probability of selecting formula without Reg.
1570float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1571 unsigned FNum = 0;
1572 for (const Formula &F : Formulae)
1573 if (F.referencesReg(Reg))
1574 FNum++;
1575 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1576}
1577
1578/// If the given formula has not yet been inserted, add it to the list, and
1579/// return true. Return false otherwise. The formula must be in canonical form.
1580bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1581 assert(F.isCanonical(L) && "Invalid canonical representation");
1582
1583 if (!Formulae.empty() && RigidFormula)
1584 return false;
1585
1587 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1588 // Unstable sort by host order ok, because this is only used for uniquifying.
1589 llvm::sort(Key);
1590
1591 if (!Uniquifier.insert(Key).second)
1592 return false;
1593
1594 // Using a register to hold the value of 0 is not profitable.
1595 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1596 "Zero allocated in a scaled register!");
1597#ifndef NDEBUG
1598 for (const SCEV *BaseReg : F.BaseRegs)
1599 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1600#endif
1601
1602 // Add the formula to the list.
1603 Formulae.push_back(F);
1604
1605 // Record registers now being used by this use.
1606 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1607 if (F.ScaledReg)
1608 Regs.insert(F.ScaledReg);
1609
1610 return true;
1611}
1612
1613/// Remove the given formula from this use's list.
1614void LSRUse::DeleteFormula(Formula &F) {
1615 if (&F != &Formulae.back())
1616 std::swap(F, Formulae.back());
1617 Formulae.pop_back();
1618}
1619
1620/// Recompute the Regs field, and update RegUses.
1621void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1622 // Now that we've filtered out some formulae, recompute the Regs set.
1623 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1624 Regs.clear();
1625 for (const Formula &F : Formulae) {
1626 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1627 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1628 }
1629
1630 // Update the RegTracker.
1631 for (const SCEV *S : OldRegs)
1632 if (!Regs.count(S))
1633 RegUses.dropRegister(S, LUIdx);
1634}
1635
1636#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1637void LSRUse::print(raw_ostream &OS) const {
1638 OS << "LSR Use: Kind=";
1639 switch (Kind) {
1640 case Basic: OS << "Basic"; break;
1641 case Special: OS << "Special"; break;
1642 case ICmpZero: OS << "ICmpZero"; break;
1643 case Address:
1644 OS << "Address of ";
1645 if (AccessTy.MemTy->isPointerTy())
1646 OS << "pointer"; // the full pointer type could be really verbose
1647 else {
1648 OS << *AccessTy.MemTy;
1649 }
1650
1651 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1652 }
1653
1654 OS << ", Offsets={";
1655 bool NeedComma = false;
1656 for (const LSRFixup &Fixup : Fixups) {
1657 if (NeedComma) OS << ',';
1658 OS << Fixup.Offset;
1659 NeedComma = true;
1660 }
1661 OS << '}';
1662
1663 if (AllFixupsOutsideLoop)
1664 OS << ", all-fixups-outside-loop";
1665
1666 if (WidestFixupType)
1667 OS << ", widest fixup type: " << *WidestFixupType;
1668}
1669
1670LLVM_DUMP_METHOD void LSRUse::dump() const {
1671 print(errs()); errs() << '\n';
1672}
1673#endif
1674
1676 LSRUse::KindType Kind, MemAccessTy AccessTy,
1677 GlobalValue *BaseGV, int64_t BaseOffset,
1678 bool HasBaseReg, int64_t Scale,
1679 Instruction *Fixup /* = nullptr */,
1680 int64_t ScalableOffset) {
1681 switch (Kind) {
1682 case LSRUse::Address:
1683 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
1684 HasBaseReg, Scale, AccessTy.AddrSpace,
1685 Fixup, ScalableOffset);
1686
1687 case LSRUse::ICmpZero:
1688 // There's not even a target hook for querying whether it would be legal to
1689 // fold a GV into an ICmp.
1690 if (BaseGV || ScalableOffset != 0)
1691 return false;
1692
1693 // ICmp only has two operands; don't allow more than two non-trivial parts.
1694 if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1695 return false;
1696
1697 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1698 // putting the scaled register in the other operand of the icmp.
1699 if (Scale != 0 && Scale != -1)
1700 return false;
1701
1702 // If we have low-level target information, ask the target if it can fold an
1703 // integer immediate on an icmp.
1704 if (BaseOffset != 0) {
1705 // We have one of:
1706 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1707 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1708 // Offs is the ICmp immediate.
1709 if (Scale == 0)
1710 // The cast does the right thing with
1711 // std::numeric_limits<int64_t>::min().
1712 BaseOffset = -(uint64_t)BaseOffset;
1713 return TTI.isLegalICmpImmediate(BaseOffset);
1714 }
1715
1716 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1717 return true;
1718
1719 case LSRUse::Basic:
1720 // Only handle single-register values.
1721 return !BaseGV && Scale == 0 && BaseOffset == 0 && ScalableOffset == 0;
1722
1723 case LSRUse::Special:
1724 // Special case Basic to handle -1 scales.
1725 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0 &&
1726 ScalableOffset == 0;
1727 }
1728
1729 llvm_unreachable("Invalid LSRUse Kind!");
1730}
1731
1733 int64_t MinOffset, int64_t MaxOffset,
1734 LSRUse::KindType Kind, MemAccessTy AccessTy,
1735 GlobalValue *BaseGV, int64_t BaseOffset,
1736 bool HasBaseReg, int64_t Scale) {
1737 // Check for overflow.
1738 if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1739 (MinOffset > 0))
1740 return false;
1741 MinOffset = (uint64_t)BaseOffset + MinOffset;
1742 if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1743 (MaxOffset > 0))
1744 return false;
1745 MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1746
1747 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1748 HasBaseReg, Scale) &&
1749 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1750 HasBaseReg, Scale);
1751}
1752
1754 int64_t MinOffset, int64_t MaxOffset,
1755 LSRUse::KindType Kind, MemAccessTy AccessTy,
1756 const Formula &F, const Loop &L) {
1757 // For the purpose of isAMCompletelyFolded either having a canonical formula
1758 // or a scale not equal to zero is correct.
1759 // Problems may arise from non canonical formulae having a scale == 0.
1760 // Strictly speaking it would best to just rely on canonical formulae.
1761 // However, when we generate the scaled formulae, we first check that the
1762 // scaling factor is profitable before computing the actual ScaledReg for
1763 // compile time sake.
1764 assert((F.isCanonical(L) || F.Scale != 0));
1765 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1766 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1767}
1768
1769/// Test whether we know how to expand the current formula.
1770static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1771 int64_t MaxOffset, LSRUse::KindType Kind,
1772 MemAccessTy AccessTy, GlobalValue *BaseGV,
1773 int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1774 // We know how to expand completely foldable formulae.
1775 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1776 BaseOffset, HasBaseReg, Scale) ||
1777 // Or formulae that use a base register produced by a sum of base
1778 // registers.
1779 (Scale == 1 &&
1780 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1781 BaseGV, BaseOffset, true, 0));
1782}
1783
1784static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1785 int64_t MaxOffset, LSRUse::KindType Kind,
1786 MemAccessTy AccessTy, const Formula &F) {
1787 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1788 F.BaseOffset, F.HasBaseReg, F.Scale);
1789}
1790
1792 const LSRUse &LU, const Formula &F) {
1793 // Target may want to look at the user instructions.
1794 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1795 for (const LSRFixup &Fixup : LU.Fixups)
1796 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1797 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1798 F.Scale, Fixup.UserInst))
1799 return false;
1800 return true;
1801 }
1802
1803 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1804 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1805 F.Scale);
1806}
1807
1809 const LSRUse &LU, const Formula &F,
1810 const Loop &L) {
1811 if (!F.Scale)
1812 return 0;
1813
1814 // If the use is not completely folded in that instruction, we will have to
1815 // pay an extra cost only for scale != 1.
1816 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1817 LU.AccessTy, F, L))
1818 return F.Scale != 1;
1819
1820 switch (LU.Kind) {
1821 case LSRUse::Address: {
1822 // Check the scaling factor cost with both the min and max offsets.
1823 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1824 LU.AccessTy.MemTy, F.BaseGV,
1825 StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg,
1826 F.Scale, LU.AccessTy.AddrSpace);
1827 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1828 LU.AccessTy.MemTy, F.BaseGV,
1829 StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg,
1830 F.Scale, LU.AccessTy.AddrSpace);
1831
1832 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1833 "Legal addressing mode has an illegal cost!");
1834 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1835 }
1836 case LSRUse::ICmpZero:
1837 case LSRUse::Basic:
1838 case LSRUse::Special:
1839 // The use is completely folded, i.e., everything is folded into the
1840 // instruction.
1841 return 0;
1842 }
1843
1844 llvm_unreachable("Invalid LSRUse Kind!");
1845}
1846
1848 LSRUse::KindType Kind, MemAccessTy AccessTy,
1849 GlobalValue *BaseGV, int64_t BaseOffset,
1850 bool HasBaseReg, int64_t ScalableOffset = 0) {
1851 // Fast-path: zero is always foldable.
1852 if (BaseOffset == 0 && !BaseGV) return true;
1853
1854 // Conservatively, create an address with an immediate and a
1855 // base and a scale.
1856 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1857
1858 // Canonicalize a scale of 1 to a base register if the formula doesn't
1859 // already have a base register.
1860 if (!HasBaseReg && Scale == 1) {
1861 Scale = 0;
1862 HasBaseReg = true;
1863 }
1864
1865 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
1866 HasBaseReg, Scale, nullptr, ScalableOffset);
1867}
1868
1870 ScalarEvolution &SE, int64_t MinOffset,
1871 int64_t MaxOffset, LSRUse::KindType Kind,
1872 MemAccessTy AccessTy, const SCEV *S,
1873 bool HasBaseReg) {
1874 // Fast-path: zero is always foldable.
1875 if (S->isZero()) return true;
1876
1877 // Conservatively, create an address with an immediate and a
1878 // base and a scale.
1879 int64_t BaseOffset = ExtractImmediate(S, SE);
1880 GlobalValue *BaseGV = ExtractSymbol(S, SE);
1881
1882 // If there's anything else involved, it's not foldable.
1883 if (!S->isZero()) return false;
1884
1885 // Fast-path: zero is always foldable.
1886 if (BaseOffset == 0 && !BaseGV) return true;
1887
1888 // Conservatively, create an address with an immediate and a
1889 // base and a scale.
1890 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1891
1892 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1893 BaseOffset, HasBaseReg, Scale);
1894}
1895
1896namespace {
1897
1898/// An individual increment in a Chain of IV increments. Relate an IV user to
1899/// an expression that computes the IV it uses from the IV used by the previous
1900/// link in the Chain.
1901///
1902/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1903/// original IVOperand. The head of the chain's IVOperand is only valid during
1904/// chain collection, before LSR replaces IV users. During chain generation,
1905/// IncExpr can be used to find the new IVOperand that computes the same
1906/// expression.
1907struct IVInc {
1908 Instruction *UserInst;
1909 Value* IVOperand;
1910 const SCEV *IncExpr;
1911
1912 IVInc(Instruction *U, Value *O, const SCEV *E)
1913 : UserInst(U), IVOperand(O), IncExpr(E) {}
1914};
1915
1916// The list of IV increments in program order. We typically add the head of a
1917// chain without finding subsequent links.
1918struct IVChain {
1920 const SCEV *ExprBase = nullptr;
1921
1922 IVChain() = default;
1923 IVChain(const IVInc &Head, const SCEV *Base)
1924 : Incs(1, Head), ExprBase(Base) {}
1925
1927
1928 // Return the first increment in the chain.
1929 const_iterator begin() const {
1930 assert(!Incs.empty());
1931 return std::next(Incs.begin());
1932 }
1933 const_iterator end() const {
1934 return Incs.end();
1935 }
1936
1937 // Returns true if this chain contains any increments.
1938 bool hasIncs() const { return Incs.size() >= 2; }
1939
1940 // Add an IVInc to the end of this chain.
1941 void add(const IVInc &X) { Incs.push_back(X); }
1942
1943 // Returns the last UserInst in the chain.
1944 Instruction *tailUserInst() const { return Incs.back().UserInst; }
1945
1946 // Returns true if IncExpr can be profitably added to this chain.
1947 bool isProfitableIncrement(const SCEV *OperExpr,
1948 const SCEV *IncExpr,
1950};
1951
1952/// Helper for CollectChains to track multiple IV increment uses. Distinguish
1953/// between FarUsers that definitely cross IV increments and NearUsers that may
1954/// be used between IV increments.
1955struct ChainUsers {
1958};
1959
1960/// This class holds state for the main loop strength reduction logic.
1961class LSRInstance {
1962 IVUsers &IU;
1963 ScalarEvolution &SE;
1964 DominatorTree &DT;
1965 LoopInfo &LI;
1966 AssumptionCache &AC;
1967 TargetLibraryInfo &TLI;
1968 const TargetTransformInfo &TTI;
1969 Loop *const L;
1970 MemorySSAUpdater *MSSAU;
1972 mutable SCEVExpander Rewriter;
1973 bool Changed = false;
1974
1975 /// This is the insert position that the current loop's induction variable
1976 /// increment should be placed. In simple loops, this is the latch block's
1977 /// terminator. But in more complicated cases, this is a position which will
1978 /// dominate all the in-loop post-increment users.
1979 Instruction *IVIncInsertPos = nullptr;
1980
1981 /// Interesting factors between use strides.
1982 ///
1983 /// We explicitly use a SetVector which contains a SmallSet, instead of the
1984 /// default, a SmallDenseSet, because we need to use the full range of
1985 /// int64_ts, and there's currently no good way of doing that with
1986 /// SmallDenseSet.
1988
1989 /// The cost of the current SCEV, the best solution by LSR will be dropped if
1990 /// the solution is not profitable.
1991 Cost BaselineCost;
1992
1993 /// Interesting use types, to facilitate truncation reuse.
1995
1996 /// The list of interesting uses.
1998
1999 /// Track which uses use which register candidates.
2000 RegUseTracker RegUses;
2001
2002 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2003 // have more than a few IV increment chains in a loop. Missing a Chain falls
2004 // back to normal LSR behavior for those uses.
2005 static const unsigned MaxChains = 8;
2006
2007 /// IV users can form a chain of IV increments.
2009
2010 /// IV users that belong to profitable IVChains.
2012
2013 /// Induction variables that were generated and inserted by the SCEV Expander.
2014 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2015
2016 void OptimizeShadowIV();
2017 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2018 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2019 void OptimizeLoopTermCond();
2020
2021 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2022 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2023 void FinalizeChain(IVChain &Chain);
2024 void CollectChains();
2025 void GenerateIVChain(const IVChain &Chain,
2027
2028 void CollectInterestingTypesAndFactors();
2029 void CollectFixupsAndInitialFormulae();
2030
2031 // Support for sharing of LSRUses between LSRFixups.
2033 UseMapTy UseMap;
2034
2035 bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
2036 LSRUse::KindType Kind, MemAccessTy AccessTy);
2037
2038 std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2039 MemAccessTy AccessTy);
2040
2041 void DeleteUse(LSRUse &LU, size_t LUIdx);
2042
2043 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2044
2045 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2046 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2047 void CountRegisters(const Formula &F, size_t LUIdx);
2048 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2049
2050 void CollectLoopInvariantFixupsAndFormulae();
2051
2052 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2053 unsigned Depth = 0);
2054
2055 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2056 const Formula &Base, unsigned Depth,
2057 size_t Idx, bool IsScaledReg = false);
2058 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2059 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2060 const Formula &Base, size_t Idx,
2061 bool IsScaledReg = false);
2062 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2063 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2064 const Formula &Base,
2065 const SmallVectorImpl<int64_t> &Worklist,
2066 size_t Idx, bool IsScaledReg = false);
2067 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2068 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2069 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2070 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2071 void GenerateCrossUseConstantOffsets();
2072 void GenerateAllReuseFormulae();
2073
2074 void FilterOutUndesirableDedicatedRegisters();
2075
2076 size_t EstimateSearchSpaceComplexity() const;
2077 void NarrowSearchSpaceByDetectingSupersets();
2078 void NarrowSearchSpaceByCollapsingUnrolledCode();
2079 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2080 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2081 void NarrowSearchSpaceByFilterPostInc();
2082 void NarrowSearchSpaceByDeletingCostlyFormulas();
2083 void NarrowSearchSpaceByPickingWinnerRegs();
2084 void NarrowSearchSpaceUsingHeuristics();
2085
2086 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2087 Cost &SolutionCost,
2089 const Cost &CurCost,
2090 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2091 DenseSet<const SCEV *> &VisitedRegs) const;
2092 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2093
2095 HoistInsertPosition(BasicBlock::iterator IP,
2096 const SmallVectorImpl<Instruction *> &Inputs) const;
2097 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2098 const LSRFixup &LF,
2099 const LSRUse &LU) const;
2100
2101 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2103 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2104 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2105 const Formula &F,
2106 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2107 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2108 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2109 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2110
2111public:
2112 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2114 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2115
2116 bool getChanged() const { return Changed; }
2117 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2118 return ScalarEvolutionIVs;
2119 }
2120
2121 void print_factors_and_types(raw_ostream &OS) const;
2122 void print_fixups(raw_ostream &OS) const;
2123 void print_uses(raw_ostream &OS) const;
2124 void print(raw_ostream &OS) const;
2125 void dump() const;
2126};
2127
2128} // end anonymous namespace
2129
2130/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2131/// the cast operation.
2132void LSRInstance::OptimizeShadowIV() {
2133 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2134 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2135 return;
2136
2137 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2138 UI != E; /* empty */) {
2139 IVUsers::const_iterator CandidateUI = UI;
2140 ++UI;
2141 Instruction *ShadowUse = CandidateUI->getUser();
2142 Type *DestTy = nullptr;
2143 bool IsSigned = false;
2144
2145 /* If shadow use is a int->float cast then insert a second IV
2146 to eliminate this cast.
2147
2148 for (unsigned i = 0; i < n; ++i)
2149 foo((double)i);
2150
2151 is transformed into
2152
2153 double d = 0.0;
2154 for (unsigned i = 0; i < n; ++i, ++d)
2155 foo(d);
2156 */
2157 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2158 IsSigned = false;
2159 DestTy = UCast->getDestTy();
2160 }
2161 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2162 IsSigned = true;
2163 DestTy = SCast->getDestTy();
2164 }
2165 if (!DestTy) continue;
2166
2167 // If target does not support DestTy natively then do not apply
2168 // this transformation.
2169 if (!TTI.isTypeLegal(DestTy)) continue;
2170
2171 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2172 if (!PH) continue;
2173 if (PH->getNumIncomingValues() != 2) continue;
2174
2175 // If the calculation in integers overflows, the result in FP type will
2176 // differ. So we only can do this transformation if we are guaranteed to not
2177 // deal with overflowing values
2178 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2179 if (!AR) continue;
2180 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2181 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2182
2183 Type *SrcTy = PH->getType();
2184 int Mantissa = DestTy->getFPMantissaWidth();
2185 if (Mantissa == -1) continue;
2186 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2187 continue;
2188
2189 unsigned Entry, Latch;
2190 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2191 Entry = 0;
2192 Latch = 1;
2193 } else {
2194 Entry = 1;
2195 Latch = 0;
2196 }
2197
2198 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2199 if (!Init) continue;
2200 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2201 (double)Init->getSExtValue() :
2202 (double)Init->getZExtValue());
2203
2204 BinaryOperator *Incr =
2205 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2206 if (!Incr) continue;
2207 if (Incr->getOpcode() != Instruction::Add
2208 && Incr->getOpcode() != Instruction::Sub)
2209 continue;
2210
2211 /* Initialize new IV, double d = 0.0 in above example. */
2212 ConstantInt *C = nullptr;
2213 if (Incr->getOperand(0) == PH)
2214 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2215 else if (Incr->getOperand(1) == PH)
2216 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2217 else
2218 continue;
2219
2220 if (!C) continue;
2221
2222 // Ignore negative constants, as the code below doesn't handle them
2223 // correctly. TODO: Remove this restriction.
2224 if (!C->getValue().isStrictlyPositive())
2225 continue;
2226
2227 /* Add new PHINode. */
2228 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2229
2230 /* create new increment. '++d' in above example. */
2231 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2233 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2234 : Instruction::FSub,
2235 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2236
2237 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2238 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2239
2240 /* Remove cast operation */
2241 ShadowUse->replaceAllUsesWith(NewPH);
2242 ShadowUse->eraseFromParent();
2243 Changed = true;
2244 break;
2245 }
2246}
2247
2248/// If Cond has an operand that is an expression of an IV, set the IV user and
2249/// stride information and return true, otherwise return false.
2250bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2251 for (IVStrideUse &U : IU)
2252 if (U.getUser() == Cond) {
2253 // NOTE: we could handle setcc instructions with multiple uses here, but
2254 // InstCombine does it as well for simple uses, it's not clear that it
2255 // occurs enough in real life to handle.
2256 CondUse = &U;
2257 return true;
2258 }
2259 return false;
2260}
2261
2262/// Rewrite the loop's terminating condition if it uses a max computation.
2263///
2264/// This is a narrow solution to a specific, but acute, problem. For loops
2265/// like this:
2266///
2267/// i = 0;
2268/// do {
2269/// p[i] = 0.0;
2270/// } while (++i < n);
2271///
2272/// the trip count isn't just 'n', because 'n' might not be positive. And
2273/// unfortunately this can come up even for loops where the user didn't use
2274/// a C do-while loop. For example, seemingly well-behaved top-test loops
2275/// will commonly be lowered like this:
2276///
2277/// if (n > 0) {
2278/// i = 0;
2279/// do {
2280/// p[i] = 0.0;
2281/// } while (++i < n);
2282/// }
2283///
2284/// and then it's possible for subsequent optimization to obscure the if
2285/// test in such a way that indvars can't find it.
2286///
2287/// When indvars can't find the if test in loops like this, it creates a
2288/// max expression, which allows it to give the loop a canonical
2289/// induction variable:
2290///
2291/// i = 0;
2292/// max = n < 1 ? 1 : n;
2293/// do {
2294/// p[i] = 0.0;
2295/// } while (++i != max);
2296///
2297/// Canonical induction variables are necessary because the loop passes
2298/// are designed around them. The most obvious example of this is the
2299/// LoopInfo analysis, which doesn't remember trip count values. It
2300/// expects to be able to rediscover the trip count each time it is
2301/// needed, and it does this using a simple analysis that only succeeds if
2302/// the loop has a canonical induction variable.
2303///
2304/// However, when it comes time to generate code, the maximum operation
2305/// can be quite costly, especially if it's inside of an outer loop.
2306///
2307/// This function solves this problem by detecting this type of loop and
2308/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2309/// the instructions for the maximum computation.
2310ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2311 // Check that the loop matches the pattern we're looking for.
2312 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2313 Cond->getPredicate() != CmpInst::ICMP_NE)
2314 return Cond;
2315
2316 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2317 if (!Sel || !Sel->hasOneUse()) return Cond;
2318
2319 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2320 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2321 return Cond;
2322 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2323
2324 // Add one to the backedge-taken count to get the trip count.
2325 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2326 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2327
2328 // Check for a max calculation that matches the pattern. There's no check
2329 // for ICMP_ULE here because the comparison would be with zero, which
2330 // isn't interesting.
2331 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2332 const SCEVNAryExpr *Max = nullptr;
2333 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2334 Pred = ICmpInst::ICMP_SLE;
2335 Max = S;
2336 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2337 Pred = ICmpInst::ICMP_SLT;
2338 Max = S;
2339 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2340 Pred = ICmpInst::ICMP_ULT;
2341 Max = U;
2342 } else {
2343 // No match; bail.
2344 return Cond;
2345 }
2346
2347 // To handle a max with more than two operands, this optimization would
2348 // require additional checking and setup.
2349 if (Max->getNumOperands() != 2)
2350 return Cond;
2351
2352 const SCEV *MaxLHS = Max->getOperand(0);
2353 const SCEV *MaxRHS = Max->getOperand(1);
2354
2355 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2356 // for a comparison with 1. For <= and >=, a comparison with zero.
2357 if (!MaxLHS ||
2358 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2359 return Cond;
2360
2361 // Check the relevant induction variable for conformance to
2362 // the pattern.
2363 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2364 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2365 if (!AR || !AR->isAffine() ||
2366 AR->getStart() != One ||
2367 AR->getStepRecurrence(SE) != One)
2368 return Cond;
2369
2370 assert(AR->getLoop() == L &&
2371 "Loop condition operand is an addrec in a different loop!");
2372
2373 // Check the right operand of the select, and remember it, as it will
2374 // be used in the new comparison instruction.
2375 Value *NewRHS = nullptr;
2376 if (ICmpInst::isTrueWhenEqual(Pred)) {
2377 // Look for n+1, and grab n.
2378 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2379 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2380 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2381 NewRHS = BO->getOperand(0);
2382 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2383 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2384 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2385 NewRHS = BO->getOperand(0);
2386 if (!NewRHS)
2387 return Cond;
2388 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2389 NewRHS = Sel->getOperand(1);
2390 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2391 NewRHS = Sel->getOperand(2);
2392 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2393 NewRHS = SU->getValue();
2394 else
2395 // Max doesn't match expected pattern.
2396 return Cond;
2397
2398 // Determine the new comparison opcode. It may be signed or unsigned,
2399 // and the original comparison may be either equality or inequality.
2400 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2401 Pred = CmpInst::getInversePredicate(Pred);
2402
2403 // Ok, everything looks ok to change the condition into an SLT or SGE and
2404 // delete the max calculation.
2405 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2406 Cond->getOperand(0), NewRHS, "scmp");
2407
2408 // Delete the max calculation instructions.
2409 NewCond->setDebugLoc(Cond->getDebugLoc());
2410 Cond->replaceAllUsesWith(NewCond);
2411 CondUse->setUser(NewCond);
2412 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2413 Cond->eraseFromParent();
2414 Sel->eraseFromParent();
2415 if (Cmp->use_empty())
2416 Cmp->eraseFromParent();
2417 return NewCond;
2418}
2419
2420/// Change loop terminating condition to use the postinc iv when possible.
2421void
2422LSRInstance::OptimizeLoopTermCond() {
2424
2425 // We need a different set of heuristics for rotated and non-rotated loops.
2426 // If a loop is rotated then the latch is also the backedge, so inserting
2427 // post-inc expressions just before the latch is ideal. To reduce live ranges
2428 // it also makes sense to rewrite terminating conditions to use post-inc
2429 // expressions.
2430 //
2431 // If the loop is not rotated then the latch is not a backedge; the latch
2432 // check is done in the loop head. Adding post-inc expressions before the
2433 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2434 // in the loop body. In this case we do *not* want to use post-inc expressions
2435 // in the latch check, and we want to insert post-inc expressions before
2436 // the backedge.
2437 BasicBlock *LatchBlock = L->getLoopLatch();
2438 SmallVector<BasicBlock*, 8> ExitingBlocks;
2439 L->getExitingBlocks(ExitingBlocks);
2440 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2441 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2442 IVIncInsertPos = LatchBlock->getTerminator();
2443 return;
2444 }
2445
2446 // Otherwise treat this as a rotated loop.
2447 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2448 // Get the terminating condition for the loop if possible. If we
2449 // can, we want to change it to use a post-incremented version of its
2450 // induction variable, to allow coalescing the live ranges for the IV into
2451 // one register value.
2452
2453 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2454 if (!TermBr)
2455 continue;
2456 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2457 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2458 continue;
2459
2460 // Search IVUsesByStride to find Cond's IVUse if there is one.
2461 IVStrideUse *CondUse = nullptr;
2462 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2463 if (!FindIVUserForCond(Cond, CondUse))
2464 continue;
2465
2466 // If the trip count is computed in terms of a max (due to ScalarEvolution
2467 // being unable to find a sufficient guard, for example), change the loop
2468 // comparison to use SLT or ULT instead of NE.
2469 // One consequence of doing this now is that it disrupts the count-down
2470 // optimization. That's not always a bad thing though, because in such
2471 // cases it may still be worthwhile to avoid a max.
2472 Cond = OptimizeMax(Cond, CondUse);
2473
2474 // If this exiting block dominates the latch block, it may also use
2475 // the post-inc value if it won't be shared with other uses.
2476 // Check for dominance.
2477 if (!DT.dominates(ExitingBlock, LatchBlock))
2478 continue;
2479
2480 // Conservatively avoid trying to use the post-inc value in non-latch
2481 // exits if there may be pre-inc users in intervening blocks.
2482 if (LatchBlock != ExitingBlock)
2483 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2484 // Test if the use is reachable from the exiting block. This dominator
2485 // query is a conservative approximation of reachability.
2486 if (&*UI != CondUse &&
2487 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2488 // Conservatively assume there may be reuse if the quotient of their
2489 // strides could be a legal scale.
2490 const SCEV *A = IU.getStride(*CondUse, L);
2491 const SCEV *B = IU.getStride(*UI, L);
2492 if (!A || !B) continue;
2493 if (SE.getTypeSizeInBits(A->getType()) !=
2494 SE.getTypeSizeInBits(B->getType())) {
2495 if (SE.getTypeSizeInBits(A->getType()) >
2496 SE.getTypeSizeInBits(B->getType()))
2497 B = SE.getSignExtendExpr(B, A->getType());
2498 else
2499 A = SE.getSignExtendExpr(A, B->getType());
2500 }
2501 if (const SCEVConstant *D =
2502 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2503 const ConstantInt *C = D->getValue();
2504 // Stride of one or negative one can have reuse with non-addresses.
2505 if (C->isOne() || C->isMinusOne())
2506 goto decline_post_inc;
2507 // Avoid weird situations.
2508 if (C->getValue().getSignificantBits() >= 64 ||
2509 C->getValue().isMinSignedValue())
2510 goto decline_post_inc;
2511 // Check for possible scaled-address reuse.
2512 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2513 MemAccessTy AccessTy = getAccessType(
2514 TTI, UI->getUser(), UI->getOperandValToReplace());
2515 int64_t Scale = C->getSExtValue();
2516 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2517 /*BaseOffset=*/0,
2518 /*HasBaseReg=*/true, Scale,
2519 AccessTy.AddrSpace))
2520 goto decline_post_inc;
2521 Scale = -Scale;
2522 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2523 /*BaseOffset=*/0,
2524 /*HasBaseReg=*/true, Scale,
2525 AccessTy.AddrSpace))
2526 goto decline_post_inc;
2527 }
2528 }
2529 }
2530
2531 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2532 << *Cond << '\n');
2533
2534 // It's possible for the setcc instruction to be anywhere in the loop, and
2535 // possible for it to have multiple users. If it is not immediately before
2536 // the exiting block branch, move it.
2537 if (Cond->getNextNonDebugInstruction() != TermBr) {
2538 if (Cond->hasOneUse()) {
2539 Cond->moveBefore(TermBr);
2540 } else {
2541 // Clone the terminating condition and insert into the loopend.
2542 ICmpInst *OldCond = Cond;
2543 Cond = cast<ICmpInst>(Cond->clone());
2544 Cond->setName(L->getHeader()->getName() + ".termcond");
2545 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2546
2547 // Clone the IVUse, as the old use still exists!
2548 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2549 TermBr->replaceUsesOfWith(OldCond, Cond);
2550 }
2551 }
2552
2553 // If we get to here, we know that we can transform the setcc instruction to
2554 // use the post-incremented version of the IV, allowing us to coalesce the
2555 // live ranges for the IV correctly.
2556 CondUse->transformToPostInc(L);
2557 Changed = true;
2558
2559 PostIncs.insert(Cond);
2560 decline_post_inc:;
2561 }
2562
2563 // Determine an insertion point for the loop induction variable increment. It
2564 // must dominate all the post-inc comparisons we just set up, and it must
2565 // dominate the loop latch edge.
2566 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2567 for (Instruction *Inst : PostIncs)
2568 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2569}
2570
2571/// Determine if the given use can accommodate a fixup at the given offset and
2572/// other details. If so, update the use and return true.
2573bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
2574 bool HasBaseReg, LSRUse::KindType Kind,
2575 MemAccessTy AccessTy) {
2576 int64_t NewMinOffset = LU.MinOffset;
2577 int64_t NewMaxOffset = LU.MaxOffset;
2578 MemAccessTy NewAccessTy = AccessTy;
2579
2580 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2581 // something conservative, however this can pessimize in the case that one of
2582 // the uses will have all its uses outside the loop, for example.
2583 if (LU.Kind != Kind)
2584 return false;
2585
2586 // Check for a mismatched access type, and fall back conservatively as needed.
2587 // TODO: Be less conservative when the type is similar and can use the same
2588 // addressing modes.
2589 if (Kind == LSRUse::Address) {
2590 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2591 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2592 AccessTy.AddrSpace);
2593 }
2594 }
2595
2596 // Conservatively assume HasBaseReg is true for now.
2597 if (NewOffset < LU.MinOffset) {
2598 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2599 LU.MaxOffset - NewOffset, HasBaseReg))
2600 return false;
2601 NewMinOffset = NewOffset;
2602 } else if (NewOffset > LU.MaxOffset) {
2603 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2604 NewOffset - LU.MinOffset, HasBaseReg))
2605 return false;
2606 NewMaxOffset = NewOffset;
2607 }
2608
2609 // Update the use.
2610 LU.MinOffset = NewMinOffset;
2611 LU.MaxOffset = NewMaxOffset;
2612 LU.AccessTy = NewAccessTy;
2613 return true;
2614}
2615
2616/// Return an LSRUse index and an offset value for a fixup which needs the given
2617/// expression, with the given kind and optional access type. Either reuse an
2618/// existing use or create a new one, as needed.
2619std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2620 LSRUse::KindType Kind,
2621 MemAccessTy AccessTy) {
2622 const SCEV *Copy = Expr;
2623 int64_t Offset = ExtractImmediate(Expr, SE);
2624
2625 // Basic uses can't accept any offset, for example.
2626 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2627 Offset, /*HasBaseReg=*/ true)) {
2628 Expr = Copy;
2629 Offset = 0;
2630 }
2631
2632 std::pair<UseMapTy::iterator, bool> P =
2633 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2634 if (!P.second) {
2635 // A use already existed with this base.
2636 size_t LUIdx = P.first->second;
2637 LSRUse &LU = Uses[LUIdx];
2638 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2639 // Reuse this use.
2640 return std::make_pair(LUIdx, Offset);
2641 }
2642
2643 // Create a new use.
2644 size_t LUIdx = Uses.size();
2645 P.first->second = LUIdx;
2646 Uses.push_back(LSRUse(Kind, AccessTy));
2647 LSRUse &LU = Uses[LUIdx];
2648
2649 LU.MinOffset = Offset;
2650 LU.MaxOffset = Offset;
2651 return std::make_pair(LUIdx, Offset);
2652}
2653
2654/// Delete the given use from the Uses list.
2655void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2656 if (&LU != &Uses.back())
2657 std::swap(LU, Uses.back());
2658 Uses.pop_back();
2659
2660 // Update RegUses.
2661 RegUses.swapAndDropUse(LUIdx, Uses.size());
2662}
2663
2664/// Look for a use distinct from OrigLU which is has a formula that has the same
2665/// registers as the given formula.
2666LSRUse *
2667LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2668 const LSRUse &OrigLU) {
2669 // Search all uses for the formula. This could be more clever.
2670 for (LSRUse &LU : Uses) {
2671 // Check whether this use is close enough to OrigLU, to see whether it's
2672 // worthwhile looking through its formulae.
2673 // Ignore ICmpZero uses because they may contain formulae generated by
2674 // GenerateICmpZeroScales, in which case adding fixup offsets may
2675 // be invalid.
2676 if (&LU != &OrigLU &&
2677 LU.Kind != LSRUse::ICmpZero &&
2678 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2679 LU.WidestFixupType == OrigLU.WidestFixupType &&
2680 LU.HasFormulaWithSameRegs(OrigF)) {
2681 // Scan through this use's formulae.
2682 for (const Formula &F : LU.Formulae) {
2683 // Check to see if this formula has the same registers and symbols
2684 // as OrigF.
2685 if (F.BaseRegs == OrigF.BaseRegs &&
2686 F.ScaledReg == OrigF.ScaledReg &&
2687 F.BaseGV == OrigF.BaseGV &&
2688 F.Scale == OrigF.Scale &&
2689 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2690 if (F.BaseOffset == 0)
2691 return &LU;
2692 // This is the formula where all the registers and symbols matched;
2693 // there aren't going to be any others. Since we declined it, we
2694 // can skip the rest of the formulae and proceed to the next LSRUse.
2695 break;
2696 }
2697 }
2698 }
2699 }
2700
2701 // Nothing looked good.
2702 return nullptr;
2703}
2704
2705void LSRInstance::CollectInterestingTypesAndFactors() {
2707
2708 // Collect interesting types and strides.
2710 for (const IVStrideUse &U : IU) {
2711 const SCEV *Expr = IU.getExpr(U);
2712 if (!Expr)
2713 continue;
2714
2715 // Collect interesting types.
2716 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2717
2718 // Add strides for mentioned loops.
2719 Worklist.push_back(Expr);
2720 do {
2721 const SCEV *S = Worklist.pop_back_val();
2722 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2723 if (AR->getLoop() == L)
2724 Strides.insert(AR->getStepRecurrence(SE));
2725 Worklist.push_back(AR->getStart());
2726 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2727 append_range(Worklist, Add->operands());
2728 }
2729 } while (!Worklist.empty());
2730 }
2731
2732 // Compute interesting factors from the set of interesting strides.
2734 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2736 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2737 const SCEV *OldStride = *I;
2738 const SCEV *NewStride = *NewStrideIter;
2739
2740 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2741 SE.getTypeSizeInBits(NewStride->getType())) {
2742 if (SE.getTypeSizeInBits(OldStride->getType()) >
2743 SE.getTypeSizeInBits(NewStride->getType()))
2744 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2745 else
2746 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2747 }
2748 if (const SCEVConstant *Factor =
2749 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2750 SE, true))) {
2751 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2752 Factors.insert(Factor->getAPInt().getSExtValue());
2753 } else if (const SCEVConstant *Factor =
2754 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2755 NewStride,
2756 SE, true))) {
2757 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2758 Factors.insert(Factor->getAPInt().getSExtValue());
2759 }
2760 }
2761
2762 // If all uses use the same type, don't bother looking for truncation-based
2763 // reuse.
2764 if (Types.size() == 1)
2765 Types.clear();
2766
2767 LLVM_DEBUG(print_factors_and_types(dbgs()));
2768}
2769
2770/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2771/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2772/// IVStrideUses, we could partially skip this.
2773static User::op_iterator
2775 Loop *L, ScalarEvolution &SE) {
2776 for(; OI != OE; ++OI) {
2777 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2778 if (!SE.isSCEVable(Oper->getType()))
2779 continue;
2780
2781 if (const SCEVAddRecExpr *AR =
2782 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2783 if (AR->getLoop() == L)
2784 break;
2785 }
2786 }
2787 }
2788 return OI;
2789}
2790
2791/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2792/// a convenient helper.
2794 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2795 return Trunc->getOperand(0);
2796 return Oper;
2797}
2798
2799/// Return an approximation of this SCEV expression's "base", or NULL for any
2800/// constant. Returning the expression itself is conservative. Returning a
2801/// deeper subexpression is more precise and valid as long as it isn't less
2802/// complex than another subexpression. For expressions involving multiple
2803/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2804/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2805/// IVInc==b-a.
2806///
2807/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2808/// SCEVUnknown, we simply return the rightmost SCEV operand.
2809static const SCEV *getExprBase(const SCEV *S) {
2810 switch (S->getSCEVType()) {
2811 default: // including scUnknown.
2812 return S;
2813 case scConstant:
2814 case scVScale:
2815 return nullptr;
2816 case scTruncate:
2817 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2818 case scZeroExtend:
2819 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2820 case scSignExtend:
2821 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2822 case scAddExpr: {
2823 // Skip over scaled operands (scMulExpr) to follow add operands as long as
2824 // there's nothing more complex.
2825 // FIXME: not sure if we want to recognize negation.
2826 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2827 for (const SCEV *SubExpr : reverse(Add->operands())) {
2828 if (SubExpr->getSCEVType() == scAddExpr)
2829 return getExprBase(SubExpr);
2830
2831 if (SubExpr->getSCEVType() != scMulExpr)
2832 return SubExpr;
2833 }
2834 return S; // all operands are scaled, be conservative.
2835 }
2836 case scAddRecExpr:
2837 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2838 }
2839 llvm_unreachable("Unknown SCEV kind!");
2840}
2841
2842/// Return true if the chain increment is profitable to expand into a loop
2843/// invariant value, which may require its own register. A profitable chain
2844/// increment will be an offset relative to the same base. We allow such offsets
2845/// to potentially be used as chain increment as long as it's not obviously
2846/// expensive to expand using real instructions.
2847bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2848 const SCEV *IncExpr,
2849 ScalarEvolution &SE) {
2850 // Aggressively form chains when -stress-ivchain.
2851 if (StressIVChain)
2852 return true;
2853
2854 // Do not replace a constant offset from IV head with a nonconstant IV
2855 // increment.
2856 if (!isa<SCEVConstant>(IncExpr)) {
2857 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2858 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2859 return false;
2860 }
2861
2863 return !isHighCostExpansion(IncExpr, Processed, SE);
2864}
2865
2866/// Return true if the number of registers needed for the chain is estimated to
2867/// be less than the number required for the individual IV users. First prohibit
2868/// any IV users that keep the IV live across increments (the Users set should
2869/// be empty). Next count the number and type of increments in the chain.
2870///
2871/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2872/// effectively use postinc addressing modes. Only consider it profitable it the
2873/// increments can be computed in fewer registers when chained.
2874///
2875/// TODO: Consider IVInc free if it's already used in another chains.
2876static bool isProfitableChain(IVChain &Chain,
2878 ScalarEvolution &SE,
2879 const TargetTransformInfo &TTI) {
2880 if (StressIVChain)
2881 return true;
2882
2883 if (!Chain.hasIncs())
2884 return false;
2885
2886 if (!Users.empty()) {
2887 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2888 for (Instruction *Inst
2889 : Users) { dbgs() << " " << *Inst << "\n"; });
2890 return false;
2891 }
2892 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2893
2894 // The chain itself may require a register, so intialize cost to 1.
2895 int cost = 1;
2896
2897 // A complete chain likely eliminates the need for keeping the original IV in
2898 // a register. LSR does not currently know how to form a complete chain unless
2899 // the header phi already exists.
2900 if (isa<PHINode>(Chain.tailUserInst())
2901 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2902 --cost;
2903 }
2904 const SCEV *LastIncExpr = nullptr;
2905 unsigned NumConstIncrements = 0;
2906 unsigned NumVarIncrements = 0;
2907 unsigned NumReusedIncrements = 0;
2908
2909 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
2910 return true;
2911
2912 for (const IVInc &Inc : Chain) {
2913 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
2914 return true;
2915 if (Inc.IncExpr->isZero())
2916 continue;
2917
2918 // Incrementing by zero or some constant is neutral. We assume constants can
2919 // be folded into an addressing mode or an add's immediate operand.
2920 if (isa<SCEVConstant>(Inc.IncExpr)) {
2921 ++NumConstIncrements;
2922 continue;
2923 }
2924
2925 if (Inc.IncExpr == LastIncExpr)
2926 ++NumReusedIncrements;
2927 else
2928 ++NumVarIncrements;
2929
2930 LastIncExpr = Inc.IncExpr;
2931 }
2932 // An IV chain with a single increment is handled by LSR's postinc
2933 // uses. However, a chain with multiple increments requires keeping the IV's
2934 // value live longer than it needs to be if chained.
2935 if (NumConstIncrements > 1)
2936 --cost;
2937
2938 // Materializing increment expressions in the preheader that didn't exist in
2939 // the original code may cost a register. For example, sign-extended array
2940 // indices can produce ridiculous increments like this:
2941 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2942 cost += NumVarIncrements;
2943
2944 // Reusing variable increments likely saves a register to hold the multiple of
2945 // the stride.
2946 cost -= NumReusedIncrements;
2947
2948 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2949 << "\n");
2950
2951 return cost < 0;
2952}
2953
2954/// Add this IV user to an existing chain or make it the head of a new chain.
2955void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2956 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2957 // When IVs are used as types of varying widths, they are generally converted
2958 // to a wider type with some uses remaining narrow under a (free) trunc.
2959 Value *const NextIV = getWideOperand(IVOper);
2960 const SCEV *const OperExpr = SE.getSCEV(NextIV);
2961 const SCEV *const OperExprBase = getExprBase(OperExpr);
2962
2963 // Visit all existing chains. Check if its IVOper can be computed as a
2964 // profitable loop invariant increment from the last link in the Chain.
2965 unsigned ChainIdx = 0, NChains = IVChainVec.size();
2966 const SCEV *LastIncExpr = nullptr;
2967 for (; ChainIdx < NChains; ++ChainIdx) {
2968 IVChain &Chain = IVChainVec[ChainIdx];
2969
2970 // Prune the solution space aggressively by checking that both IV operands
2971 // are expressions that operate on the same unscaled SCEVUnknown. This
2972 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2973 // first avoids creating extra SCEV expressions.
2974 if (!StressIVChain && Chain.ExprBase != OperExprBase)
2975 continue;
2976
2977 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2978 if (PrevIV->getType() != NextIV->getType())
2979 continue;
2980
2981 // A phi node terminates a chain.
2982 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2983 continue;
2984
2985 // The increment must be loop-invariant so it can be kept in a register.
2986 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2987 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2988 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
2989 continue;
2990
2991 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2992 LastIncExpr = IncExpr;
2993 break;
2994 }
2995 }
2996 // If we haven't found a chain, create a new one, unless we hit the max. Don't
2997 // bother for phi nodes, because they must be last in the chain.
2998 if (ChainIdx == NChains) {
2999 if (isa<PHINode>(UserInst))
3000 return;
3001 if (NChains >= MaxChains && !StressIVChain) {
3002 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3003 return;
3004 }
3005 LastIncExpr = OperExpr;
3006 // IVUsers may have skipped over sign/zero extensions. We don't currently
3007 // attempt to form chains involving extensions unless they can be hoisted
3008 // into this loop's AddRec.
3009 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3010 return;
3011 ++NChains;
3012 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3013 OperExprBase));
3014 ChainUsersVec.resize(NChains);
3015 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3016 << ") IV=" << *LastIncExpr << "\n");
3017 } else {
3018 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3019 << ") IV+" << *LastIncExpr << "\n");
3020 // Add this IV user to the end of the chain.
3021 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3022 }
3023 IVChain &Chain = IVChainVec[ChainIdx];
3024
3025 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3026 // This chain's NearUsers become FarUsers.
3027 if (!LastIncExpr->isZero()) {
3028 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3029 NearUsers.end());
3030 NearUsers.clear();
3031 }
3032
3033 // All other uses of IVOperand become near uses of the chain.
3034 // We currently ignore intermediate values within SCEV expressions, assuming
3035 // they will eventually be used be the current chain, or can be computed
3036 // from one of the chain increments. To be more precise we could
3037 // transitively follow its user and only add leaf IV users to the set.
3038 for (User *U : IVOper->users()) {
3039 Instruction *OtherUse = dyn_cast<Instruction>(U);
3040 if (!OtherUse)
3041 continue;
3042 // Uses in the chain will no longer be uses if the chain is formed.
3043 // Include the head of the chain in this iteration (not Chain.begin()).
3044 IVChain::const_iterator IncIter = Chain.Incs.begin();
3045 IVChain::const_iterator IncEnd = Chain.Incs.end();
3046 for( ; IncIter != IncEnd; ++IncIter) {
3047 if (IncIter->UserInst == OtherUse)
3048 break;
3049 }
3050 if (IncIter != IncEnd)
3051 continue;
3052
3053 if (SE.isSCEVable(OtherUse->getType())
3054 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3055 && IU.isIVUserOrOperand(OtherUse)) {
3056 continue;
3057 }
3058 NearUsers.insert(OtherUse);
3059 }
3060
3061 // Since this user is part of the chain, it's no longer considered a use
3062 // of the chain.
3063 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3064}
3065
3066/// Populate the vector of Chains.
3067///
3068/// This decreases ILP at the architecture level. Targets with ample registers,
3069/// multiple memory ports, and no register renaming probably don't want
3070/// this. However, such targets should probably disable LSR altogether.
3071///
3072/// The job of LSR is to make a reasonable choice of induction variables across
3073/// the loop. Subsequent passes can easily "unchain" computation exposing more
3074/// ILP *within the loop* if the target wants it.
3075///
3076/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3077/// will not reorder memory operations, it will recognize this as a chain, but
3078/// will generate redundant IV increments. Ideally this would be corrected later
3079/// by a smart scheduler:
3080/// = A[i]
3081/// = A[i+x]
3082/// A[i] =
3083/// A[i+x] =
3084///
3085/// TODO: Walk the entire domtree within this loop, not just the path to the
3086/// loop latch. This will discover chains on side paths, but requires
3087/// maintaining multiple copies of the Chains state.
3088void LSRInstance::CollectChains() {
3089 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3090 SmallVector<ChainUsers, 8> ChainUsersVec;
3091
3093 BasicBlock *LoopHeader = L->getHeader();
3094 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3095 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3096 LatchPath.push_back(Rung->getBlock());
3097 }
3098 LatchPath.push_back(LoopHeader);
3099
3100 // Walk the instruction stream from the loop header to the loop latch.
3101 for (BasicBlock *BB : reverse(LatchPath)) {
3102 for (Instruction &I : *BB) {
3103 // Skip instructions that weren't seen by IVUsers analysis.
3104 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3105 continue;
3106
3107 // Ignore users that are part of a SCEV expression. This way we only
3108 // consider leaf IV Users. This effectively rediscovers a portion of
3109 // IVUsers analysis but in program order this time.
3110 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3111 continue;
3112
3113 // Remove this instruction from any NearUsers set it may be in.
3114 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3115 ChainIdx < NChains; ++ChainIdx) {
3116 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3117 }
3118 // Search for operands that can be chained.
3119 SmallPtrSet<Instruction*, 4> UniqueOperands;
3120 User::op_iterator IVOpEnd = I.op_end();
3121 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3122 while (IVOpIter != IVOpEnd) {
3123 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3124 if (UniqueOperands.insert(IVOpInst).second)
3125 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3126 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3127 }
3128 } // Continue walking down the instructions.
3129 } // Continue walking down the domtree.
3130 // Visit phi backedges to determine if the chain can generate the IV postinc.
3131 for (PHINode &PN : L->getHeader()->phis()) {
3132 if (!SE.isSCEVable(PN.getType()))
3133 continue;
3134
3135 Instruction *IncV =
3136 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3137 if (IncV)
3138 ChainInstruction(&PN, IncV, ChainUsersVec);
3139 }
3140 // Remove any unprofitable chains.
3141 unsigned ChainIdx = 0;
3142 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3143 UsersIdx < NChains; ++UsersIdx) {
3144 if (!isProfitableChain(IVChainVec[UsersIdx],
3145 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3146 continue;
3147 // Preserve the chain at UsesIdx.
3148 if (ChainIdx != UsersIdx)
3149 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3150 FinalizeChain(IVChainVec[ChainIdx]);
3151 ++ChainIdx;
3152 }
3153 IVChainVec.resize(ChainIdx);
3154}
3155
3156void LSRInstance::FinalizeChain(IVChain &Chain) {
3157 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3158 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3159
3160 for (const IVInc &Inc : Chain) {
3161 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3162 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3163 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3164 IVIncSet.insert(UseI);
3165 }
3166}
3167
3168/// Return true if the IVInc can be folded into an addressing mode.
3169static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3170 Value *Operand, const TargetTransformInfo &TTI) {
3171 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3172 int64_t IncOffset = 0;
3173 int64_t ScalableOffset = 0;
3174 if (IncConst) {
3175 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3176 return false;
3177 IncOffset = IncConst->getValue()->getSExtValue();
3178 } else {
3179 // Look for mul(vscale, constant), to detect ScalableOffset.
3180 auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3181 if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3182 !isa<SCEVVScale>(IncVScale->getOperand(1)))
3183 return false;
3184 auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3185 if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3186 return false;
3187 ScalableOffset = Scale->getValue()->getSExtValue();
3188 }
3189
3190 if (!isAddressUse(TTI, UserInst, Operand))
3191 return false;
3192
3193 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3194 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3195 IncOffset, /*HasBaseReg=*/false, ScalableOffset))
3196 return false;
3197
3198 return true;
3199}
3200
3201/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3202/// user's operand from the previous IV user's operand.
3203void LSRInstance::GenerateIVChain(const IVChain &Chain,
3205 // Find the new IVOperand for the head of the chain. It may have been replaced
3206 // by LSR.
3207 const IVInc &Head = Chain.Incs[0];
3208 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3209 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3210 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3211 IVOpEnd, L, SE);
3212 Value *IVSrc = nullptr;
3213 while (IVOpIter != IVOpEnd) {
3214 IVSrc = getWideOperand(*IVOpIter);
3215
3216 // If this operand computes the expression that the chain needs, we may use
3217 // it. (Check this after setting IVSrc which is used below.)
3218 //
3219 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3220 // narrow for the chain, so we can no longer use it. We do allow using a
3221 // wider phi, assuming the LSR checked for free truncation. In that case we
3222 // should already have a truncate on this operand such that
3223 // getSCEV(IVSrc) == IncExpr.
3224 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3225 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3226 break;
3227 }
3228 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3229 }
3230 if (IVOpIter == IVOpEnd) {
3231 // Gracefully give up on this chain.
3232 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3233 return;
3234 }
3235 assert(IVSrc && "Failed to find IV chain source");
3236
3237 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3238 Type *IVTy = IVSrc->getType();
3239 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3240 const SCEV *LeftOverExpr = nullptr;
3241 const SCEV *Accum = SE.getZero(IntTy);
3243 Bases.emplace_back(Accum, IVSrc);
3244
3245 for (const IVInc &Inc : Chain) {
3246 Instruction *InsertPt = Inc.UserInst;
3247 if (isa<PHINode>(InsertPt))
3248 InsertPt = L->getLoopLatch()->getTerminator();
3249
3250 // IVOper will replace the current IV User's operand. IVSrc is the IV
3251 // value currently held in a register.
3252 Value *IVOper = IVSrc;
3253 if (!Inc.IncExpr->isZero()) {
3254 // IncExpr was the result of subtraction of two narrow values, so must
3255 // be signed.
3256 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3257 Accum = SE.getAddExpr(Accum, IncExpr);
3258 LeftOverExpr = LeftOverExpr ?
3259 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3260 }
3261
3262 // Look through each base to see if any can produce a nice addressing mode.
3263 bool FoundBase = false;
3264 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3265 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3266 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3267 if (!Remainder->isZero()) {
3268 Rewriter.clearPostInc();
3269 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3270 const SCEV *IVOperExpr =
3271 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3272 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3273 } else {
3274 IVOper = MapIVOper;
3275 }
3276
3277 FoundBase = true;
3278 break;
3279 }
3280 }
3281 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3282 // Expand the IV increment.
3283 Rewriter.clearPostInc();
3284 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3285 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3286 SE.getUnknown(IncV));
3287 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3288
3289 // If an IV increment can't be folded, use it as the next IV value.
3290 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3291 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3292 Bases.emplace_back(Accum, IVOper);
3293 IVSrc = IVOper;
3294 LeftOverExpr = nullptr;
3295 }
3296 }
3297 Type *OperTy = Inc.IVOperand->getType();
3298 if (IVTy != OperTy) {
3299 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3300 "cannot extend a chained IV");
3301 IRBuilder<> Builder(InsertPt);
3302 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3303 }
3304 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3305 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3306 DeadInsts.emplace_back(OperandIsInstr);
3307 }
3308 // If LSR created a new, wider phi, we may also replace its postinc. We only
3309 // do this if we also found a wide value for the head of the chain.
3310 if (isa<PHINode>(Chain.tailUserInst())) {
3311 for (PHINode &Phi : L->getHeader()->phis()) {
3312 if (Phi.getType() != IVSrc->getType())
3313 continue;
3314 Instruction *PostIncV = dyn_cast<Instruction>(
3315 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3316 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3317 continue;
3318 Value *IVOper = IVSrc;
3319 Type *PostIncTy = PostIncV->getType();
3320 if (IVTy != PostIncTy) {
3321 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3322 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3323 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3324 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3325 }
3326 Phi.replaceUsesOfWith(PostIncV, IVOper);
3327 DeadInsts.emplace_back(PostIncV);
3328 }
3329 }
3330}
3331
3332void LSRInstance::CollectFixupsAndInitialFormulae() {
3333 BranchInst *ExitBranch = nullptr;
3334 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3335
3336 // For calculating baseline cost
3338 DenseSet<const SCEV *> VisitedRegs;
3339 DenseSet<size_t> VisitedLSRUse;
3340
3341 for (const IVStrideUse &U : IU) {
3342 Instruction *UserInst = U.getUser();
3343 // Skip IV users that are part of profitable IV Chains.
3344 User::op_iterator UseI =
3345 find(UserInst->operands(), U.getOperandValToReplace());
3346 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3347 if (IVIncSet.count(UseI)) {
3348 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3349 continue;
3350 }
3351
3352 LSRUse::KindType Kind = LSRUse::Basic;
3353 MemAccessTy AccessTy;
3354 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3355 Kind = LSRUse::Address;
3356 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3357 }
3358
3359 const SCEV *S = IU.getExpr(U);
3360 if (!S)
3361 continue;
3362 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3363
3364 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3365 // (N - i == 0), and this allows (N - i) to be the expression that we work
3366 // with rather than just N or i, so we can consider the register
3367 // requirements for both N and i at the same time. Limiting this code to
3368 // equality icmps is not a problem because all interesting loops use
3369 // equality icmps, thanks to IndVarSimplify.
3370 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3371 // If CI can be saved in some target, like replaced inside hardware loop
3372 // in PowerPC, no need to generate initial formulae for it.
3373 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3374 continue;
3375 if (CI->isEquality()) {
3376 // Swap the operands if needed to put the OperandValToReplace on the
3377 // left, for consistency.
3378 Value *NV = CI->getOperand(1);
3379 if (NV == U.getOperandValToReplace()) {
3380 CI->setOperand(1, CI->getOperand(0));
3381 CI->setOperand(0, NV);
3382 NV = CI->getOperand(1);
3383 Changed = true;
3384 }
3385
3386 // x == y --> x - y == 0
3387 const SCEV *N = SE.getSCEV(NV);
3388 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3389 (!NV->getType()->isPointerTy() ||
3390 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3391 // S is normalized, so normalize N before folding it into S
3392 // to keep the result normalized.
3393 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3394 if (!N)
3395 continue;
3396 Kind = LSRUse::ICmpZero;
3397 S = SE.getMinusSCEV(N, S);
3398 } else if (L->isLoopInvariant(NV) &&
3399 (!isa<Instruction>(NV) ||
3400 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3401 !NV->getType()->isPointerTy()) {
3402 // If we can't generally expand the expression (e.g. it contains
3403 // a divide), but it is already at a loop invariant point before the
3404 // loop, wrap it in an unknown (to prevent the expander from trying
3405 // to re-expand in a potentially unsafe way.) The restriction to
3406 // integer types is required because the unknown hides the base, and
3407 // SCEV can't compute the difference of two unknown pointers.
3408 N = SE.getUnknown(NV);
3409 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3410 if (!N)
3411 continue;
3412 Kind = LSRUse::ICmpZero;
3413 S = SE.getMinusSCEV(N, S);
3414 assert(!isa<SCEVCouldNotCompute>(S));
3415 }
3416
3417 // -1 and the negations of all interesting strides (except the negation
3418 // of -1) are now also interesting.
3419 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3420 if (Factors[i] != -1)
3421 Factors.insert(-(uint64_t)Factors[i]);
3422 Factors.insert(-1);
3423 }
3424 }
3425
3426 // Get or create an LSRUse.
3427 std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
3428 size_t LUIdx = P.first;
3429 int64_t Offset = P.second;
3430 LSRUse &LU = Uses[LUIdx];
3431
3432 // Record the fixup.
3433 LSRFixup &LF = LU.getNewFixup();
3434 LF.UserInst = UserInst;
3435 LF.OperandValToReplace = U.getOperandValToReplace();
3436 LF.PostIncLoops = TmpPostIncLoops;
3437 LF.Offset = Offset;
3438 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3439
3440 // Create SCEV as Formula for calculating baseline cost
3441 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3442 Formula F;
3443 F.initialMatch(S, L, SE);
3444 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3445 VisitedLSRUse.insert(LUIdx);
3446 }
3447
3448 if (!LU.WidestFixupType ||
3449 SE.getTypeSizeInBits(LU.WidestFixupType) <
3450 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3451 LU.WidestFixupType = LF.OperandValToReplace->getType();
3452
3453 // If this is the first use of this LSRUse, give it a formula.
3454 if (LU.Formulae.empty()) {
3455 InsertInitialFormula(S, LU, LUIdx);
3456 CountRegisters(LU.Formulae.back(), LUIdx);
3457 }
3458 }
3459
3460 LLVM_DEBUG(print_fixups(dbgs()));
3461}
3462
3463/// Insert a formula for the given expression into the given use, separating out
3464/// loop-variant portions from loop-invariant and loop-computable portions.
3465void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3466 size_t LUIdx) {
3467 // Mark uses whose expressions cannot be expanded.
3468 if (!Rewriter.isSafeToExpand(S))
3469 LU.RigidFormula = true;
3470
3471 Formula F;
3472 F.initialMatch(S, L, SE);
3473 bool Inserted = InsertFormula(LU, LUIdx, F);
3474 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3475}
3476
3477/// Insert a simple single-register formula for the given expression into the
3478/// given use.
3479void
3480LSRInstance::InsertSupplementalFormula(const SCEV *S,
3481 LSRUse &LU, size_t LUIdx) {
3482 Formula F;
3483 F.BaseRegs.push_back(S);
3484 F.HasBaseReg = true;
3485 bool Inserted = InsertFormula(LU, LUIdx, F);
3486 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3487}
3488
3489/// Note which registers are used by the given formula, updating RegUses.
3490void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3491 if (F.ScaledReg)
3492 RegUses.countRegister(F.ScaledReg, LUIdx);
3493 for (const SCEV *BaseReg : F.BaseRegs)
3494 RegUses.countRegister(BaseReg, LUIdx);
3495}
3496
3497/// If the given formula has not yet been inserted, add it to the list, and
3498/// return true. Return false otherwise.
3499bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3500 // Do not insert formula that we will not be able to expand.
3501 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3502 "Formula is illegal");
3503
3504 if (!LU.InsertFormula(F, *L))
3505 return false;
3506
3507 CountRegisters(F, LUIdx);
3508 return true;
3509}
3510
3511/// Check for other uses of loop-invariant values which we're tracking. These
3512/// other uses will pin these values in registers, making them less profitable
3513/// for elimination.
3514/// TODO: This currently misses non-constant addrec step registers.
3515/// TODO: Should this give more weight to users inside the loop?
3516void
3517LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3518 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3520
3521 // Don't collect outside uses if we are favoring postinc - the instructions in
3522 // the loop are more important than the ones outside of it.
3523 if (AMK == TTI::AMK_PostIndexed)
3524 return;
3525
3526 while (!Worklist.empty()) {
3527 const SCEV *S = Worklist.pop_back_val();
3528
3529 // Don't process the same SCEV twice
3530 if (!Visited.insert(S).second)
3531 continue;
3532
3533 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3534 append_range(Worklist, N->operands());
3535 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3536 Worklist.push_back(C->getOperand());
3537 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3538 Worklist.push_back(D->getLHS());
3539 Worklist.push_back(D->getRHS());
3540 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3541 const Value *V = US->getValue();
3542 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3543 // Look for instructions defined outside the loop.
3544 if (L->contains(Inst)) continue;
3545 } else if (isa<Constant>(V))
3546 // Constants can be re-materialized.
3547 continue;
3548 for (const Use &U : V->uses()) {
3549 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3550 // Ignore non-instructions.
3551 if (!UserInst)
3552 continue;
3553 // Don't bother if the instruction is an EHPad.
3554 if (UserInst->isEHPad())
3555 continue;
3556 // Ignore instructions in other functions (as can happen with
3557 // Constants).
3558 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3559 continue;
3560 // Ignore instructions not dominated by the loop.
3561 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3562 UserInst->getParent() :
3563 cast<PHINode>(UserInst)->getIncomingBlock(
3565 if (!DT.dominates(L->getHeader(), UseBB))
3566 continue;
3567 // Don't bother if the instruction is in a BB which ends in an EHPad.
3568 if (UseBB->getTerminator()->isEHPad())
3569 continue;
3570
3571 // Ignore cases in which the currently-examined value could come from
3572 // a basic block terminated with an EHPad. This checks all incoming
3573 // blocks of the phi node since it is possible that the same incoming
3574 // value comes from multiple basic blocks, only some of which may end
3575 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3576 // pass would try to insert instructions into an EHPad, hitting an
3577 // assertion.
3578 if (isa<PHINode>(UserInst)) {
3579 const auto *PhiNode = cast<PHINode>(UserInst);
3580 bool HasIncompatibleEHPTerminatedBlock = false;
3581 llvm::Value *ExpectedValue = U;
3582 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3583 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3584 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3585 HasIncompatibleEHPTerminatedBlock = true;
3586 break;
3587 }
3588 }
3589 }
3590 if (HasIncompatibleEHPTerminatedBlock) {
3591 continue;
3592 }
3593 }
3594
3595 // Don't bother rewriting PHIs in catchswitch blocks.
3596 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3597 continue;
3598 // Ignore uses which are part of other SCEV expressions, to avoid
3599 // analyzing them multiple times.
3600 if (SE.isSCEVable(UserInst->getType())) {
3601 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3602 // If the user is a no-op, look through to its uses.
3603 if (!isa<SCEVUnknown>(UserS))
3604 continue;
3605 if (UserS == US) {
3606 Worklist.push_back(
3607 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3608 continue;
3609 }
3610 }
3611 // Ignore icmp instructions which are already being analyzed.
3612 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3613 unsigned OtherIdx = !U.getOperandNo();
3614 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3615 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3616 continue;
3617 }
3618
3619 std::pair<size_t, int64_t> P = getUse(
3620 S, LSRUse::Basic, MemAccessTy());
3621 size_t LUIdx = P.first;
3622 int64_t Offset = P.second;
3623 LSRUse &LU = Uses[LUIdx];
3624 LSRFixup &LF = LU.getNewFixup();
3625 LF.UserInst = const_cast<Instruction *>(UserInst);
3626 LF.OperandValToReplace = U;
3627 LF.Offset = Offset;
3628 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3629 if (!LU.WidestFixupType ||
3630 SE.getTypeSizeInBits(LU.WidestFixupType) <
3631 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3632 LU.WidestFixupType = LF.OperandValToReplace->getType();
3633 InsertSupplementalFormula(US, LU, LUIdx);
3634 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3635 break;
3636 }
3637 }
3638 }
3639}
3640
3641/// Split S into subexpressions which can be pulled out into separate
3642/// registers. If C is non-null, multiply each subexpression by C.
3643///
3644/// Return remainder expression after factoring the subexpressions captured by
3645/// Ops. If Ops is complete, return NULL.
3646static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3648 const Loop *L,
3649 ScalarEvolution &SE,
3650 unsigned Depth = 0) {
3651 // Arbitrarily cap recursion to protect compile time.
3652 if (Depth >= 3)
3653 return S;
3654
3655 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3656 // Break out add operands.
3657 for (const SCEV *S : Add->operands()) {
3658 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3659 if (Remainder)
3660 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3661 }
3662 return nullptr;
3663 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3664 // Split a non-zero base out of an addrec.
3665 if (AR->getStart()->isZero() || !AR->isAffine())
3666 return S;
3667
3668 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3669 C, Ops, L, SE, Depth+1);
3670 // Split the non-zero AddRec unless it is part of a nested recurrence that
3671 // does not pertain to this loop.
3672 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3673 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3674 Remainder = nullptr;
3675 }
3676 if (Remainder != AR->getStart()) {
3677 if (!Remainder)
3678 Remainder = SE.getConstant(AR->getType(), 0);
3679 return SE.getAddRecExpr(Remainder,
3680 AR->getStepRecurrence(SE),
3681 AR->getLoop(),
3682 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3684 }
3685 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3686 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3687 if (Mul->getNumOperands() != 2)
3688 return S;
3689 if (const SCEVConstant *Op0 =
3690 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3691 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3692 const SCEV *Remainder =
3693 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3694 if (Remainder)
3695 Ops.push_back(SE.getMulExpr(C, Remainder));
3696 return nullptr;
3697 }
3698 }
3699 return S;
3700}
3701
3702/// Return true if the SCEV represents a value that may end up as a
3703/// post-increment operation.
3705 LSRUse &LU, const SCEV *S, const Loop *L,
3706 ScalarEvolution &SE) {
3707 if (LU.Kind != LSRUse::Address ||
3708 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3709 return false;
3710 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3711 if (!AR)
3712 return false;
3713 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3714 if (!isa<SCEVConstant>(LoopStep))
3715 return false;
3716 // Check if a post-indexed load/store can be used.
3719 const SCEV *LoopStart = AR->getStart();
3720 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3721 return true;
3722 }
3723 return false;
3724}
3725
3726/// Helper function for LSRInstance::GenerateReassociations.
3727void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3728 const Formula &Base,
3729 unsigned Depth, size_t Idx,
3730 bool IsScaledReg) {
3731 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3732 // Don't generate reassociations for the base register of a value that
3733 // may generate a post-increment operator. The reason is that the
3734 // reassociations cause extra base+register formula to be created,
3735 // and possibly chosen, but the post-increment is more efficient.
3736 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3737 return;
3739 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3740 if (Remainder)
3741 AddOps.push_back(Remainder);
3742
3743 if (AddOps.size() == 1)
3744 return;
3745
3747 JE = AddOps.end();
3748 J != JE; ++J) {
3749 // Loop-variant "unknown" values are uninteresting; we won't be able to
3750 // do anything meaningful with them.
3751 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3752 continue;
3753
3754 // Don't pull a constant into a register if the constant could be folded
3755 // into an immediate field.
3756 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3757 LU.AccessTy, *J, Base.getNumRegs() > 1))
3758 continue;
3759
3760 // Collect all operands except *J.
3761 SmallVector<const SCEV *, 8> InnerAddOps(
3762 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3763 InnerAddOps.append(std::next(J),
3764 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3765
3766 // Don't leave just a constant behind in a register if the constant could
3767 // be folded into an immediate field.
3768 if (InnerAddOps.size() == 1 &&
3769 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3770 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3771 continue;
3772
3773 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3774 if (InnerSum->isZero())
3775 continue;
3776 Formula F = Base;
3777
3778 // Add the remaining pieces of the add back into the new formula.
3779 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3780 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3781 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3782 InnerSumSC->getValue()->getZExtValue())) {
3783 F.UnfoldedOffset =
3784 (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
3785 if (IsScaledReg)
3786 F.ScaledReg = nullptr;
3787 else
3788 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3789 } else if (IsScaledReg)
3790 F.ScaledReg = InnerSum;
3791 else
3792 F.BaseRegs[Idx] = InnerSum;
3793
3794 // Add J as its own register, or an unfolded immediate.
3795 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3796 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3797 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3798 SC->getValue()->getZExtValue()))
3799 F.UnfoldedOffset =
3800 (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
3801 else
3802 F.BaseRegs.push_back(*J);
3803 // We may have changed the number of register in base regs, adjust the
3804 // formula accordingly.
3805 F.canonicalize(*L);
3806
3807 if (InsertFormula(LU, LUIdx, F))
3808 // If that formula hadn't been seen before, recurse to find more like
3809 // it.
3810 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
3811 // Because just Depth is not enough to bound compile time.
3812 // This means that every time AddOps.size() is greater 16^x we will add
3813 // x to Depth.
3814 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3815 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3816 }
3817}
3818
3819/// Split out subexpressions from adds and the bases of addrecs.
3820void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3821 Formula Base, unsigned Depth) {
3822 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3823 // Arbitrarily cap recursion to protect compile time.
3824 if (Depth >= 3)
3825 return;
3826
3827 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3828 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
3829
3830 if (Base.Scale == 1)
3831 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3832 /* Idx */ -1, /* IsScaledReg */ true);
3833}
3834
3835/// Generate a formula consisting of all of the loop-dominating registers added
3836/// into a single register.
3837void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3838 Formula Base) {
3839 // This method is only interesting on a plurality of registers.
3840 if (Base.BaseRegs.size() + (Base.Scale == 1) +
3841 (Base.UnfoldedOffset != 0) <= 1)
3842 return;
3843
3844 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3845 // processing the formula.
3846 Base.unscale();
3848 Formula NewBase = Base;
3849 NewBase.BaseRegs.clear();
3850 Type *CombinedIntegerType = nullptr;
3851 for (const SCEV *BaseReg : Base.BaseRegs) {
3852 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3853 !SE.hasComputableLoopEvolution(BaseReg, L)) {
3854 if (!CombinedIntegerType)
3855 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
3856 Ops.push_back(BaseReg);
3857 }
3858 else
3859 NewBase.BaseRegs.push_back(BaseReg);
3860 }
3861
3862 // If no register is relevant, we're done.
3863 if (Ops.size() == 0)
3864 return;
3865
3866 // Utility function for generating the required variants of the combined
3867 // registers.
3868 auto GenerateFormula = [&](const SCEV *Sum) {
3869 Formula F = NewBase;
3870
3871 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3872 // opportunity to fold something. For now, just ignore such cases
3873 // rather than proceed with zero in a register.
3874 if (Sum->isZero())
3875 return;
3876
3877 F.BaseRegs.push_back(Sum);
3878 F.canonicalize(*L);
3879 (void)InsertFormula(LU, LUIdx, F);
3880 };
3881
3882 // If we collected at least two registers, generate a formula combining them.
3883 if (Ops.size() > 1) {
3884 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
3885 GenerateFormula(SE.getAddExpr(OpsCopy));
3886 }
3887
3888 // If we have an unfolded offset, generate a formula combining it with the
3889 // registers collected.
3890 if (NewBase.UnfoldedOffset) {
3891 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
3892 Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
3893 true));
3894 NewBase.UnfoldedOffset = 0;
3895 GenerateFormula(SE.getAddExpr(Ops));
3896 }
3897}
3898
3899/// Helper function for LSRInstance::GenerateSymbolicOffsets.
3900void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
3901 const Formula &Base, size_t Idx,
3902 bool IsScaledReg) {
3903 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3904 GlobalValue *GV = ExtractSymbol(G, SE);
3905 if (G->isZero() || !GV)
3906 return;
3907 Formula F = Base;
3908 F.BaseGV = GV;
3909 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3910 return;
3911 if (IsScaledReg)
3912 F.ScaledReg = G;
3913 else
3914 F.BaseRegs[Idx] = G;
3915 (void)InsertFormula(LU, LUIdx, F);
3916}
3917
3918/// Generate reuse formulae using symbolic offsets.
3919void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3920 Formula Base) {
3921 // We can't add a symbolic offset if the address already contains one.
3922 if (Base.BaseGV) return;
3923
3924 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3925 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
3926 if (Base.Scale == 1)
3927 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
3928 /* IsScaledReg */ true);
3929}
3930
3931/// Helper function for LSRInstance::GenerateConstantOffsets.
3932void LSRInstance::GenerateConstantOffsetsImpl(
3933 LSRUse &LU, unsigned LUIdx, const Formula &Base,
3934 const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
3935
3936 auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
3937 Formula F = Base;
3938 F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
3939
3940 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
3941 // Add the offset to the base register.
3942 const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
3943 // If it cancelled out, drop the base register, otherwise update it.
3944 if (NewG->isZero()) {
3945 if (IsScaledReg) {
3946 F.Scale = 0;
3947 F.ScaledReg = nullptr;
3948 } else
3949 F.deleteBaseReg(F.BaseRegs[Idx]);
3950 F.canonicalize(*L);
3951 } else if (IsScaledReg)
3952 F.ScaledReg = NewG;
3953 else
3954 F.BaseRegs[Idx] = NewG;
3955
3956 (void)InsertFormula(LU, LUIdx, F);
3957 }
3958 };
3959
3960 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3961
3962 // With constant offsets and constant steps, we can generate pre-inc
3963 // accesses by having the offset equal the step. So, for access #0 with a
3964 // step of 8, we generate a G - 8 base which would require the first access
3965 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
3966 // for itself and hopefully becomes the base for other accesses. This means
3967 // means that a single pre-indexed access can be generated to become the new
3968 // base pointer for each iteration of the loop, resulting in no extra add/sub
3969 // instructions for pointer updating.
3970 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
3971 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
3972 if (auto *StepRec =
3973 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
3974 const APInt &StepInt = StepRec->getAPInt();
3975 int64_t Step = StepInt.isNegative() ?
3976 StepInt.getSExtValue() : StepInt.getZExtValue();
3977
3978 for (int64_t Offset : Worklist) {
3979 Offset -= Step;
3980 GenerateOffset(G, Offset);
3981 }
3982 }
3983 }
3984 }
3985 for (int64_t Offset : Worklist)
3986 GenerateOffset(G, Offset);
3987
3988 int64_t Imm = ExtractImmediate(G, SE);
3989 if (G->isZero() || Imm == 0)
3990 return;
3991 Formula F = Base;
3992 F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3993 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3994 return;
3995 if (IsScaledReg) {
3996 F.ScaledReg = G;
3997 } else {
3998 F.BaseRegs[Idx] = G;
3999 // We may generate non canonical Formula if G is a recurrent expr reg
4000 // related with current loop while F.ScaledReg is not.
4001 F.canonicalize(*L);
4002 }
4003 (void)InsertFormula(LU, LUIdx, F);
4004}
4005
4006/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4007void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4008 Formula Base) {
4009 // TODO: For now, just add the min and max offset, because it usually isn't
4010 // worthwhile looking at everything inbetween.
4011 SmallVector<int64_t, 2> Worklist;
4012 Worklist.push_back(LU.MinOffset);
4013 if (LU.MaxOffset != LU.MinOffset)
4014 Worklist.push_back(LU.MaxOffset);
4015
4016 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4017 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4018 if (Base.Scale == 1)
4019 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4020 /* IsScaledReg */ true);
4021}
4022
4023/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4024/// == y -> x*c == y*c.
4025void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4026 Formula Base) {
4027 if (LU.Kind != LSRUse::ICmpZero) return;
4028
4029 // Determine the integer type for the base formula.
4030 Type *IntTy = Base.getType();
4031 if (!IntTy) return;
4032 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4033
4034 // Don't do this if there is more than one offset.
4035 if (LU.MinOffset != LU.MaxOffset) return;
4036
4037 // Check if transformation is valid. It is illegal to multiply pointer.
4038 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4039 return;
4040 for (const SCEV *BaseReg : Base.BaseRegs)
4041 if (BaseReg->getType()->isPointerTy())
4042 return;
4043 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4044
4045 // Check each interesting stride.
4046 for (int64_t Factor : Factors) {
4047 // Check that Factor can be represented by IntTy
4048 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4049 continue;
4050 // Check that the multiplication doesn't overflow.
4051 if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
4052 continue;
4053 int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
4054 assert(Factor != 0 && "Zero factor not expected!");
4055 if (NewBaseOffset / Factor != Base.BaseOffset)
4056 continue;
4057 // If the offset will be truncated at this use, check that it is in bounds.
4058 if (!IntTy->isPointerTy() &&
4059 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
4060 continue;
4061
4062 // Check that multiplying with the use offset doesn't overflow.
4063 int64_t Offset = LU.MinOffset;
4064 if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
4065 continue;
4066 Offset = (uint64_t)Offset * Factor;
4067 if (Offset / Factor != LU.MinOffset)
4068 continue;
4069 // If the offset will be truncated at this use, check that it is in bounds.
4070 if (!IntTy->isPointerTy() &&
4072 continue;
4073
4074 Formula F = Base;
4075 F.BaseOffset = NewBaseOffset;
4076
4077 // Check that this scale is legal.
4078 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4079 continue;
4080
4081 // Compensate for the use having MinOffset built into it.
4082 F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
4083
4084 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4085
4086 // Check that multiplying with each base register doesn't overflow.
4087 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4088 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4089 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4090 goto next;
4091 }
4092
4093 // Check that multiplying with the scaled register doesn't overflow.
4094 if (F.ScaledReg) {
4095 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4096 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4097 continue;
4098 }
4099
4100 // Check that multiplying with the unfolded offset doesn't overflow.
4101 if (F.UnfoldedOffset != 0) {
4102 if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
4103 Factor == -1)
4104 continue;
4105 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
4106 if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
4107 continue;
4108 // If the offset will be truncated, check that it is in bounds.
4109 if (!IntTy->isPointerTy() &&
4110 !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
4111 continue;
4112 }
4113
4114 // If we make it here and it's legal, add it.
4115 (void)InsertFormula(LU, LUIdx, F);
4116 next:;
4117 }
4118}
4119
4120/// Generate stride factor reuse formulae by making use of scaled-offset address
4121/// modes, for example.
4122void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4123 // Determine the integer type for the base formula.
4124 Type *IntTy = Base.getType();
4125 if (!IntTy) return;
4126
4127 // If this Formula already has a scaled register, we can't add another one.
4128 // Try to unscale the formula to generate a better scale.
4129 if (Base.Scale != 0 && !Base.unscale())
4130 return;
4131
4132 assert(Base.Scale == 0 && "unscale did not did its job!");
4133
4134 // Check each interesting stride.
4135 for (int64_t Factor : Factors) {
4136 Base.Scale = Factor;
4137 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4138 // Check whether this scale is going to be legal.
4139 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4140 Base)) {
4141 // As a special-case, handle special out-of-loop Basic users specially.
4142 // TODO: Reconsider this special case.
4143 if (LU.Kind == LSRUse::Basic &&
4144 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4145 LU.AccessTy, Base) &&
4146 LU.AllFixupsOutsideLoop)
4147 LU.Kind = LSRUse::Special;
4148 else
4149 continue;
4150 }
4151 // For an ICmpZero, negating a solitary base register won't lead to
4152 // new solutions.
4153 if (LU.Kind == LSRUse::ICmpZero &&
4154 !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
4155 continue;
4156 // For each addrec base reg, if its loop is current loop, apply the scale.
4157 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4158 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4159 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4160 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4161 if (FactorS->isZero())
4162 continue;
4163 // Divide out the factor, ignoring high bits, since we'll be
4164 // scaling the value back up in the end.
4165 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4166 if (!Quotient->isZero()) {
4167 // TODO: This could be optimized to avoid all the copying.
4168 Formula F = Base;
4169 F.ScaledReg = Quotient;
4170 F.deleteBaseReg(F.BaseRegs[i]);
4171 // The canonical representation of 1*reg is reg, which is already in
4172 // Base. In that case, do not try to insert the formula, it will be
4173 // rejected anyway.
4174 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4175 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4176 continue;
4177 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4178 // non canonical Formula with ScaledReg's loop not being L.
4179 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4180 F.canonicalize(*L);
4181 (void)InsertFormula(LU, LUIdx, F);
4182 }
4183 }
4184 }
4185 }
4186}
4187
4188/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4189/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4190/// perform the extension/truncate and normalize again, as the normalized form
4191/// can result in folds that are not valid in the post-inc use contexts. The
4192/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4193static const SCEV *
4195 const SCEV *Expr, Type *ToTy,
4196 ScalarEvolution &SE) {
4197 const SCEV *Result = nullptr;
4198 for (auto &L : Loops) {
4199 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4200 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4201 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4202 if (!New || (Result && New != Result))
4203 return nullptr;
4204 Result = New;
4205 }
4206
4207 assert(Result && "failed to create expression");
4208 return Result;
4209}
4210
4211/// Generate reuse formulae from different IV types.
4212void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4213 // Don't bother truncating symbolic values.
4214 if (Base.BaseGV) return;
4215
4216 // Determine the integer type for the base formula.
4217 Type *DstTy = Base.getType();
4218 if (!DstTy) return;
4219 if (DstTy->isPointerTy())
4220 return;
4221
4222 // It is invalid to extend a pointer type so exit early if ScaledReg or
4223 // any of the BaseRegs are pointers.
4224 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4225 return;
4226 if (any_of(Base.BaseRegs,
4227 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4228 return;
4229
4231 for (auto &LF : LU.Fixups)
4232 Loops.push_back(LF.PostIncLoops);
4233
4234 for (Type *SrcTy : Types) {
4235 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4236 Formula F = Base;
4237
4238 // Sometimes SCEV is able to prove zero during ext transform. It may
4239 // happen if SCEV did not do all possible transforms while creating the
4240 // initial node (maybe due to depth limitations), but it can do them while
4241 // taking ext.
4242 if (F.ScaledReg) {
4243 const SCEV *NewScaledReg =
4244 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4245 if (!NewScaledReg || NewScaledReg->isZero())
4246 continue;
4247 F.ScaledReg = NewScaledReg;
4248 }
4249 bool HasZeroBaseReg = false;
4250 for (const SCEV *&BaseReg : F.BaseRegs) {
4251 const SCEV *NewBaseReg =
4252 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4253 if (!NewBaseReg || NewBaseReg->isZero()) {
4254 HasZeroBaseReg = true;
4255 break;
4256 }
4257 BaseReg = NewBaseReg;
4258 }
4259 if (HasZeroBaseReg)
4260 continue;
4261
4262 // TODO: This assumes we've done basic processing on all uses and
4263 // have an idea what the register usage is.
4264 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4265 continue;
4266
4267 F.canonicalize(*L);
4268 (void)InsertFormula(LU, LUIdx, F);
4269 }
4270 }
4271}
4272
4273namespace {
4274
4275/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4276/// modifications so that the search phase doesn't have to worry about the data
4277/// structures moving underneath it.
4278struct WorkItem {
4279 size_t LUIdx;
4280 int64_t Imm;
4281 const SCEV *OrigReg;
4282
4283 WorkItem(size_t LI, int64_t I, const SCEV *R)
4284 : LUIdx(LI), Imm(I), OrigReg(R) {}
4285
4286 void print(raw_ostream &OS) const;
4287 void dump() const;
4288};
4289
4290} // end anonymous namespace
4291
4292#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4293void WorkItem::print(raw_ostream &OS) const {
4294 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4295 << " , add offset " << Imm;
4296}
4297
4298LLVM_DUMP_METHOD void WorkItem::dump() const {
4299 print(errs()); errs() << '\n';
4300}
4301#endif
4302
4303/// Look for registers which are a constant distance apart and try to form reuse
4304/// opportunities between them.
4305void LSRInstance::GenerateCrossUseConstantOffsets() {
4306 // Group the registers by their value without any added constant offset.
4307 using ImmMapTy = std::map<int64_t, const SCEV *>;
4308
4312 for (const SCEV *Use : RegUses) {
4313 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4314 int64_t Imm = ExtractImmediate(Reg, SE);
4315 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4316 if (Pair.second)
4317 Sequence.push_back(Reg);
4318 Pair.first->second.insert(std::make_pair(Imm, Use));
4319 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4320 }
4321
4322 // Now examine each set of registers with the same base value. Build up
4323 // a list of work to do and do the work in a separate step so that we're
4324 // not adding formulae and register counts while we're searching.
4325 SmallVector<WorkItem, 32> WorkItems;
4326 SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
4327 for (const SCEV *Reg : Sequence) {
4328 const ImmMapTy &Imms = Map.find(Reg)->second;
4329
4330 // It's not worthwhile looking for reuse if there's only one offset.
4331 if (Imms.size() == 1)
4332 continue;
4333
4334 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4335 for (const auto &Entry
4336 : Imms) dbgs()
4337 << ' ' << Entry.first;
4338 dbgs() << '\n');
4339
4340 // Examine each offset.
4341 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4342 J != JE; ++J) {
4343 const SCEV *OrigReg = J->second;
4344
4345 int64_t JImm = J->first;
4346 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4347
4348 if (!isa<SCEVConstant>(OrigReg) &&
4349 UsedByIndicesMap[Reg].count() == 1) {
4350 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4351 << '\n');
4352 continue;
4353 }
4354
4355 // Conservatively examine offsets between this orig reg a few selected
4356 // other orig regs.
4357 int64_t First = Imms.begin()->first;
4358 int64_t Last = std::prev(Imms.end())->first;
4359 // Compute (First + Last) / 2 without overflow using the fact that
4360 // First + Last = 2 * (First + Last) + (First ^ Last).
4361 int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
4362 // If the result is negative and First is odd and Last even (or vice versa),
4363 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4364 Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
4365 ImmMapTy::const_iterator OtherImms[] = {
4366 Imms.begin(), std::prev(Imms.end()),
4367 Imms.lower_bound(Avg)};
4368 for (const auto &M : OtherImms) {
4369 if (M == J || M == JE) continue;
4370
4371 // Compute the difference between the two.
4372 int64_t Imm = (uint64_t)JImm - M->first;
4373 for (unsigned LUIdx : UsedByIndices.set_bits())
4374 // Make a memo of this use, offset, and register tuple.
4375 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4376 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4377 }
4378 }
4379 }
4380
4381 Map.clear();
4382 Sequence.clear();
4383 UsedByIndicesMap.clear();
4384 UniqueItems.clear();
4385
4386 // Now iterate through the worklist and add new formulae.
4387 for (const WorkItem &WI : WorkItems) {
4388 size_t LUIdx = WI.LUIdx;
4389 LSRUse &LU = Uses[LUIdx];
4390 int64_t Imm = WI.Imm;
4391 const SCEV *OrigReg = WI.OrigReg;
4392
4393 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4394 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
4395 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4396
4397 // TODO: Use a more targeted data structure.
4398 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4399 Formula F = LU.Formulae[L];
4400 // FIXME: The code for the scaled and unscaled registers looks
4401 // very similar but slightly different. Investigate if they
4402 // could be merged. That way, we would not have to unscale the
4403 // Formula.
4404 F.unscale();
4405 // Use the immediate in the scaled register.
4406 if (F.ScaledReg == OrigReg) {
4407 int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
4408 // Don't create 50 + reg(-50).
4409 if (F.referencesReg(SE.getSCEV(
4410 ConstantInt::get(IntTy, -(uint64_t)Offset))))
4411 continue;
4412 Formula NewF = F;
4413 NewF.BaseOffset = Offset;
4414 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4415 NewF))
4416 continue;
4417 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4418
4419 // If the new scale is a constant in a register, and adding the constant
4420 // value to the immediate would produce a value closer to zero than the
4421 // immediate itself, then the formula isn't worthwhile.
4422 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
4423 if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
4424 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4425 .ule(std::abs(NewF.BaseOffset)))
4426 continue;
4427
4428 // OK, looks good.
4429 NewF.canonicalize(*this->L);
4430 (void)InsertFormula(LU, LUIdx, NewF);
4431 } else {
4432 // Use the immediate in a base register.
4433 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4434 const SCEV *BaseReg = F.BaseRegs[N];
4435 if (BaseReg != OrigReg)
4436 continue;
4437 Formula NewF = F;
4438 NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
4439 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4440 LU.Kind, LU.AccessTy, NewF)) {
4441 if (AMK == TTI::AMK_PostIndexed &&
4442 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4443 continue;
4444 if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
4445 continue;
4446 NewF = F;
4447 NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
4448 }
4449 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4450
4451 // If the new formula has a constant in a register, and adding the
4452 // constant value to the immediate would produce a value closer to
4453 // zero than the immediate itself, then the formula isn't worthwhile.
4454 for (const SCEV *NewReg : NewF.BaseRegs)
4455 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
4456 if ((C->getAPInt() + NewF.BaseOffset)
4457 .abs()
4458 .slt(std::abs(NewF.BaseOffset)) &&
4459 (C->getAPInt() + NewF.BaseOffset).countr_zero() >=
4460 (unsigned)llvm::countr_zero<uint64_t>(NewF.BaseOffset))
4461 goto skip_formula;
4462
4463 // Ok, looks good.
4464 NewF.canonicalize(*this->L);
4465 (void)InsertFormula(LU, LUIdx, NewF);
4466 break;
4467 skip_formula:;
4468 }
4469 }
4470 }
4471 }
4472}
4473
4474/// Generate formulae for each use.
4475void
4476LSRInstance::GenerateAllReuseFormulae() {
4477 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4478 // queries are more precise.
4479 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4480 LSRUse &LU = Uses[LUIdx];
4481 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4482 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4483 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4484 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4485 }
4486 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4487 LSRUse &LU = Uses[LUIdx];
4488 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4489 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4490 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4491 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4492 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4493 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4494 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4495 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4496 }
4497 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4498 LSRUse &LU = Uses[LUIdx];
4499 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4500 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4501 }
4502
4503 GenerateCrossUseConstantOffsets();
4504
4505 LLVM_DEBUG(dbgs() << "\n"
4506 "After generating reuse formulae:\n";
4507 print_uses(dbgs()));
4508}
4509
4510/// If there are multiple formulae with the same set of registers used
4511/// by other uses, pick the best one and delete the others.
4512void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4513 DenseSet<const SCEV *> VisitedRegs;
4516#ifndef NDEBUG
4517 bool ChangedFormulae = false;
4518#endif
4519
4520 // Collect the best formula for each unique set of shared registers. This
4521 // is reset for each use.
4522 using BestFormulaeTy =
4523 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4524
4525 BestFormulaeTy BestFormulae;
4526
4527 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4528 LSRUse &LU = Uses[LUIdx];
4529 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4530 dbgs() << '\n');
4531
4532 bool Any = false;
4533 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4534 FIdx != NumForms; ++FIdx) {
4535 Formula &F = LU.Formulae[FIdx];
4536
4537 // Some formulas are instant losers. For example, they may depend on
4538 // nonexistent AddRecs from other loops. These need to be filtered
4539 // immediately, otherwise heuristics could choose them over others leading
4540 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4541 // avoids the need to recompute this information across formulae using the
4542 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4543 // the corresponding bad register from the Regs set.
4544 Cost CostF(L, SE, TTI, AMK);
4545 Regs.clear();
4546 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4547 if (CostF.isLoser()) {
4548 // During initial formula generation, undesirable formulae are generated
4549 // by uses within other loops that have some non-trivial address mode or
4550 // use the postinc form of the IV. LSR needs to provide these formulae
4551 // as the basis of rediscovering the desired formula that uses an AddRec
4552 // corresponding to the existing phi. Once all formulae have been
4553 // generated, these initial losers may be pruned.
4554 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4555 dbgs() << "\n");
4556 }
4557 else {
4559 for (const SCEV *Reg : F.BaseRegs) {
4560 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4561 Key.push_back(Reg);
4562 }
4563 if (F.ScaledReg &&
4564 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4565 Key.push_back(F.ScaledReg);
4566 // Unstable sort by host order ok, because this is only used for
4567 // uniquifying.
4568 llvm::sort(Key);
4569
4570 std::pair<BestFormulaeTy::const_iterator, bool> P =
4571 BestFormulae.insert(std::make_pair(Key, FIdx));
4572 if (P.second)
4573 continue;
4574
4575 Formula &Best = LU.Formulae[P.first->second];
4576
4577 Cost CostBest(L, SE, TTI, AMK);
4578 Regs.clear();
4579 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4580 if (CostF.isLess(CostBest))
4581 std::swap(F, Best);
4582 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4583 dbgs() << "\n"
4584 " in favor of formula ";
4585 Best.print(dbgs()); dbgs() << '\n');
4586 }
4587#ifndef NDEBUG
4588 ChangedFormulae = true;
4589#endif
4590 LU.DeleteFormula(F);
4591 --FIdx;
4592 --NumForms;
4593 Any = true;
4594 }
4595
4596 // Now that we've filtered out some formulae, recompute the Regs set.
4597 if (Any)
4598 LU.RecomputeRegs(LUIdx, RegUses);
4599
4600 // Reset this to prepare for the next use.
4601 BestFormulae.clear();
4602 }
4603
4604 LLVM_DEBUG(if (ChangedFormulae) {
4605 dbgs() << "\n"
4606 "After filtering out undesirable candidates:\n";
4607 print_uses(dbgs());
4608 });
4609}
4610
4611/// Estimate the worst-case number of solutions the solver might have to
4612/// consider. It almost never considers this many solutions because it prune the
4613/// search space, but the pruning isn't always sufficient.
4614size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4615 size_t Power = 1;
4616 for (const LSRUse &LU : Uses) {
4617 size_t FSize = LU.Formulae.size();
4618 if (FSize >= ComplexityLimit) {
4619 Power = ComplexityLimit;
4620 break;
4621 }
4622 Power *= FSize;
4623 if (Power >= ComplexityLimit)
4624 break;
4625 }
4626 return Power;
4627}
4628
4629/// When one formula uses a superset of the registers of another formula, it
4630/// won't help reduce register pressure (though it may not necessarily hurt
4631/// register pressure); remove it to simplify the system.
4632void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4633 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4634 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4635
4636 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4637 "which use a superset of registers used by other "
4638 "formulae.\n");
4639
4640 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4641 LSRUse &LU = Uses[LUIdx];
4642 bool Any = false;
4643 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4644 Formula &F = LU.Formulae[i];
4645 // Look for a formula with a constant or GV in a register. If the use
4646 // also has a formula with that same value in an immediate field,
4647 // delete the one that uses a register.
4649 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4650 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4651 Formula NewF = F;
4652 //FIXME: Formulas should store bitwidth to do wrapping properly.
4653 // See PR41034.
4654 NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
4655 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4656 (I - F.BaseRegs.begin()));
4657 if (LU.HasFormulaWithSameRegs(NewF)) {
4658 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4659 dbgs() << '\n');
4660 LU.DeleteFormula(F);
4661 --i;
4662 --e;
4663 Any = true;
4664 break;
4665 }
4666 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4667 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4668 if (!F.BaseGV) {
4669 Formula NewF = F;
4670 NewF.BaseGV = GV;
4671 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4672 (I - F.BaseRegs.begin()));
4673 if (LU.HasFormulaWithSameRegs(NewF)) {
4674 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4675 dbgs() << '\n');
4676 LU.DeleteFormula(F);
4677 --i;
4678 --e;
4679 Any = true;
4680 break;
4681 }
4682 }
4683 }
4684 }
4685 }
4686 if (Any)
4687 LU.RecomputeRegs(LUIdx, RegUses);
4688 }
4689
4690 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4691 }
4692}
4693
4694/// When there are many registers for expressions like A, A+1, A+2, etc.,
4695/// allocate a single register for them.
4696void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4697 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4698 return;
4699
4700 LLVM_DEBUG(
4701 dbgs() << "The search space is too complex.\n"
4702 "Narrowing the search space by assuming that uses separated "
4703 "by a constant offset will use the same registers.\n");
4704
4705 // This is especially useful for unrolled loops.
4706
4707 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4708 LSRUse &LU = Uses[LUIdx];
4709 for (const Formula &F : LU.Formulae) {
4710 if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
4711 continue;
4712
4713 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4714 if (!LUThatHas)
4715 continue;
4716
4717 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4718 LU.Kind, LU.AccessTy))
4719 continue;
4720
4721 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4722
4723 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4724
4725 // Transfer the fixups of LU to LUThatHas.
4726 for (LSRFixup &Fixup : LU.Fixups) {
4727 Fixup.Offset += F.BaseOffset;
4728 LUThatHas->pushFixup(Fixup);
4729 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4730 }
4731
4732 // Delete formulae from the new use which are no longer legal.
4733 bool Any = false;
4734 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4735 Formula &F = LUThatHas->Formulae[i];
4736 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4737 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4738 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4739 LUThatHas->DeleteFormula(F);
4740 --i;
4741 --e;
4742 Any = true;
4743 }
4744 }
4745
4746 if (Any)
4747 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4748
4749 // Delete the old use.
4750 DeleteUse(LU, LUIdx);
4751 --LUIdx;
4752 --NumUses;
4753 break;
4754 }
4755 }
4756
4757 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4758}
4759
4760/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4761/// we've done more filtering, as it may be able to find more formulae to
4762/// eliminate.
4763void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4764 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4765 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4766
4767 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4768 "undesirable dedicated registers.\n");
4769
4770 FilterOutUndesirableDedicatedRegisters();
4771
4772 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4773 }
4774}
4775
4776/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
4777/// Pick the best one and delete the others.
4778/// This narrowing heuristic is to keep as many formulae with different
4779/// Scale and ScaledReg pair as possible while narrowing the search space.
4780/// The benefit is that it is more likely to find out a better solution
4781/// from a formulae set with more Scale and ScaledReg variations than
4782/// a formulae set with the same Scale and ScaledReg. The picking winner
4783/// reg heuristic will often keep the formulae with the same Scale and
4784/// ScaledReg and filter others, and we want to avoid that if possible.
4785void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
4786 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4787 return;
4788
4789 LLVM_DEBUG(
4790 dbgs() << "The search space is too complex.\n"
4791 "Narrowing the search space by choosing the best Formula "
4792 "from the Formulae with the same Scale and ScaledReg.\n");
4793
4794 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
4795 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
4796
4797 BestFormulaeTy BestFormulae;
4798#ifndef NDEBUG
4799 bool ChangedFormulae = false;
4800#endif
4801 DenseSet<const SCEV *> VisitedRegs;
4803
4804 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4805 LSRUse &LU = Uses[LUIdx];
4806 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4807 dbgs() << '\n');
4808
4809 // Return true if Formula FA is better than Formula FB.
4810 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
4811 // First we will try to choose the Formula with fewer new registers.
4812 // For a register used by current Formula, the more the register is
4813 // shared among LSRUses, the less we increase the register number
4814 // counter of the formula.
4815 size_t FARegNum = 0;
4816 for (const SCEV *Reg : FA.BaseRegs) {
4817 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4818 FARegNum += (NumUses - UsedByIndices.count() + 1);
4819 }
4820 size_t FBRegNum = 0;
4821 for (const SCEV *Reg : FB.BaseRegs) {
4822 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4823 FBRegNum += (NumUses - UsedByIndices.count() + 1);
4824 }
4825 if (FARegNum != FBRegNum)
4826 return FARegNum < FBRegNum;
4827
4828 // If the new register numbers are the same, choose the Formula with
4829 // less Cost.
4830 Cost CostFA(L, SE, TTI, AMK);
4831 Cost CostFB(L, SE, TTI, AMK);
4832 Regs.clear();
4833 CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
4834 Regs.clear();
4835 CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
4836 return CostFA.isLess(CostFB);
4837 };
4838
4839 bool Any = false;
4840 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
4841 ++FIdx) {
4842 Formula &F = LU.Formulae[FIdx];
4843 if (!F.ScaledReg)
4844 continue;
4845 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
4846 if (P.second)
4847 continue;
4848
4849 Formula &Best = LU.Formulae[P.first->second];
4850 if (IsBetterThan(F, Best))
4851 std::swap(F, Best);
4852 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4853 dbgs() << "\n"
4854 " in favor of formula ";
4855 Best.print(dbgs()); dbgs() << '\n');
4856#ifndef NDEBUG
4857 ChangedFormulae = true;
4858#endif
4859 LU.DeleteFormula(F);
4860 --FIdx;
4861 --NumForms;
4862 Any = true;
4863 }
4864 if (Any)
4865 LU.RecomputeRegs(LUIdx, RegUses);
4866
4867 // Reset this to prepare for the next use.
4868 BestFormulae.clear();
4869 }
4870
4871 LLVM_DEBUG(if (ChangedFormulae) {
4872 dbgs() << "\n"
4873 "After filtering out undesirable candidates:\n";
4874 print_uses(dbgs());
4875 });
4876}
4877
4878/// If we are over the complexity limit, filter out any post-inc prefering
4879/// variables to only post-inc values.
4880void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
4881 if (AMK != TTI::AMK_PostIndexed)
4882 return;
4883 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4884 return;
4885
4886 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
4887 "Narrowing the search space by choosing the lowest "
4888 "register Formula for PostInc Uses.\n");
4889
4890 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4891 LSRUse &LU = Uses[LUIdx];
4892
4893 if (LU.Kind != LSRUse::Address)
4894 continue;
4895 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
4896 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
4897 continue;
4898
4899 size_t MinRegs = std::numeric_limits<size_t>::max();
4900 for (const Formula &F : LU.Formulae)
4901 MinRegs = std::min(F.getNumRegs(), MinRegs);
4902
4903 bool Any = false;
4904 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
4905 ++FIdx) {
4906 Formula &F = LU.Formulae[FIdx];
4907 if (F.getNumRegs() > MinRegs) {
4908 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4909 dbgs() << "\n");
4910 LU.DeleteFormula(F);
4911 --FIdx;
4912 --NumForms;
4913 Any = true;
4914 }
4915 }
4916 if (Any)
4917 LU.RecomputeRegs(LUIdx, RegUses);
4918
4919 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4920 break;
4921 }
4922
4923 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4924}
4925
4926/// The function delete formulas with high registers number expectation.
4927/// Assuming we don't know the value of each formula (already delete
4928/// all inefficient), generate probability of not selecting for each
4929/// register.
4930/// For example,
4931/// Use1:
4932/// reg(a) + reg({0,+,1})
4933/// reg(a) + reg({-1,+,1}) + 1
4934/// reg({a,+,1})
4935/// Use2:
4936/// reg(b) + reg({0,+,1})
4937/// reg(b) + reg({-1,+,1}) + 1
4938/// reg({b,+,1})
4939/// Use3:
4940/// reg(c) + reg(b) + reg({0,+,1})
4941/// reg(c) + reg({b,+,1})
4942///
4943/// Probability of not selecting
4944/// Use1 Use2 Use3
4945/// reg(a) (1/3) * 1 * 1
4946/// reg(b) 1 * (1/3) * (1/2)
4947/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
4948/// reg({-1,+,1}) (2/3) * (2/3) * 1
4949/// reg({a,+,1}) (2/3) * 1 * 1
4950/// reg({b,+,1}) 1 * (2/3) * (2/3)
4951/// reg(c) 1 * 1 * 0
4952///
4953/// Now count registers number mathematical expectation for each formula:
4954/// Note that for each use we exclude probability if not selecting for the use.
4955/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
4956/// probabilty 1/3 of not selecting for Use1).
4957/// Use1:
4958/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
4959/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
4960/// reg({a,+,1}) 1
4961/// Use2:
4962/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
4963/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
4964/// reg({b,+,1}) 2/3
4965/// Use3:
4966/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
4967/// reg(c) + reg({b,+,1}) 1 + 2/3
4968void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
4969 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4970 return;
4971 // Ok, we have too many of formulae on our hands to conveniently handle.
4972 // Use a rough heuristic to thin out the list.
4973
4974 // Set of Regs wich will be 100% used in final solution.
4975 // Used in each formula of a solution (in example above this is reg(c)).
4976 // We can skip them in calculations.
4978 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4979
4980 // Map each register to probability of not selecting
4981 DenseMap <const SCEV *, float> RegNumMap;
4982 for (const SCEV *Reg : RegUses) {
4983 if (UniqRegs.count(Reg))
4984 continue;
4985 float PNotSel = 1;
4986 for (const LSRUse &LU : Uses) {
4987 if (!LU.Regs.count(Reg))
4988 continue;
4989 float P = LU.getNotSelectedProbability(Reg);
4990 if (P != 0.0)
4991 PNotSel *= P;
4992 else
4993 UniqRegs.insert(Reg);
4994 }
4995 RegNumMap.insert(std::make_pair(Reg, PNotSel));
4996 }
4997
4998 LLVM_DEBUG(
4999 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5000
5001 // Delete formulas where registers number expectation is high.
5002 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5003 LSRUse &LU = Uses[LUIdx];
5004 // If nothing to delete - continue.
5005 if (LU.Formulae.size() < 2)
5006 continue;
5007 // This is temporary solution to test performance. Float should be
5008 // replaced with round independent type (based on integers) to avoid
5009 // different results for different target builds.
5010 float FMinRegNum = LU.Formulae[0].getNumRegs();
5011 float FMinARegNum = LU.Formulae[0].getNumRegs();
5012 size_t MinIdx = 0;
5013 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5014 Formula &F = LU.Formulae[i];
5015 float FRegNum = 0;
5016 float FARegNum = 0;
5017 for (const SCEV *BaseReg : F.BaseRegs) {
5018 if (UniqRegs.count(BaseReg))
5019 continue;
5020 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5021 if (isa<SCEVAddRecExpr>(BaseReg))
5022 FARegNum +=
5023 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5024 }
5025 if (const SCEV *ScaledReg = F.ScaledReg) {
5026 if (!UniqRegs.count(ScaledReg)) {
5027 FRegNum +=
5028 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5029 if (isa<SCEVAddRecExpr>(ScaledReg))
5030 FARegNum +=
5031 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5032 }
5033 }
5034 if (FMinRegNum > FRegNum ||
5035 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5036 FMinRegNum = FRegNum;
5037 FMinARegNum = FARegNum;
5038 MinIdx = i;
5039 }
5040 }
5041 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5042 dbgs() << " with min reg num " << FMinRegNum << '\n');
5043 if (MinIdx != 0)
5044 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5045 while (LU.Formulae.size() != 1) {
5046 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5047 dbgs() << '\n');
5048 LU.Formulae.pop_back();
5049 }
5050 LU.RecomputeRegs(LUIdx, RegUses);
5051 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5052 Formula &F = LU.Formulae[0];
5053 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5054 // When we choose the formula, the regs become unique.
5055 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5056 if (F.ScaledReg)
5057 UniqRegs.insert(F.ScaledReg);
5058 }
5059 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5060}
5061
5062// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5063// would the addressing offset +C would be legal where the negative offset -C is
5064// not.
5066 ScalarEvolution &SE, const SCEV *Best,
5067 const SCEV *Reg,
5068 MemAccessTy AccessType) {
5069 if (Best->getType() != Reg->getType() ||
5070 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5071 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5072 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5073 return false;
5074 const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Best, Reg));
5075 if (!Diff)
5076 return false;
5077
5079 AccessType.MemTy, /*BaseGV=*/nullptr,
5080 /*BaseOffset=*/Diff->getAPInt().getSExtValue(),
5081 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5083 AccessType.MemTy, /*BaseGV=*/nullptr,
5084 /*BaseOffset=*/-Diff->getAPInt().getSExtValue(),
5085 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5086}
5087
5088/// Pick a register which seems likely to be profitable, and then in any use
5089/// which has any reference to that register, delete all formulae which do not
5090/// reference that register.
5091void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5092 // With all other options exhausted, loop until the system is simple
5093 // enough to handle.
5095 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5096 // Ok, we have too many of formulae on our hands to conveniently handle.
5097 // Use a rough heuristic to thin out the list.
5098 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5099
5100 // Pick the register which is used by the most LSRUses, which is likely
5101 // to be a good reuse register candidate.
5102 const SCEV *Best = nullptr;
5103 unsigned BestNum = 0;
5104 for (const SCEV *Reg : RegUses) {
5105 if (Taken.count(Reg))
5106 continue;
5107 if (!Best) {
5108 Best = Reg;
5109 BestNum = RegUses.getUsedByIndices(Reg).count();
5110 } else {
5111 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5112 if (Count > BestNum) {
5113 Best = Reg;
5114 BestNum = Count;
5115 }
5116
5117 // If the scores are the same, but the Reg is simpler for the target
5118 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5119 // handle +C but not -C), opt for the simpler formula.
5120 if (Count == BestNum) {
5121 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5122 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5123 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5124 Uses[LUIdx].AccessTy)) {
5125 Best = Reg;
5126 BestNum = Count;
5127 }
5128 }
5129 }
5130 }
5131 assert(Best && "Failed to find best LSRUse candidate");
5132
5133 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5134 << " will yield profitable reuse.\n");
5135 Taken.insert(Best);
5136
5137 // In any use with formulae which references this register, delete formulae
5138 // which don't reference it.
5139 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5140 LSRUse &LU = Uses[LUIdx];
5141 if (!LU.Regs.count(Best)) continue;
5142
5143 bool Any = false;
5144 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5145 Formula &F = LU.Formulae[i];
5146 if (!F.referencesReg(Best)) {
5147 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5148 LU.DeleteFormula(F);
5149 --e;
5150 --i;
5151 Any = true;
5152 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5153 continue;
5154 }
5155 }
5156
5157 if (Any)
5158 LU.RecomputeRegs(LUIdx, RegUses);
5159 }
5160
5161 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5162 }
5163}
5164
5165/// If there are an extraordinary number of formulae to choose from, use some
5166/// rough heuristics to prune down the number of formulae. This keeps the main
5167/// solver from taking an extraordinary amount of time in some worst-case
5168/// scenarios.
5169void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5170 NarrowSearchSpaceByDetectingSupersets();
5171 NarrowSearchSpaceByCollapsingUnrolledCode();
5172 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5174 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5175 NarrowSearchSpaceByFilterPostInc();
5176 if (LSRExpNarrow)
5177 NarrowSearchSpaceByDeletingCostlyFormulas();
5178 else
5179 NarrowSearchSpaceByPickingWinnerRegs();
5180}
5181
5182/// This is the recursive solver.
5183void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5184 Cost &SolutionCost,
5186 const Cost &CurCost,
5187 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5188 DenseSet<const SCEV *> &VisitedRegs) const {
5189 // Some ideas:
5190 // - prune more:
5191 // - use more aggressive filtering
5192 // - sort the formula so that the most profitable solutions are found first
5193 // - sort the uses too
5194 // - search faster:
5195 // - don't compute a cost, and then compare. compare while computing a cost
5196 // and bail early.
5197 // - track register sets with SmallBitVector
5198
5199 const LSRUse &LU = Uses[Workspace.size()];
5200
5201 // If this use references any register that's already a part of the
5202 // in-progress solution, consider it a requirement that a formula must
5203 // reference that register in order to be considered. This prunes out
5204 // unprofitable searching.
5206 for (const SCEV *S : CurRegs)
5207 if (LU.Regs.count(S))
5208 ReqRegs.insert(S);
5209
5211 Cost NewCost(L, SE, TTI, AMK);
5212 for (const Formula &F : LU.Formulae) {
5213 // Ignore formulae which may not be ideal in terms of register reuse of
5214 // ReqRegs. The formula should use all required registers before
5215 // introducing new ones.
5216 // This can sometimes (notably when trying to favour postinc) lead to
5217 // sub-optimial decisions. There it is best left to the cost modelling to
5218 // get correct.
5219 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5220 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5221 for (const SCEV *Reg : ReqRegs) {
5222 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5223 is_contained(F.BaseRegs, Reg)) {
5224 --NumReqRegsToFind;
5225 if (NumReqRegsToFind == 0)
5226 break;
5227 }
5228 }
5229 if (NumReqRegsToFind != 0) {
5230 // If none of the formulae satisfied the required registers, then we could
5231 // clear ReqRegs and try again. Currently, we simply give up in this case.
5232 continue;
5233 }
5234 }
5235
5236 // Evaluate the cost of the current formula. If it's already worse than
5237 // the current best, prune the search at that point.
5238 NewCost = CurCost;
5239 NewRegs = CurRegs;
5240 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5241 if (NewCost.isLess(SolutionCost)) {
5242 Workspace.push_back(&F);
5243 if (Workspace.size() != Uses.size()) {
5244 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5245 NewRegs, VisitedRegs);
5246 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5247 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5248 } else {
5249 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5250 dbgs() << ".\nRegs:\n";
5251 for (const SCEV *S : NewRegs) dbgs()
5252 << "- " << *S << "\n";
5253 dbgs() << '\n');
5254
5255 SolutionCost = NewCost;
5256 Solution = Workspace;
5257 }
5258 Workspace.pop_back();
5259 }
5260 }
5261}
5262
5263/// Choose one formula from each use. Return the results in the given Solution
5264/// vector.
5265void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5267 Cost SolutionCost(L, SE, TTI, AMK);
5268 SolutionCost.Lose();
5269 Cost CurCost(L, SE, TTI, AMK);
5271 DenseSet<const SCEV *> VisitedRegs;
5272 Workspace.reserve(Uses.size());
5273
5274 // SolveRecurse does all the work.
5275 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5276 CurRegs, VisitedRegs);
5277 if (Solution.empty()) {
5278 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5279 return;
5280 }
5281
5282 // Ok, we've now made all our decisions.
5283 LLVM_DEBUG(dbgs() << "\n"
5284 "The chosen solution requires ";
5285 SolutionCost.print(dbgs()); dbgs() << ":\n";
5286 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5287 dbgs() << " ";
5288 Uses[i].print(dbgs());
5289 dbgs() << "\n"
5290 " ";
5291 Solution[i]->print(dbgs());
5292 dbgs() << '\n';
5293 });
5294
5295 assert(Solution.size() == Uses.size() && "Malformed solution!");
5296
5297 const bool EnableDropUnprofitableSolution = [&] {
5299 case cl::BOU_TRUE:
5300 return true;
5301 case cl::BOU_FALSE:
5302 return false;
5303 case cl::BOU_UNSET:
5305 }
5306 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5307 }();
5308
5309 if (BaselineCost.isLess(SolutionCost)) {
5310 if (!EnableDropUnprofitableSolution)
5311 LLVM_DEBUG(
5312 dbgs() << "Baseline is more profitable than chosen solution, "
5313 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5314 else {
5315 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5316 "solution, dropping LSR solution.\n";);
5317 Solution.clear();
5318 }
5319 }
5320}
5321
5322/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
5323/// we can go while still being dominated by the input positions. This helps
5324/// canonicalize the insert position, which encourages sharing.
5326LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5327 const SmallVectorImpl<Instruction *> &Inputs)
5328 const {
5329 Instruction *Tentative = &*IP;
5330 while (true) {
5331 bool AllDominate = true;
5332 Instruction *BetterPos = nullptr;
5333 // Don't bother attempting to insert before a catchswitch, their basic block
5334 // cannot have other non-PHI instructions.
5335 if (isa<CatchSwitchInst>(Tentative))
5336 return IP;
5337
5338 for (Instruction *Inst : Inputs) {
5339 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5340 AllDominate = false;
5341 break;
5342 }
5343 // Attempt to find an insert position in the middle of the block,
5344 // instead of at the end, so that it can be used for other expansions.
5345 if (Tentative->getParent() == Inst->getParent() &&
5346 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5347 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5348 }
5349 if (!AllDominate)
5350 break;
5351 if (BetterPos)
5352 IP = BetterPos->getIterator();
5353 else
5354 IP = Tentative->getIterator();
5355
5356 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5357 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5358
5359 BasicBlock *IDom;
5360 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5361 if (!Rung) return IP;
5362 Rung = Rung->getIDom();
5363 if (!Rung) return IP;
5364 IDom = Rung->getBlock();
5365
5366 // Don't climb into a loop though.
5367 const Loop *IDomLoop = LI.getLoopFor(IDom);
5368 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5369 if (IDomDepth <= IPLoopDepth &&
5370 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5371 break;
5372 }
5373
5374 Tentative = IDom->getTerminator();
5375 }
5376
5377 return IP;
5378}
5379
5380/// Determine an input position which will be dominated by the operands and
5381/// which will dominate the result.
5382BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5383 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5384 // Collect some instructions which must be dominated by the
5385 // expanding replacement. These must be dominated by any operands that
5386 // will be required in the expansion.
5388 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5389 Inputs.push_back(I);
5390 if (LU.Kind == LSRUse::ICmpZero)
5391 if (Instruction *I =
5392 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5393 Inputs.push_back(I);
5394 if (LF.PostIncLoops.count(L)) {
5395 if (LF.isUseFullyOutsideLoop(L))
5396 Inputs.push_back(L->getLoopLatch()->getTerminator());
5397 else
5398 Inputs.push_back(IVIncInsertPos);
5399 }
5400 // The expansion must also be dominated by the increment positions of any
5401 // loops it for which it is using post-inc mode.
5402 for (const Loop *PIL : LF.PostIncLoops) {
5403 if (PIL == L) continue;
5404
5405 // Be dominated by the loop exit.
5406 SmallVector<BasicBlock *, 4> ExitingBlocks;
5407 PIL->getExitingBlocks(ExitingBlocks);
5408 if (!ExitingBlocks.empty()) {
5409 BasicBlock *BB = ExitingBlocks[0];
5410 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5411 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5412 Inputs.push_back(BB->getTerminator());
5413 }
5414 }
5415
5416 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5417 && !isa<DbgInfoIntrinsic>(LowestIP) &&
5418 "Insertion point must be a normal instruction");
5419
5420 // Then, climb up the immediate dominator tree as far as we can go while
5421 // still being dominated by the input positions.
5422 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5423
5424 // Don't insert instructions before PHI nodes.
5425 while (isa<PHINode>(IP)) ++IP;
5426
5427 // Ignore landingpad instructions.
5428 while (IP->isEHPad()) ++IP;
5429
5430 // Ignore debug intrinsics.
5431 while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5432
5433 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5434 // IP consistent across expansions and allows the previously inserted
5435 // instructions to be reused by subsequent expansion.
5436 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5437 ++IP;
5438
5439 return IP;
5440}
5441
5442/// Emit instructions for the leading candidate expression for this LSRUse (this
5443/// is called "expanding").
5444Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5445 const Formula &F, BasicBlock::iterator IP,
5446 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5447 if (LU.RigidFormula)
5448 return LF.OperandValToReplace;
5449
5450 // Determine an input position which will be dominated by the operands and
5451 // which will dominate the result.
5452 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5453 Rewriter.setInsertPoint(&*IP);
5454
5455 // Inform the Rewriter if we have a post-increment use, so that it can
5456 // perform an advantageous expansion.
5457 Rewriter.setPostInc(LF.PostIncLoops);
5458
5459 // This is the type that the user actually needs.
5460 Type *OpTy = LF.OperandValToReplace->getType();
5461 // This will be the type that we'll initially expand to.
5462 Type *Ty = F.getType();
5463 if (!Ty)
5464 // No type known; just expand directly to the ultimate type.
5465 Ty = OpTy;
5466 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5467 // Expand directly to the ultimate type if it's the right size.
5468 Ty = OpTy;
5469 // This is the type to do integer arithmetic in.
5470 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5471
5472 // Build up a list of operands to add together to form the full base.
5474
5475 // Expand the BaseRegs portion.
5476 for (const SCEV *Reg : F.BaseRegs) {
5477 assert(!Reg->isZero() && "Zero allocated in a base register!");
5478
5479 // If we're expanding for a post-inc user, make the post-inc adjustment.
5480 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5481 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5482 }
5483
5484 // Expand the ScaledReg portion.
5485 Value *ICmpScaledV = nullptr;
5486 if (F.Scale != 0) {
5487 const SCEV *ScaledS = F.ScaledReg;
5488
5489 // If we're expanding for a post-inc user, make the post-inc adjustment.
5490 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5491 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5492
5493 if (LU.Kind == LSRUse::ICmpZero) {
5494 // Expand ScaleReg as if it was part of the base regs.
5495 if (F.Scale == 1)
5496 Ops.push_back(
5497 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5498 else {
5499 // An interesting way of "folding" with an icmp is to use a negated
5500 // scale, which we'll implement by inserting it into the other operand
5501 // of the icmp.
5502 assert(F.Scale == -1 &&
5503 "The only scale supported by ICmpZero uses is -1!");
5504 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5505 }
5506 } else {
5507 // Otherwise just expand the scaled register and an explicit scale,
5508 // which is expected to be matched as part of the address.
5509
5510 // Flush the operand list to suppress SCEVExpander hoisting address modes.
5511 // Unless the addressing mode will not be folded.
5512 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5513 isAMCompletelyFolded(TTI, LU, F)) {
5514 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5515 Ops.clear();
5516 Ops.push_back(SE.getUnknown(FullV));
5517 }
5518 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5519 if (F.Scale != 1)
5520 ScaledS =
5521 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5522 Ops.push_back(ScaledS);
5523 }
5524 }
5525
5526 // Expand the GV portion.
5527 if (F.BaseGV) {
5528 // Flush the operand list to suppress SCEVExpander hoisting.
5529 if (!Ops.empty()) {
5530 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5531 Ops.clear();
5532 Ops.push_back(SE.getUnknown(FullV));
5533 }
5534 Ops.push_back(SE.getUnknown(F.BaseGV));
5535 }
5536
5537 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5538 // unfolded offsets. LSR assumes they both live next to their uses.
5539 if (!Ops.empty()) {
5540 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5541 Ops.clear();
5542 Ops.push_back(SE.getUnknown(FullV));
5543 }
5544
5545 // Expand the immediate portion.
5546 int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
5547 if (Offset != 0) {
5548 if (LU.Kind == LSRUse::ICmpZero) {
5549 // The other interesting way of "folding" with an ICmpZero is to use a
5550 // negated immediate.
5551 if (!ICmpScaledV)
5552 ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
5553 else {
5554 Ops.push_back(SE.getUnknown(ICmpScaledV));
5555 ICmpScaledV = ConstantInt::get(IntTy, Offset);
5556 }
5557 } else {
5558 // Just add the immediate values. These again are expected to be matched
5559 // as part of the address.
5561 }
5562 }
5563
5564 // Expand the unfolded offset portion.
5565 int64_t UnfoldedOffset = F.UnfoldedOffset;
5566 if (UnfoldedOffset != 0) {
5567 // Just add the immediate values.
5569 UnfoldedOffset)));
5570 }
5571
5572 // Emit instructions summing all the operands.
5573 const SCEV *FullS = Ops.empty() ?
5574 SE.getConstant(IntTy, 0) :
5575 SE.getAddExpr(Ops);
5576 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5577
5578 // We're done expanding now, so reset the rewriter.
5579 Rewriter.clearPostInc();
5580
5581 // An ICmpZero Formula represents an ICmp which we're handling as a
5582 // comparison against zero. Now that we've expanded an expression for that
5583 // form, update the ICmp's other operand.
5584 if (LU.Kind == LSRUse::ICmpZero) {
5585 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5586 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5587 DeadInsts.emplace_back(OperandIsInstr);
5588 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5589 "a scale at the same time!");
5590 if (F.Scale == -1) {
5591 if (ICmpScaledV->getType() != OpTy) {
5593 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5594 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5595 ICmpScaledV = Cast;
5596 }
5597 CI->setOperand(1, ICmpScaledV);
5598 } else {
5599 // A scale of 1 means that the scale has been expanded as part of the
5600 // base regs.
5601 assert((F.Scale == 0 || F.Scale == 1) &&
5602 "ICmp does not support folding a global value and "
5603 "a scale at the same time!");
5605 -(uint64_t)Offset);
5606 if (C->getType() != OpTy) {
5608 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5609 CI->getDataLayout());
5610 assert(C && "Cast of ConstantInt should have folded");
5611 }
5612
5613 CI->setOperand(1, C);
5614 }
5615 }
5616
5617 return FullV;
5618}
5619
5620/// Helper for Rewrite. PHI nodes are special because the use of their operands
5621/// effectively happens in their predecessor blocks, so the expression may need
5622/// to be expanded in multiple places.
5623void LSRInstance::RewriteForPHI(
5624 PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5625 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5627
5628 // Inserting instructions in the loop and using them as PHI's input could
5629 // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
5630 // corresponding incoming block is not loop exiting). So collect all such
5631 // instructions to form LCSSA for them later.
5632 SmallVector<Instruction *, 4> InsertedNonLCSSAInsts;
5633
5634 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5635 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5636 bool needUpdateFixups = false;
5637 BasicBlock *BB = PN->getIncomingBlock(i);
5638
5639 // If this is a critical edge, split the edge so that we do not insert
5640 // the code on all predecessor/successor paths. We do this unless this
5641 // is the canonical backedge for this loop, which complicates post-inc
5642 // users.
5643 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5644 !isa<IndirectBrInst>(BB->getTerminator()) &&
5645 !isa<CatchSwitchInst>(BB->getTerminator())) {
5646 BasicBlock *Parent = PN->getParent();
5647 Loop *PNLoop = LI.getLoopFor(Parent);
5648 if (!PNLoop || Parent != PNLoop->getHeader()) {
5649 // Split the critical edge.
5650 BasicBlock *NewBB = nullptr;
5651 if (!Parent->isLandingPad()) {
5652 NewBB =
5653 SplitCriticalEdge(BB, Parent,
5654 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5655 .setMergeIdenticalEdges()
5656 .setKeepOneInputPHIs());
5657 } else {
5659 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5660 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5661 NewBB = NewBBs[0];
5662 }
5663 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5664 // phi predecessors are identical. The simple thing to do is skip
5665 // splitting in this case rather than complicate the API.
5666 if (NewBB) {
5667 // If PN is outside of the loop and BB is in the loop, we want to
5668 // move the block to be immediately before the PHI block, not
5669 // immediately after BB.
5670 if (L->contains(BB) && !L->contains(PN))
5671 NewBB->moveBefore(PN->getParent());
5672
5673 // Splitting the edge can reduce the number of PHI entries we have.
5674 e = PN->getNumIncomingValues();
5675 BB = NewBB;
5676 i = PN->getBasicBlockIndex(BB);
5677
5678 needUpdateFixups = true;
5679 }
5680 }
5681 }
5682
5683 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5684 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5685 if (!Pair.second)
5686 PN->setIncomingValue(i, Pair.first->second);
5687 else {
5688 Value *FullV =
5689 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5690
5691 // If this is reuse-by-noop-cast, insert the noop cast.
5692 Type *OpTy = LF.OperandValToReplace->getType();
5693 if (FullV->getType() != OpTy)
5694 FullV = CastInst::Create(
5695 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5696 LF.OperandValToReplace->getType(), "tmp",
5697 BB->getTerminator()->getIterator());
5698
5699 // If the incoming block for this value is not in the loop, it means the
5700 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5701 // the inserted value.
5702 if (auto *I = dyn_cast<Instruction>(FullV))
5703 if (L->contains(I) && !L->contains(BB))
5704 InsertedNonLCSSAInsts.push_back(I);
5705
5706 PN->setIncomingValue(i, FullV);
5707 Pair.first->second = FullV;
5708 }
5709
5710 // If LSR splits critical edge and phi node has other pending
5711 // fixup operands, we need to update those pending fixups. Otherwise
5712 // formulae will not be implemented completely and some instructions
5713 // will not be eliminated.
5714 if (needUpdateFixups) {
5715 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
5716 for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
5717 // If fixup is supposed to rewrite some operand in the phi
5718 // that was just updated, it may be already moved to
5719 // another phi node. Such fixup requires update.
5720 if (Fixup.UserInst == PN) {
5721 // Check if the operand we try to replace still exists in the
5722 // original phi.
5723 bool foundInOriginalPHI = false;
5724 for (const auto &val : PN->incoming_values())
5725 if (val == Fixup.OperandValToReplace) {
5726 foundInOriginalPHI = true;
5727 break;
5728 }
5729
5730 // If fixup operand found in original PHI - nothing to do.
5731 if (foundInOriginalPHI)
5732 continue;
5733
5734 // Otherwise it might be moved to another PHI and requires update.
5735 // If fixup operand not found in any of the incoming blocks that
5736 // means we have already rewritten it - nothing to do.
5737 for (const auto &Block : PN->blocks())
5738 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5739 ++I) {
5740 PHINode *NewPN = cast<PHINode>(I);
5741 for (const auto &val : NewPN->incoming_values())
5742 if (val == Fixup.OperandValToReplace)
5743 Fixup.UserInst = NewPN;
5744 }
5745 }
5746 }
5747 }
5748
5749 formLCSSAForInstructions(InsertedNonLCSSAInsts, DT, LI, &SE);
5750}
5751
5752/// Emit instructions for the leading candidate expression for this LSRUse (this
5753/// is called "expanding"), and update the UserInst to reference the newly
5754/// expanded value.
5755void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5756 const Formula &F,
5757 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5758 // First, find an insertion point that dominates UserInst. For PHI nodes,
5759 // find the nearest block which dominates all the relevant uses.
5760 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5761 RewriteForPHI(PN, LU, LF, F, DeadInsts);
5762 } else {
5763 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
5764
5765 // If this is reuse-by-noop-cast, insert the noop cast.
5766 Type *OpTy = LF.OperandValToReplace->getType();
5767 if (FullV->getType() != OpTy) {
5768 Instruction *Cast =
5769 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
5770 FullV, OpTy, "tmp", LF.UserInst->getIterator());
5771 FullV = Cast;
5772 }
5773
5774 // Update the user. ICmpZero is handled specially here (for now) because
5775 // Expand may have updated one of the operands of the icmp already, and
5776 // its new value may happen to be equal to LF.OperandValToReplace, in
5777 // which case doing replaceUsesOfWith leads to replacing both operands
5778 // with the same value. TODO: Reorganize this.
5779 if (LU.Kind == LSRUse::ICmpZero)
5780 LF.UserInst->setOperand(0, FullV);
5781 else
5782 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
5783 }
5784
5785 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
5786 DeadInsts.emplace_back(OperandIsInstr);
5787}
5788
5789// Trying to hoist the IVInc to loop header if all IVInc users are in
5790// the loop header. It will help backend to generate post index load/store
5791// when the latch block is different from loop header block.
5792static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
5793 const LSRUse &LU, Instruction *IVIncInsertPos,
5794 Loop *L) {
5795 if (LU.Kind != LSRUse::Address)
5796 return false;
5797
5798 // For now this code do the conservative optimization, only work for
5799 // the header block. Later we can hoist the IVInc to the block post
5800 // dominate all users.
5801 BasicBlock *LHeader = L->getHeader();
5802 if (IVIncInsertPos->getParent() == LHeader)
5803 return false;
5804
5805 if (!Fixup.OperandValToReplace ||
5806 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
5807 Instruction *UI = cast<Instruction>(U);
5808 return UI->getParent() != LHeader;
5809 }))
5810 return false;
5811
5812 Instruction *I = Fixup.UserInst;
5813 Type *Ty = I->getType();
5814 return Ty->isIntegerTy() &&
5815 ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
5816 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
5817}
5818
5819/// Rewrite all the fixup locations with new values, following the chosen
5820/// solution.
5821void LSRInstance::ImplementSolution(
5822 const SmallVectorImpl<const Formula *> &Solution) {
5823 // Keep track of instructions we may have made dead, so that
5824 // we can remove them after we are done working.
5826
5827 // Mark phi nodes that terminate chains so the expander tries to reuse them.
5828 for (const IVChain &Chain : IVChainVec) {
5829 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
5830 Rewriter.setChainedPhi(PN);
5831 }
5832
5833 // Expand the new value definitions and update the users.
5834 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
5835 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
5836 Instruction *InsertPos =
5837 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
5838 ? L->getHeader()->getTerminator()
5839 : IVIncInsertPos;
5840 Rewriter.setIVIncInsertPos(L, InsertPos);
5841 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
5842 Changed = true;
5843 }
5844
5845 for (const IVChain &Chain : IVChainVec) {
5846 GenerateIVChain(Chain, DeadInsts);
5847 Changed = true;
5848 }
5849
5850 for (const WeakVH &IV : Rewriter.getInsertedIVs())
5851 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
5852 ScalarEvolutionIVs.push_back(IV);
5853
5854 // Clean up after ourselves. This must be done before deleting any
5855 // instructions.
5856 Rewriter.clear();
5857
5859 &TLI, MSSAU);
5860
5861 // In our cost analysis above, we assume that each addrec consumes exactly
5862 // one register, and arrange to have increments inserted just before the
5863 // latch to maximimize the chance this is true. However, if we reused
5864 // existing IVs, we now need to move the increments to match our
5865 // expectations. Otherwise, our cost modeling results in us having a
5866 // chosen a non-optimal result for the actual schedule. (And yes, this
5867 // scheduling decision does impact later codegen.)
5868 for (PHINode &PN : L->getHeader()->phis()) {
5869 BinaryOperator *BO = nullptr;
5870 Value *Start = nullptr, *Step = nullptr;
5871 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
5872 continue;
5873
5874 switch (BO->getOpcode()) {
5875 case Instruction::Sub:
5876 if (BO->getOperand(0) != &PN)
5877 // sub is non-commutative - match handling elsewhere in LSR
5878 continue;
5879 break;
5880 case Instruction::Add:
5881 break;
5882 default:
5883 continue;
5884 };
5885
5886 if (!isa<Constant>(Step))
5887 // If not a constant step, might increase register pressure
5888 // (We assume constants have been canonicalized to RHS)
5889 continue;
5890
5891 if (BO->getParent() == IVIncInsertPos->getParent())
5892 // Only bother moving across blocks. Isel can handle block local case.
5893 continue;
5894
5895 // Can we legally schedule inc at the desired point?
5896 if (!llvm::all_of(BO->uses(),
5897 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
5898 continue;
5899 BO->moveBefore(IVIncInsertPos);
5900 Changed = true;
5901 }
5902
5903
5904}
5905
5906LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
5907 DominatorTree &DT, LoopInfo &LI,
5910 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
5911 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
5913 : TTI.getPreferredAddressingMode(L, &SE)),
5914 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
5915 BaselineCost(L, SE, TTI, AMK) {
5916 // If LoopSimplify form is not available, stay out of trouble.
5917 if (!L->isLoopSimplifyForm())
5918 return;
5919
5920 // If there's no interesting work to be done, bail early.
5921 if (IU.empty()) return;
5922
5923 // If there's too much analysis to be done, bail early. We won't be able to
5924 // model the problem anyway.
5925 unsigned NumUsers = 0;
5926 for (const IVStrideUse &U : IU) {
5927 if (++NumUsers > MaxIVUsers) {
5928 (void)U;
5929 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
5930 << "\n");
5931 return;
5932 }
5933 // Bail out if we have a PHI on an EHPad that gets a value from a
5934 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
5935 // no good place to stick any instructions.
5936 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
5937 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
5938 if (isa<FuncletPadInst>(FirstNonPHI) ||
5939 isa<CatchSwitchInst>(FirstNonPHI))
5940 for (BasicBlock *PredBB : PN->blocks())
5941 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
5942 return;
5943 }
5944 }
5945
5946 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
5947 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
5948 dbgs() << ":\n");
5949
5950 // Configure SCEVExpander already now, so the correct mode is used for
5951 // isSafeToExpand() checks.
5952#ifndef NDEBUG
5953 Rewriter.setDebugType(DEBUG_TYPE);
5954#endif
5955 Rewriter.disableCanonicalMode();
5956 Rewriter.enableLSRMode();
5957
5958 // First, perform some low-level loop optimizations.
5959 OptimizeShadowIV();
5960 OptimizeLoopTermCond();
5961
5962 // If loop preparation eliminates all interesting IV users, bail.
5963 if (IU.empty()) return;
5964
5965 // Skip nested loops until we can model them better with formulae.
5966 if (!L->isInnermost()) {
5967 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
5968 return;
5969 }
5970
5971 // Start collecting data and preparing for the solver.
5972 // If number of registers is not the major cost, we cannot benefit from the
5973 // current profitable chain optimization which is based on number of
5974 // registers.
5975 // FIXME: add profitable chain optimization for other kinds major cost, for
5976 // example number of instructions.
5978 CollectChains();
5979 CollectInterestingTypesAndFactors();
5980 CollectFixupsAndInitialFormulae();
5981 CollectLoopInvariantFixupsAndFormulae();
5982
5983 if (Uses.empty())
5984 return;
5985
5986 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
5987 print_uses(dbgs()));
5988 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
5989 BaselineCost.print(dbgs()); dbgs() << "\n");
5990
5991 // Now use the reuse data to generate a bunch of interesting ways
5992 // to formulate the values needed for the uses.
5993 GenerateAllReuseFormulae();
5994
5995 FilterOutUndesirableDedicatedRegisters();
5996 NarrowSearchSpaceUsingHeuristics();
5997
5999 Solve(Solution);
6000
6001 // Release memory that is no longer needed.
6002 Factors.clear();
6003 Types.clear();
6004 RegUses.clear();
6005
6006 if (Solution.empty())
6007 return;
6008
6009#ifndef NDEBUG
6010 // Formulae should be legal.
6011 for (const LSRUse &LU : Uses) {
6012 for (const Formula &F : LU.Formulae)
6013 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6014 F) && "Illegal formula generated!");
6015 };
6016#endif
6017
6018 // Now that we've decided what we want, make it so.
6019 ImplementSolution(Solution);
6020}
6021
6022#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6023void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6024 if (Factors.empty() && Types.empty()) return;
6025
6026 OS << "LSR has identified the following interesting factors and types: ";
6027 bool First = true;
6028
6029 for (int64_t Factor : Factors) {
6030 if (!First) OS << ", ";
6031 First = false;
6032 OS << '*' << Factor;
6033 }
6034
6035 for (Type *Ty : Types) {
6036 if (!First) OS << ", ";
6037 First = false;
6038 OS << '(' << *Ty << ')';
6039 }
6040 OS << '\n';
6041}
6042
6043void LSRInstance::print_fixups(raw_ostream &OS) const {
6044 OS << "LSR is examining the following fixup sites:\n";
6045 for (const LSRUse &LU : Uses)
6046 for (const LSRFixup &LF : LU.Fixups) {
6047 dbgs() << " ";
6048 LF.print(OS);
6049 OS << '\n';
6050 }
6051}
6052
6053void LSRInstance::print_uses(raw_ostream &OS) const {
6054 OS << "LSR is examining the following uses:\n";
6055 for (const LSRUse &LU : Uses) {
6056 dbgs() << " ";
6057 LU.print(OS);
6058 OS << '\n';
6059 for (const Formula &F : LU.Formulae) {
6060 OS << " ";
6061 F.print(OS);
6062 OS << '\n';
6063 }
6064 }
6065}
6066
6067void LSRInstance::print(raw_ostream &OS) const {
6068 print_factors_and_types(OS);
6069 print_fixups(OS);
6070 print_uses(OS);
6071}
6072
6073LLVM_DUMP_METHOD void LSRInstance::dump() const {
6074 print(errs()); errs() << '\n';
6075}
6076#endif
6077
6078namespace {
6079
6080class LoopStrengthReduce : public LoopPass {
6081public:
6082 static char ID; // Pass ID, replacement for typeid
6083
6084 LoopStrengthReduce();
6085
6086private:
6087 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6088 void getAnalysisUsage(AnalysisUsage &AU) const override;
6089};
6090
6091} // end anonymous namespace
6092
6093LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6095}
6096
6097void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6098 // We split critical edges, so we change the CFG. However, we do update
6099 // many analyses if they are around.
6101
6111 // Requiring LoopSimplify a second time here prevents IVUsers from running
6112 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6118}
6119
6120namespace {
6121
6122/// Enables more convenient iteration over a DWARF expression vector.
6124ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6129 return {Begin, End};
6130}
6131
6132struct SCEVDbgValueBuilder {
6133 SCEVDbgValueBuilder() = default;
6134 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6135
6136 void clone(const SCEVDbgValueBuilder &Base) {
6137 LocationOps = Base.LocationOps;
6138 Expr = Base.Expr;
6139 }
6140
6141 void clear() {
6142 LocationOps.clear();
6143 Expr.clear();
6144 }
6145
6146 /// The DIExpression as we translate the SCEV.
6148 /// The location ops of the DIExpression.
6149 SmallVector<Value *, 2> LocationOps;
6150
6151 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6152 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6153
6154 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6155 /// in the set of values referenced by the expression.
6156 void pushLocation(llvm::Value *V) {
6158 auto *It = llvm::find(LocationOps, V);
6159 unsigned ArgIndex = 0;
6160 if (It != LocationOps.end()) {
6161 ArgIndex = std::distance(LocationOps.begin(), It);
6162 } else {
6163 ArgIndex = LocationOps.size();
6164 LocationOps.push_back(V);
6165 }
6166 Expr.push_back(ArgIndex);
6167 }
6168
6169 void pushValue(const SCEVUnknown *U) {
6170 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6171 pushLocation(V);
6172 }
6173
6174 bool pushConst(const SCEVConstant *C) {
6175 if (C->getAPInt().getSignificantBits() > 64)
6176 return false;
6177 Expr.push_back(llvm::dwarf::DW_OP_consts);
6178 Expr.push_back(C->getAPInt().getSExtValue());
6179 return true;
6180 }
6181
6182 // Iterating the expression as DWARF ops is convenient when updating
6183 // DWARF_OP_LLVM_args.
6185 return ToDwarfOpIter(Expr);
6186 }
6187
6188 /// Several SCEV types are sequences of the same arithmetic operator applied
6189 /// to constants and values that may be extended or truncated.
6190 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6191 uint64_t DwarfOp) {
6192 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6193 "Expected arithmetic SCEV type");
6194 bool Success = true;
6195 unsigned EmitOperator = 0;
6196 for (const auto &Op : CommExpr->operands()) {
6197 Success &= pushSCEV(Op);
6198
6199 if (EmitOperator >= 1)
6200 pushOperator(DwarfOp);
6201 ++EmitOperator;
6202 }
6203 return Success;
6204 }
6205
6206 // TODO: Identify and omit noop casts.
6207 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6208 const llvm::SCEV *Inner = C->getOperand(0);
6209 const llvm::Type *Type = C->getType();
6210 uint64_t ToWidth = Type->getIntegerBitWidth();
6211 bool Success = pushSCEV(Inner);
6212 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6213 IsSigned ? llvm::dwarf::DW_ATE_signed
6214 : llvm::dwarf::DW_ATE_unsigned};
6215 for (const auto &Op : CastOps)
6216 pushOperator(Op);
6217 return Success;
6218 }
6219
6220 // TODO: MinMax - although these haven't been encountered in the test suite.
6221 bool pushSCEV(const llvm::SCEV *S) {
6222 bool Success = true;
6223 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6224 Success &= pushConst(StartInt);
6225
6226 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6227 if (!U->getValue())
6228 return false;
6229 pushLocation(U->getValue());
6230
6231 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6232 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6233
6234 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6235 Success &= pushSCEV(UDiv->getLHS());
6236 Success &= pushSCEV(UDiv->getRHS());
6237 pushOperator(llvm::dwarf::DW_OP_div);
6238
6239 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6240 // Assert if a new and unknown SCEVCastEXpr type is encountered.
6241 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6242 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6243 "Unexpected cast type in SCEV.");
6244 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6245
6246 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6247 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6248
6249 } else if (isa<SCEVAddRecExpr>(S)) {
6250 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6251 // unsupported.
6252 return false;
6253
6254 } else {
6255 return false;
6256 }
6257 return Success;
6258 }
6259
6260 /// Return true if the combination of arithmetic operator and underlying
6261 /// SCEV constant value is an identity function.
6262 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6263 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6264 if (C->getAPInt().getSignificantBits() > 64)
6265 return false;
6266 int64_t I = C->getAPInt().getSExtValue();
6267 switch (Op) {
6268 case llvm::dwarf::DW_OP_plus:
6269 case llvm::dwarf::DW_OP_minus:
6270 return I == 0;
6271 case llvm::dwarf::DW_OP_mul:
6272 case llvm::dwarf::DW_OP_div:
6273 return I == 1;
6274 }
6275 }
6276 return false;
6277 }
6278
6279 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6280 /// builder's expression stack. The stack should already contain an
6281 /// expression for the iteration count, so that it can be multiplied by
6282 /// the stride and added to the start.
6283 /// Components of the expression are omitted if they are an identity function.
6284 /// Chain (non-affine) SCEVs are not supported.
6285 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6286 assert(SAR.isAffine() && "Expected affine SCEV");
6287 // TODO: Is this check needed?
6288 if (isa<SCEVAddRecExpr>(SAR.getStart()))
6289 return false;
6290
6291 const SCEV *Start = SAR.getStart();
6292 const SCEV *Stride = SAR.getStepRecurrence(SE);
6293
6294 // Skip pushing arithmetic noops.
6295 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6296 if (!pushSCEV(Stride))
6297 return false;
6298 pushOperator(llvm::dwarf::DW_OP_mul);
6299 }
6300 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6301 if (!pushSCEV(Start))
6302 return false;
6303 pushOperator(llvm::dwarf::DW_OP_plus);
6304 }
6305 return true;
6306 }
6307
6308 /// Create an expression that is an offset from a value (usually the IV).
6309 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6310 pushLocation(OffsetValue);
6312 LLVM_DEBUG(
6313 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6314 << std::to_string(Offset) << "\n");
6315 }
6316
6317 /// Combine a translation of the SCEV and the IV to create an expression that
6318 /// recovers a location's value.
6319 /// returns true if an expression was created.
6320 bool createIterCountExpr(const SCEV *S,
6321 const SCEVDbgValueBuilder &IterationCount,
6322 ScalarEvolution &SE) {
6323 // SCEVs for SSA values are most frquently of the form
6324 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6325 // This is because %a is a PHI node that is not the IV. However, these
6326 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6327 // so its not expected this point will be reached.
6328 if (!isa<SCEVAddRecExpr>(S))
6329 return false;
6330
6331 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6332 << '\n');
6333
6334 const auto *Rec = cast<SCEVAddRecExpr>(S);
6335 if (!Rec->isAffine())
6336 return false;
6337
6339 return false;
6340
6341 // Initialise a new builder with the iteration count expression. In
6342 // combination with the value's SCEV this enables recovery.
6343 clone(IterationCount);
6344 if (!SCEVToValueExpr(*Rec, SE))
6345 return false;
6346
6347 return true;
6348 }
6349
6350 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6351 /// builder's expression stack. The stack should already contain an
6352 /// expression for the iteration count, so that it can be multiplied by
6353 /// the stride and added to the start.
6354 /// Components of the expression are omitted if they are an identity function.
6355 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6356 ScalarEvolution &SE) {
6357 assert(SAR.isAffine() && "Expected affine SCEV");
6358 if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6359 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6360 << SAR << '\n');
6361 return false;
6362 }
6363 const SCEV *Start = SAR.getStart();
6364 const SCEV *Stride = SAR.getStepRecurrence(SE);
6365
6366 // Skip pushing arithmetic noops.
6367 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6368 if (!pushSCEV(Start))
6369 return false;
6370 pushOperator(llvm::dwarf::DW_OP_minus);
6371 }
6372 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6373 if (!pushSCEV(Stride))
6374 return false;
6375 pushOperator(llvm::dwarf::DW_OP_div);
6376 }
6377 return true;
6378 }
6379
6380 // Append the current expression and locations to a location list and an
6381 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6382 // the locations already present in the destination list.
6383 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6384 SmallVectorImpl<Value *> &DestLocations) {
6385 assert(!DestLocations.empty() &&
6386 "Expected the locations vector to contain the IV");
6387 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6388 // modified to account for the locations already in the destination vector.
6389 // All builders contain the IV as the first location op.
6390 assert(!LocationOps.empty() &&
6391 "Expected the location ops to contain the IV.");
6392 // DestIndexMap[n] contains the index in DestLocations for the nth
6393 // location in this SCEVDbgValueBuilder.
6394 SmallVector<uint64_t, 2> DestIndexMap;
6395 for (const auto &Op : LocationOps) {
6396 auto It = find(DestLocations, Op);
6397 if (It != DestLocations.end()) {
6398 // Location already exists in DestLocations, reuse existing ArgIndex.
6399 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6400 continue;
6401 }
6402 // Location is not in DestLocations, add it.
6403 DestIndexMap.push_back(DestLocations.size());
6404 DestLocations.push_back(Op);
6405 }
6406
6407 for (const auto &Op : expr_ops()) {
6408 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6409 Op.appendToVector(DestExpr);
6410 continue;
6411 }
6412
6414 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6415 // DestIndexMap[n] contains its new index in DestLocations.
6416 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6417 DestExpr.push_back(NewIndex);
6418 }
6419 }
6420};
6421
6422/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6423/// and DIExpression.
6424struct DVIRecoveryRec {
6425 DVIRecoveryRec(DbgValueInst *DbgValue)
6426 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6427 HadLocationArgList(false) {}
6428 DVIRecoveryRec(DbgVariableRecord *DVR)
6429 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6430
6432 DIExpression *Expr;
6433 bool HadLocationArgList;
6434 SmallVector<WeakVH, 2> LocationOps;
6437
6438 void clear() {
6439 for (auto &RE : RecoveryExprs)
6440 RE.reset();
6441 RecoveryExprs.clear();
6442 }
6443
6444 ~DVIRecoveryRec() { clear(); }
6445};
6446} // namespace
6447
6448/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6449/// This helps in determining if a DIArglist is necessary or can be omitted from
6450/// the dbg.value.
6452 auto expr_ops = ToDwarfOpIter(Expr);
6453 unsigned Count = 0;
6454 for (auto Op : expr_ops)
6455 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6456 Count++;
6457 return Count;
6458}
6459
6460/// Overwrites DVI with the location and Ops as the DIExpression. This will
6461/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6462/// because a DIArglist is not created for the first argument of the dbg.value.
6463template <typename T>
6464static void updateDVIWithLocation(T &DbgVal, Value *Location,
6466 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6467 "contain any DW_OP_llvm_arg operands.");
6468 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6469 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6470 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6471}
6472
6473/// Overwrite DVI with locations placed into a DIArglist.
6474template <typename T>
6475static void updateDVIWithLocations(T &DbgVal,
6476 SmallVectorImpl<Value *> &Locations,
6478 assert(numLLVMArgOps(Ops) != 0 &&
6479 "Expected expression that references DIArglist locations using "
6480 "DW_OP_llvm_arg operands.");
6482 for (Value *V : Locations)
6483 MetadataLocs.push_back(ValueAsMetadata::get(V));
6484 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6485 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6486 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6487}
6488
6489/// Write the new expression and new location ops for the dbg.value. If possible
6490/// reduce the szie of the dbg.value intrinsic by omitting DIArglist. This
6491/// can be omitted if:
6492/// 1. There is only a single location, refenced by a single DW_OP_llvm_arg.
6493/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6494static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6495 SmallVectorImpl<Value *> &NewLocationOps,
6497 auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6498 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6499 if (NumLLVMArgs == 0) {
6500 // Location assumed to be on the stack.
6501 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6502 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6503 // There is only a single DW_OP_llvm_arg at the start of the expression,
6504 // so it can be omitted along with DIArglist.
6505 assert(NewExpr[1] == 0 &&
6506 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6508 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6509 } else {
6510 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6511 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6512 }
6513
6514 // If the DIExpression was previously empty then add the stack terminator.
6515 // Non-empty expressions have only had elements inserted into them and so
6516 // the terminator should already be present e.g. stack_value or fragment.
6517 DIExpression *SalvageExpr = DbgVal->getExpression();
6518 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6519 SalvageExpr =
6520 DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6521 DbgVal->setExpression(SalvageExpr);
6522 }
6523 };
6524 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6525 UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6526 else
6527 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6528}
6529
6530/// Cached location ops may be erased during LSR, in which case a poison is
6531/// required when restoring from the cache. The type of that location is no
6532/// longer available, so just use int8. The poison will be replaced by one or
6533/// more locations later when a SCEVDbgValueBuilder selects alternative
6534/// locations to use for the salvage.
6536 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6537}
6538
6539/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6540static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6541 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6542 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6543 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6544 assert(DVIRec.Expr && "Expected an expression");
6545 DbgVal->setExpression(DVIRec.Expr);
6546
6547 // Even a single location-op may be inside a DIArgList and referenced with
6548 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6549 if (!DVIRec.HadLocationArgList) {
6550 assert(DVIRec.LocationOps.size() == 1 &&
6551 "Unexpected number of location ops.");
6552 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6553 // this case was not present before, so force the location back to a
6554 // single uncontained Value.
6555 Value *CachedValue =
6556 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6557 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6558 } else {
6560 for (WeakVH VH : DVIRec.LocationOps) {
6561 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6562 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6563 }
6564 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6565 DbgVal->setRawLocation(
6566 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6567 }
6568 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6569 };
6570 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6571 RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6572 else
6573 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6574}
6575
6577 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6578 const SCEV *SCEVInductionVar,
6579 SCEVDbgValueBuilder IterCountExpr) {
6580
6581 if (isa<DbgValueInst *>(DVIRec.DbgRef)
6582 ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6583 : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6584 return false;
6585
6586 // LSR may have caused several changes to the dbg.value in the failed salvage
6587 // attempt. So restore the DIExpression, the location ops and also the
6588 // location ops format, which is always DIArglist for multiple ops, but only
6589 // sometimes for a single op.
6591
6592 // LocationOpIndexMap[i] will store the post-LSR location index of
6593 // the non-optimised out location at pre-LSR index i.
6594 SmallVector<int64_t, 2> LocationOpIndexMap;
6595 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6596 SmallVector<Value *, 2> NewLocationOps;
6597 NewLocationOps.push_back(LSRInductionVar);
6598
6599 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6600 WeakVH VH = DVIRec.LocationOps[i];
6601 // Place the locations not optimised out in the list first, avoiding
6602 // inserts later. The map is used to update the DIExpression's
6603 // DW_OP_LLVM_arg arguments as the expression is updated.
6604 if (VH && !isa<UndefValue>(VH)) {
6605 NewLocationOps.push_back(VH);
6606 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6607 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6608 << " now at index " << LocationOpIndexMap[i] << "\n");
6609 continue;
6610 }
6611
6612 // It's possible that a value referred to in the SCEV may have been
6613 // optimised out by LSR.
6614 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6615 SE.containsUndefs(DVIRec.SCEVs[i])) {
6616 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6617 << " refers to a location that is now undef or erased. "
6618 "Salvage abandoned.\n");
6619 return false;
6620 }
6621
6622 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6623 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6624
6625 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6626 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6627
6628 // Create an offset-based salvage expression if possible, as it requires
6629 // less DWARF ops than an iteration count-based expression.
6630 if (std::optional<APInt> Offset =
6631 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6632 if (Offset->getSignificantBits() <= 64)
6633 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6634 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6635 SE))
6636 return false;
6637 }
6638
6639 // Merge the DbgValueBuilder generated expressions and the original
6640 // DIExpression, place the result into an new vector.
6642 if (DVIRec.Expr->getNumElements() == 0) {
6643 assert(DVIRec.RecoveryExprs.size() == 1 &&
6644 "Expected only a single recovery expression for an empty "
6645 "DIExpression.");
6646 assert(DVIRec.RecoveryExprs[0] &&
6647 "Expected a SCEVDbgSalvageBuilder for location 0");
6648 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6649 B->appendToVectors(NewExpr, NewLocationOps);
6650 }
6651 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6652 // Most Ops needn't be updated.
6653 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6654 Op.appendToVector(NewExpr);
6655 continue;
6656 }
6657
6658 uint64_t LocationArgIndex = Op.getArg(0);
6659 SCEVDbgValueBuilder *DbgBuilder =
6660 DVIRec.RecoveryExprs[LocationArgIndex].get();
6661 // The location doesn't have s SCEVDbgValueBuilder, so LSR did not
6662 // optimise it away. So just translate the argument to the updated
6663 // location index.
6664 if (!DbgBuilder) {
6665 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6666 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6667 "Expected a positive index for the location-op position.");
6668 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6669 continue;
6670 }
6671 // The location has a recovery expression.
6672 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6673 }
6674
6675 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6676 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6677 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6678 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6679 else
6680 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6681 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6682 return true;
6683}
6684
6685/// Obtain an expression for the iteration count, then attempt to salvage the
6686/// dbg.value intrinsics.
6688 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6689 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6690 if (DVIToUpdate.empty())
6691 return;
6692
6693 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6694 assert(SCEVInductionVar &&
6695 "Anticipated a SCEV for the post-LSR induction variable");
6696
6697 if (const SCEVAddRecExpr *IVAddRec =
6698 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6699 if (!IVAddRec->isAffine())
6700 return;
6701
6702 // Prevent translation using excessive resources.
6703 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6704 return;
6705
6706 // The iteration count is required to recover location values.
6707 SCEVDbgValueBuilder IterCountExpr;
6708 IterCountExpr.pushLocation(LSRInductionVar);
6709 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6710 return;
6711
6712 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6713 << '\n');
6714
6715 for (auto &DVIRec : DVIToUpdate) {
6716 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6717 IterCountExpr);
6718 }
6719 }
6720}
6721
6722/// Identify and cache salvageable DVI locations and expressions along with the
6723/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6724/// cacheing and salvaging.
6726 Loop *L, ScalarEvolution &SE,
6727 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6728 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6729 for (const auto &B : L->getBlocks()) {
6730 for (auto &I : *B) {
6731 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6732 // Ensure that if any location op is undef that the dbg.vlue is not
6733 // cached.
6734 if (DbgVal->isKillLocation())
6735 return false;
6736
6737 // Check that the location op SCEVs are suitable for translation to
6738 // DIExpression.
6739 const auto &HasTranslatableLocationOps =
6740 [&](const auto *DbgValToTranslate) -> bool {
6741 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6742 if (!LocOp)
6743 return false;
6744
6745 if (!SE.isSCEVable(LocOp->getType()))
6746 return false;
6747
6748 const SCEV *S = SE.getSCEV(LocOp);
6749 if (SE.containsUndefs(S))
6750 return false;
6751 }
6752 return true;
6753 };
6754
6755 if (!HasTranslatableLocationOps(DbgVal))
6756 return false;
6757
6758 std::unique_ptr<DVIRecoveryRec> NewRec =
6759 std::make_unique<DVIRecoveryRec>(DbgVal);
6760 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6761 // it. Pre-allocating a vector will enable quick lookups of the builder
6762 // later during the salvage.
6763 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
6764 for (const auto LocOp : DbgVal->location_ops()) {
6765 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6766 NewRec->LocationOps.push_back(LocOp);
6767 NewRec->HadLocationArgList = DbgVal->hasArgList();
6768 }
6769 SalvageableDVISCEVs.push_back(std::move(NewRec));
6770 return true;
6771 };
6772 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
6773 if (DVR.isDbgValue() || DVR.isDbgAssign())
6774 ProcessDbgValue(&DVR);
6775 }
6776 auto DVI = dyn_cast<DbgValueInst>(&I);
6777 if (!DVI)
6778 continue;
6779 if (ProcessDbgValue(DVI))
6780 DVIHandles.insert(DVI);
6781 }
6782 }
6783}
6784
6785/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6786/// any PHi from the loop header is usable, but may have less chance of
6787/// surviving subsequent transforms.
6789 const LSRInstance &LSR) {
6790
6791 auto IsSuitableIV = [&](PHINode *P) {
6792 if (!SE.isSCEVable(P->getType()))
6793 return false;
6794 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
6795 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
6796 return false;
6797 };
6798
6799 // For now, just pick the first IV that was generated and inserted by
6800 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
6801 // by subsequent transforms.
6802 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
6803 if (!IV)
6804 continue;
6805
6806 // There should only be PHI node IVs.
6807 PHINode *P = cast<PHINode>(&*IV);
6808
6809 if (IsSuitableIV(P))
6810 return P;
6811 }
6812
6813 for (PHINode &P : L.getHeader()->phis()) {
6814 if (IsSuitableIV(&P))
6815 return &P;
6816 }
6817 return nullptr;
6818}
6819
6820static std::optional<std::tuple<PHINode *, PHINode *, const SCEV *, bool>>
6822 const LoopInfo &LI, const TargetTransformInfo &TTI) {
6823 if (!L->isInnermost()) {
6824 LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n");
6825 return std::nullopt;
6826 }
6827 // Only inspect on simple loop structure
6828 if (!L->isLoopSimplifyForm()) {
6829 LLVM_DEBUG(dbgs() << "Cannot fold on non-simple loop\n");
6830 return std::nullopt;
6831 }
6832
6834 LLVM_DEBUG(dbgs() << "Cannot fold on backedge that is loop variant\n");
6835 return std::nullopt;
6836 }
6837
6838 BasicBlock *LoopLatch = L->getLoopLatch();
6839 BranchInst *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
6840 if (!BI || BI->isUnconditional())
6841 return std::nullopt;
6842 auto *TermCond = dyn_cast<ICmpInst>(BI->getCondition());
6843 if (!TermCond) {
6844 LLVM_DEBUG(
6845 dbgs() << "Cannot fold on branching condition that is not an ICmpInst");
6846 return std::nullopt;
6847 }
6848 if (!TermCond->hasOneUse()) {
6849 LLVM_DEBUG(
6850 dbgs()
6851 << "Cannot replace terminating condition with more than one use\n");
6852 return std::nullopt;
6853 }
6854
6855 BinaryOperator *LHS = dyn_cast<BinaryOperator>(TermCond->getOperand(0));
6856 Value *RHS = TermCond->getOperand(1);
6857 if (!LHS || !L->isLoopInvariant(RHS))
6858 // We could pattern match the inverse form of the icmp, but that is
6859 // non-canonical, and this pass is running *very* late in the pipeline.
6860 return std::nullopt;
6861
6862 // Find the IV used by the current exit condition.
6863 PHINode *ToFold;
6864 Value *ToFoldStart, *ToFoldStep;
6865 if (!matchSimpleRecurrence(LHS, ToFold, ToFoldStart, ToFoldStep))
6866 return std::nullopt;
6867
6868 // Ensure the simple recurrence is a part of the current loop.
6869 if (ToFold->getParent() != L->getHeader())
6870 return std::nullopt;
6871
6872 // If that IV isn't dead after we rewrite the exit condition in terms of
6873 // another IV, there's no point in doing the transform.
6874 if (!isAlmostDeadIV(ToFold, LoopLatch, TermCond))
6875 return std::nullopt;
6876
6877 // Inserting instructions in the preheader has a runtime cost, scale
6878 // the allowed cost with the loops trip count as best we can.
6879 const unsigned ExpansionBudget = [&]() {
6880 unsigned Budget = 2 * SCEVCheapExpansionBudget;
6881 if (unsigned SmallTC = SE.getSmallConstantMaxTripCount(L))
6882 return std::min(Budget, SmallTC);
6883 if (std::optional<unsigned> SmallTC = getLoopEstimatedTripCount(L))
6884 return std::min(Budget, *SmallTC);
6885 // Unknown trip count, assume long running by default.
6886 return Budget;
6887 }();
6888
6889 const SCEV *BECount = SE.getBackedgeTakenCount(L);
6890 const DataLayout &DL = L->getHeader()->getDataLayout();
6891 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
6892
6893 PHINode *ToHelpFold = nullptr;
6894 const SCEV *TermValueS = nullptr;
6895 bool MustDropPoison = false;
6896 auto InsertPt = L->getLoopPreheader()->getTerminator();
6897 for (PHINode &PN : L->getHeader()->phis()) {
6898 if (ToFold == &PN)
6899 continue;
6900
6901 if (!SE.isSCEVable(PN.getType())) {
6902 LLVM_DEBUG(dbgs() << "IV of phi '" << PN
6903 << "' is not SCEV-able, not qualified for the "
6904 "terminating condition folding.\n");
6905 continue;
6906 }
6907 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
6908 // Only speculate on affine AddRec
6909 if (!AddRec || !AddRec->isAffine()) {
6910 LLVM_DEBUG(dbgs() << "SCEV of phi '" << PN
6911 << "' is not an affine add recursion, not qualified "
6912 "for the terminating condition folding.\n");
6913 continue;
6914 }
6915
6916 // Check that we can compute the value of AddRec on the exiting iteration
6917 // without soundness problems. evaluateAtIteration internally needs
6918 // to multiply the stride of the iteration number - which may wrap around.
6919 // The issue here is subtle because computing the result accounting for
6920 // wrap is insufficient. In order to use the result in an exit test, we
6921 // must also know that AddRec doesn't take the same value on any previous
6922 // iteration. The simplest case to consider is a candidate IV which is
6923 // narrower than the trip count (and thus original IV), but this can
6924 // also happen due to non-unit strides on the candidate IVs.
6925 if (!AddRec->hasNoSelfWrap() ||
6926 !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
6927 continue;
6928
6929 const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);
6930 const SCEV *TermValueSLocal = PostInc->evaluateAtIteration(BECount, SE);
6931 if (!Expander.isSafeToExpand(TermValueSLocal)) {
6932 LLVM_DEBUG(
6933 dbgs() << "Is not safe to expand terminating value for phi node" << PN
6934 << "\n");
6935 continue;
6936 }
6937
6938 if (Expander.isHighCostExpansion(TermValueSLocal, L, ExpansionBudget,
6939 &TTI, InsertPt)) {
6940 LLVM_DEBUG(
6941 dbgs() << "Is too expensive to expand terminating value for phi node"
6942 << PN << "\n");
6943 continue;
6944 }
6945
6946 // The candidate IV may have been otherwise dead and poison from the
6947 // very first iteration. If we can't disprove that, we can't use the IV.
6948 if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) {
6949 LLVM_DEBUG(dbgs() << "Can not prove poison safety for IV "
6950 << PN << "\n");
6951 continue;
6952 }
6953
6954 // The candidate IV may become poison on the last iteration. If this
6955 // value is not branched on, this is a well defined program. We're
6956 // about to add a new use to this IV, and we have to ensure we don't
6957 // insert UB which didn't previously exist.
6958 bool MustDropPoisonLocal = false;
6959 Instruction *PostIncV =
6960 cast<Instruction>(PN.getIncomingValueForBlock(LoopLatch));
6961 if (!mustExecuteUBIfPoisonOnPathTo(PostIncV, LoopLatch->getTerminator(),
6962 &DT)) {
6963 LLVM_DEBUG(dbgs() << "Can not prove poison safety to insert use"
6964 << PN << "\n");
6965
6966 // If this is a complex recurrance with multiple instructions computing
6967 // the backedge value, we might need to strip poison flags from all of
6968 // them.
6969 if (PostIncV->getOperand(0) != &PN)
6970 continue;
6971
6972 // In order to perform the transform, we need to drop the poison generating
6973 // flags on this instruction (if any).
6974 MustDropPoisonLocal = PostIncV->hasPoisonGeneratingFlags();
6975 }
6976
6977 // We pick the last legal alternate IV. We could expore choosing an optimal
6978 // alternate IV if we had a decent heuristic to do so.
6979 ToHelpFold = &PN;
6980 TermValueS = TermValueSLocal;
6981 MustDropPoison = MustDropPoisonLocal;
6982 }
6983
6984 LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs()
6985 << "Cannot find other AddRec IV to help folding\n";);
6986
6987 LLVM_DEBUG(if (ToFold && ToHelpFold) dbgs()
6988 << "\nFound loop that can fold terminating condition\n"
6989 << " BECount (SCEV): " << *SE.getBackedgeTakenCount(L) << "\n"
6990 << " TermCond: " << *TermCond << "\n"
6991 << " BrandInst: " << *BI << "\n"
6992 << " ToFold: " << *ToFold << "\n"
6993 << " ToHelpFold: " << *ToHelpFold << "\n");
6994
6995 if (!ToFold || !ToHelpFold)
6996 return std::nullopt;
6997 return std::make_tuple(ToFold, ToHelpFold, TermValueS, MustDropPoison);
6998}
6999
7001 DominatorTree &DT, LoopInfo &LI,
7002 const TargetTransformInfo &TTI,
7004 MemorySSA *MSSA) {
7005
7006 // Debug preservation - before we start removing anything identify which DVI
7007 // meet the salvageable criteria and store their DIExpression and SCEVs.
7008 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7010 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7011
7012 bool Changed = false;
7013 std::unique_ptr<MemorySSAUpdater> MSSAU;
7014 if (MSSA)
7015 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7016
7017 // Run the main LSR transformation.
7018 const LSRInstance &Reducer =
7019 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7020 Changed |= Reducer.getChanged();
7021
7022 // Remove any extra phis created by processing inner loops.
7023 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7024 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7026 const DataLayout &DL = L->getHeader()->getDataLayout();
7027 SCEVExpander Rewriter(SE, DL, "lsr", false);
7028#ifndef NDEBUG
7029 Rewriter.setDebugType(DEBUG_TYPE);
7030#endif
7031 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7032 Rewriter.clear();
7033 if (numFolded) {
7034 Changed = true;
7036 MSSAU.get());
7037 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7038 }
7039 }
7040 // LSR may at times remove all uses of an induction variable from a loop.
7041 // The only remaining use is the PHI in the exit block.
7042 // When this is the case, if the exit value of the IV can be calculated using
7043 // SCEV, we can replace the exit block PHI with the final value of the IV and
7044 // skip the updates in each loop iteration.
7045 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7047 const DataLayout &DL = L->getHeader()->getDataLayout();
7048 SCEVExpander Rewriter(SE, DL, "lsr", true);
7049 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7050 UnusedIndVarInLoop, DeadInsts);
7051 Rewriter.clear();
7052 if (Rewrites) {
7053 Changed = true;
7055 MSSAU.get());
7056 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7057 }
7058 }
7059
7060 const bool EnableFormTerm = [&] {
7062 case cl::BOU_TRUE:
7063 return true;
7064 case cl::BOU_FALSE:
7065 return false;
7066 case cl::BOU_UNSET:
7068 }
7069 llvm_unreachable("Unhandled cl::boolOrDefault enum");
7070 }();
7071
7072 if (EnableFormTerm) {
7073 if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI, TTI)) {
7074 auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
7075
7076 Changed = true;
7077 NumTermFold++;
7078
7079 BasicBlock *LoopPreheader = L->getLoopPreheader();
7080 BasicBlock *LoopLatch = L->getLoopLatch();
7081
7082 (void)ToFold;
7083 LLVM_DEBUG(dbgs() << "To fold phi-node:\n"
7084 << *ToFold << "\n"
7085 << "New term-cond phi-node:\n"
7086 << *ToHelpFold << "\n");
7087
7088 Value *StartValue = ToHelpFold->getIncomingValueForBlock(LoopPreheader);
7089 (void)StartValue;
7090 Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch);
7091
7092 // See comment in canFoldTermCondOfLoop on why this is sufficient.
7093 if (MustDrop)
7094 cast<Instruction>(LoopValue)->dropPoisonGeneratingFlags();
7095
7096 // SCEVExpander for both use in preheader and latch
7097 const DataLayout &DL = L->getHeader()->getDataLayout();
7098 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7099
7100 assert(Expander.isSafeToExpand(TermValueS) &&
7101 "Terminating value was checked safe in canFoldTerminatingCondition");
7102
7103 // Create new terminating value at loop preheader
7104 Value *TermValue = Expander.expandCodeFor(TermValueS, ToHelpFold->getType(),
7105 LoopPreheader->getTerminator());
7106
7107 LLVM_DEBUG(dbgs() << "Start value of new term-cond phi-node:\n"
7108 << *StartValue << "\n"
7109 << "Terminating value of new term-cond phi-node:\n"
7110 << *TermValue << "\n");
7111
7112 // Create new terminating condition at loop latch
7113 BranchInst *BI = cast<BranchInst>(LoopLatch->getTerminator());
7114 ICmpInst *OldTermCond = cast<ICmpInst>(BI->getCondition());
7115 IRBuilder<> LatchBuilder(LoopLatch->getTerminator());
7116 Value *NewTermCond =
7117 LatchBuilder.CreateICmp(CmpInst::ICMP_EQ, LoopValue, TermValue,
7118 "lsr_fold_term_cond.replaced_term_cond");
7119 // Swap successors to exit loop body if IV equals to new TermValue
7120 if (BI->getSuccessor(0) == L->getHeader())
7121 BI->swapSuccessors();
7122
7123 LLVM_DEBUG(dbgs() << "Old term-cond:\n"
7124 << *OldTermCond << "\n"
7125 << "New term-cond:\n" << *NewTermCond << "\n");
7126
7127 BI->setCondition(NewTermCond);
7128
7129 Expander.clear();
7130 OldTermCond->eraseFromParent();
7131 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7132 }
7133 }
7134
7135 if (SalvageableDVIRecords.empty())
7136 return Changed;
7137
7138 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7139 // expressions composed using the derived iteration count.
7140 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7141 for (const auto &L : LI) {
7142 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7143 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7144 else {
7145 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7146 "could not be identified.\n");
7147 }
7148 }
7149
7150 for (auto &Rec : SalvageableDVIRecords)
7151 Rec->clear();
7152 SalvageableDVIRecords.clear();
7153 DVIHandles.clear();
7154 return Changed;
7155}
7156
7157bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7158 if (skipLoop(L))
7159 return false;
7160
7161 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7162 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7163 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7164 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7165 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7166 *L->getHeader()->getParent());
7167 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7168 *L->getHeader()->getParent());
7169 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7170 *L->getHeader()->getParent());
7171 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7172 MemorySSA *MSSA = nullptr;
7173 if (MSSAAnalysis)
7174 MSSA = &MSSAAnalysis->getMSSA();
7175 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7176}
7177
7180 LPMUpdater &) {
7181 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7182 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7183 return PreservedAnalyses::all();
7184
7185 auto PA = getLoopPassPreservedAnalyses();
7186 if (AR.MSSA)
7187 PA.preserve<MemorySSAAnalysis>();
7188 return PA;
7189}
7190
7191char LoopStrengthReduce::ID = 0;
7192
7193INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7194 "Loop Strength Reduction", false, false)
7200INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7201INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7202 "Loop Strength Reduction", false, false)
7203
7204Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:148
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1293
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
iv Induction Variable Users
Definition: IVUsers.cpp:48
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static std::optional< std::tuple< PHINode *, PHINode *, const SCEV *, bool > > canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, const LoopInfo &LI, const TargetTransformInfo &TTI)
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static cl::opt< cl::boolOrDefault > AllowTerminatingConditionFoldingAfterLSR("lsr-term-fold", cl::Hidden, cl::desc("Attempt to replace primary IV with other IV."))
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t ScalableOffset=0)
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs, SmallSet< AssertingVH< DbgValueInst >, 2 > &DVIHandles)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(VerifyEach)
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
This defines the Use class.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:237
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class recording the (high level) value of a variable.
Class for arbitrary precision integers.
Definition: APInt.h:77
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1499
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:308
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1614
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1490
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1706
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1521
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:283
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Value handle that asserts if the Value is deleted.
Definition: ValueHandle.h:264
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:494
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:695
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:507
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:366
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:677
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
void swapSuccessors()
Swap the successors of this branch instruction.
BasicBlock * getSuccessor(unsigned i) const
bool isUnconditional() const
Value * getCondition() const
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1575
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
This is an important base class in LLVM.
Definition: Constant.h:41
static DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
This represents the llvm.dbg.value instruction.
Record of a variable value-assignment, aka a non instruction representation of the dbg....
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
NodeT * getBlock() const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
This instruction compares its operands according to the predicate given to the constructor.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2349
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2664
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:367
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:142
bool empty() const
Definition: IVUsers.h:147
void print(raw_ostream &OS) const
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:476
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:834
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
bool hasPoisonGeneratingFlags() const LLVM_READONLY
Return true if this operator has flags which may cause this instruction to evaluate to poison despite...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:473
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:173
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:598
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:985
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PointerIntPair - This class implements a pair of a pointer and small integer.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1814
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
const SCEVAddRecExpr * getPostIncExpr(ScalarEvolution &SE) const
Return an expression representing the value of this expression one iteration of the loop ahead.
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyze scalars to rewrite expressions in canonical form.
bool isSafeToExpand(const SCEV *S) const
Return true if the given expression is safe to expand in the sense that all materialized values are s...
bool isHighCostExpansion(ArrayRef< const SCEV * > Exprs, Loop *L, unsigned Budget, const TargetTransformInfo *TTI, const Instruction *At)
Return true for expressions that can't be evaluated at runtime within given Budget.
void clear()
Erase the contents of the InsertedExpressions map so that users trying to expand the same expression ...
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUnknown(Value *V)
std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
void clear()
Definition: SmallSet.h:218
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getFixed() const
Returns the fixed component of the stack.
Definition: TypeSize.h:49
An instruction for storing to memory.
Definition: Instructions.h:289
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
bool shouldFoldTerminatingConditionAfterLSR() const
Return true if LSR should attempts to replace a use of an otherwise dead primary IV in the latch cond...
bool isProfitableLSRChainElement(Instruction *I) const
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isNumRegsMajorCostOfLSR() const
Return true if LSR major cost is number of registers.
@ MIM_PostInc
Post-incrementing.
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static IntegerType * getInt8Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
op_iterator op_end()
Definition: User.h:236
static ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:495
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
A nullable Value handle that is nullable.
Definition: ValueHandle.h:144
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Key
PAL metadata keys.
@ Entry
Definition: COFF.h:811
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Reg
All possible values of the reg field in the ModR/M byte.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:147
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:143
constexpr double e
Definition: MathExtras.h:31
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root, Instruction *OnPathTo, DominatorTree *DT)
Return true if undefined behavior would provable be executed on the path to OnPathTo if Root produced...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:480
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1434
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2058
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< unsigned > SCEVCheapExpansionBudget
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
Pass * createLoopStrengthReducePass()
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:555
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:77
void initializeLoopStrengthReducePass(PassRegistry &)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isAlmostDeadIV(PHINode *IV, BasicBlock *LatchBlock, Value *Cond)
Return true if the induction variable IV in a Loop whose latch is LatchBlock would become dead if the...
Definition: LoopUtils.cpp:469
int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1404
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
@ UnusedIndVarInLoop
Definition: LoopUtils.h:456
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Option class for critical edge splitting.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.