LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have the loop induction variable as one or more of their components; it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
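// For illustration (hypothetical C source, not taken from any test): on a
// target without cheap scaled addressing, a loop such as
//
//   for (int i = 0; i < n; ++i)
//     sum += a[i];
//
// would recompute the address a + i*sizeof(*a) on every iteration. Strength
// reduction replaces the multiply-and-add with a pointer advanced by a
// constant stride:
//
//   for (int *p = a, *e = a + n; p != e; ++p)
//     sum += *p;
//
// Conversely, when the target does provide scaled-index addressing, this pass
// may prefer to keep the i*stride form so the scale folds into the load.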
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/PassManager.h"
100#include "llvm/IR/Type.h"
101#include "llvm/IR/Use.h"
102#include "llvm/IR/User.h"
103#include "llvm/IR/Value.h"
104#include "llvm/IR/ValueHandle.h"
106#include "llvm/Pass.h"
107#include "llvm/Support/Casting.h"
110#include "llvm/Support/Debug.h"
120#include <algorithm>
121#include <cassert>
122#include <cstddef>
123#include <cstdint>
124#include <iterator>
125#include <limits>
126#include <map>
127#include <numeric>
128#include <optional>
129#include <utility>
130
131using namespace llvm;
132
133#define DEBUG_TYPE "loop-reduce"
134
135/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
136/// bail out. This threshold is far beyond the number of users that LSR can
137/// conceivably solve, so it should not affect generated code, but catches the
138/// worst cases before LSR burns too much compile time and stack space.
139static const unsigned MaxIVUsers = 200;
140
141/// Limit the size of expression that SCEV-based salvaging will attempt to
142/// translate into a DIExpression.
143/// Choose a maximum size such that debuginfo is not excessively increased and
144/// the salvaging is not too expensive for the compiler.
145static const unsigned MaxSCEVSalvageExpressionSize = 64;
146
147// Cleanup congruent phis after LSR phi expansion.
148static cl::opt<bool> EnablePhiElim(
149 "enable-lsr-phielim", cl::Hidden, cl::init(true),
150 cl::desc("Enable LSR phi elimination"));
151
152// The flag adds instruction count to solutions cost comparison.
153static cl::opt<bool> InsnsCost(
154 "lsr-insns-cost", cl::Hidden, cl::init(true),
155 cl::desc("Add instruction count to a LSR cost model"));
156
157// Flag to choose how to narrow complex lsr solution
158static cl::opt<bool> LSRExpNarrow(
159 "lsr-exp-narrow", cl::Hidden, cl::init(false),
160 cl::desc("Narrow LSR complex solution using"
161 " expectation of registers number"));
162
163// Flag to narrow search space by filtering non-optimal formulae with
164// the same ScaledReg and Scale.
165static cl::opt<bool> FilterSameScaledReg(
166 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168 " with the same ScaledReg and Scale"));
169
170static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
171 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 cl::values(clEnumValN(TTI::AMK_None,
174 "none",
175 "Don't prefer any addressing mode"),
176 clEnumValN(TTI::AMK_PreIndexed,
177 "preindexed",
178 "Prefer pre-indexed addressing mode"),
179 clEnumValN(TTI::AMK_PostIndexed,
180 "postindexed",
181 "Prefer post-indexed addressing mode")));
182
183static cl::opt<unsigned> ComplexityLimit(
184 "lsr-complexity-limit", cl::Hidden,
185 cl::init(std::numeric_limits<uint16_t>::max()),
186 cl::desc("LSR search space complexity limit"));
187
188static cl::opt<unsigned> SetupCostDepthLimit(
189 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190 cl::desc("The limit on recursion depth for LSRs setup cost"));
191
192static cl::opt<cl::boolOrDefault> AllowTerminatingConditionFoldingAfterLSR(
193 "lsr-term-fold", cl::Hidden,
194 cl::desc("Attempt to replace primary IV with other IV."));
195
196static cl::opt<bool> AllowDropSolutionIfLessProfitable(
197 "lsr-drop-solution", cl::Hidden, cl::init(false),
198 cl::desc("Attempt to drop solution if it is less profitable"));
199
200STATISTIC(NumTermFold,
201 "Number of terminating condition fold recognized and performed");
202
203#ifndef NDEBUG
204// Stress test IV chain generation.
205static cl::opt<bool> StressIVChain(
206 "stress-ivchain", cl::Hidden, cl::init(false),
207 cl::desc("Stress test LSR IV chains"));
208#else
209static bool StressIVChain = false;
210#endif
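// For illustration (hypothetical invocation; the exact pass-pipeline spelling
// may differ between releases), these options are meant to be toggled from the
// opt command line, e.g.:
//
//   opt -passes=loop-reduce -lsr-insns-cost=false \
//       -lsr-preferred-addressing-mode=postindexed -S in.ll -o out.ll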
211
212namespace {
213
214struct MemAccessTy {
215 /// Used in situations where the accessed memory type is unknown.
216 static const unsigned UnknownAddressSpace =
217 std::numeric_limits<unsigned>::max();
218
219 Type *MemTy = nullptr;
220 unsigned AddrSpace = UnknownAddressSpace;
221
222 MemAccessTy() = default;
223 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
224
225 bool operator==(MemAccessTy Other) const {
226 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
227 }
228
229 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
230
231 static MemAccessTy getUnknown(LLVMContext &Ctx,
232 unsigned AS = UnknownAddressSpace) {
233 return MemAccessTy(Type::getVoidTy(Ctx), AS);
234 }
235
236 Type *getType() { return MemTy; }
237};
238
239/// This class holds data which is used to order reuse candidates.
240class RegSortData {
241public:
242 /// This represents the set of LSRUse indices which reference
243 /// a particular register.
244 SmallBitVector UsedByIndices;
245
246 void print(raw_ostream &OS) const;
247 void dump() const;
248};
249
250} // end anonymous namespace
251
252#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
253void RegSortData::print(raw_ostream &OS) const {
254 OS << "[NumUses=" << UsedByIndices.count() << ']';
255}
256
257LLVM_DUMP_METHOD void RegSortData::dump() const {
258 print(errs()); errs() << '\n';
259}
260#endif
261
262namespace {
263
264/// Map register candidates to information about how they are used.
265class RegUseTracker {
266 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
267
268 RegUsesTy RegUsesMap;
269 SmallVector<const SCEV *, 16> RegSequence;
270
271public:
272 void countRegister(const SCEV *Reg, size_t LUIdx);
273 void dropRegister(const SCEV *Reg, size_t LUIdx);
274 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
275
276 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
277
278 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
279
280 void clear();
281
282 using iterator = SmallVectorImpl<const SCEV *>::iterator;
283 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
284
285 iterator begin() { return RegSequence.begin(); }
286 iterator end() { return RegSequence.end(); }
287 const_iterator begin() const { return RegSequence.begin(); }
288 const_iterator end() const { return RegSequence.end(); }
289};
290
291} // end anonymous namespace
292
293void
294RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
295 std::pair<RegUsesTy::iterator, bool> Pair =
296 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
297 RegSortData &RSD = Pair.first->second;
298 if (Pair.second)
299 RegSequence.push_back(Reg);
300 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
301 RSD.UsedByIndices.set(LUIdx);
302}
303
304void
305RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
306 RegUsesTy::iterator It = RegUsesMap.find(Reg);
307 assert(It != RegUsesMap.end());
308 RegSortData &RSD = It->second;
309 assert(RSD.UsedByIndices.size() > LUIdx);
310 RSD.UsedByIndices.reset(LUIdx);
311}
312
313void
314RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
315 assert(LUIdx <= LastLUIdx);
316
317 // Update RegUses. The data structure is not optimized for this purpose;
318 // we must iterate through it and update each of the bit vectors.
319 for (auto &Pair : RegUsesMap) {
320 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
321 if (LUIdx < UsedByIndices.size())
322 UsedByIndices[LUIdx] =
323 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
324 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
325 }
326}
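// Worked example (hypothetical indices): with LUIdx == 1 and LastLUIdx == 3,
// each register's "used by use 1" bit is overwritten by its "used by use 3"
// bit, and every bit vector is then truncated to 3 entries. This mirrors the
// caller moving the last LSRUse into slot LUIdx and shrinking the use list.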
327
328bool
329RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
330 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
331 if (I == RegUsesMap.end())
332 return false;
333 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
334 int i = UsedByIndices.find_first();
335 if (i == -1) return false;
336 if ((size_t)i != LUIdx) return true;
337 return UsedByIndices.find_next(i) != -1;
338}
339
340const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
341 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
342 assert(I != RegUsesMap.end() && "Unknown register!");
343 return I->second.UsedByIndices;
344}
345
346void RegUseTracker::clear() {
347 RegUsesMap.clear();
348 RegSequence.clear();
349}
350
351namespace {
352
353/// This class holds information that describes a formula for computing
354/// a value satisfying a use. It may include broken-out immediates and scaled registers.
355struct Formula {
356 /// Global base address used for complex addressing.
357 GlobalValue *BaseGV = nullptr;
358
359 /// Base offset for complex addressing.
360 int64_t BaseOffset = 0;
361
362 /// Whether any complex addressing has a base register.
363 bool HasBaseReg = false;
364
365 /// The scale of any complex addressing.
366 int64_t Scale = 0;
367
368 /// The list of "base" registers for this use. When this is non-empty, the
369 /// canonical representation of a formula is
370 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
371 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
372 /// 3. The reg containing the recurrent expr related to the current loop in the
373 /// formula should be put in the ScaledReg.
374 /// #1 enforces that the scaled register is always used when at least two
375 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
376 /// #2 enforces that 1 * reg is reg.
377 /// #3 ensures invariant regs with respect to current loop can be combined
378 /// together in LSR codegen.
379 /// This invariant can be temporarily broken while building a formula.
380 /// However, every formula inserted into the LSRInstance must be in canonical
381 /// form.
382 SmallVector<const SCEV *, 4> BaseRegs;
383
384 /// The 'scaled' register for this use. This should be non-null when Scale is
385 /// not zero.
386 const SCEV *ScaledReg = nullptr;
387
388 /// An additional constant offset which added near the use. This requires a
389 /// temporary register, but the offset itself can live in an add immediate
390 /// field rather than a register.
391 int64_t UnfoldedOffset = 0;
392
393 Formula() = default;
394
395 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
396
397 bool isCanonical(const Loop &L) const;
398
399 void canonicalize(const Loop &L);
400
401 bool unscale();
402
403 bool hasZeroEnd() const;
404
405 size_t getNumRegs() const;
406 Type *getType() const;
407
408 void deleteBaseReg(const SCEV *&S);
409
410 bool referencesReg(const SCEV *S) const;
411 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
412 const RegUseTracker &RegUses) const;
413
414 void print(raw_ostream &OS) const;
415 void dump() const;
416};
417
418} // end anonymous namespace
419
420/// Recursion helper for initialMatch.
421static void DoInitialMatch(const SCEV *S, Loop *L,
422 SmallVectorImpl<const SCEV *> &Good,
423 SmallVectorImpl<const SCEV *> &Bad,
424 ScalarEvolution &SE) {
425 // Collect expressions which properly dominate the loop header.
426 if (SE.properlyDominates(S, L->getHeader())) {
427 Good.push_back(S);
428 return;
429 }
430
431 // Look at add operands.
432 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
433 for (const SCEV *S : Add->operands())
434 DoInitialMatch(S, L, Good, Bad, SE);
435 return;
436 }
437
438 // Look at addrec operands.
439 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
440 if (!AR->getStart()->isZero() && AR->isAffine()) {
441 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
442 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
443 AR->getStepRecurrence(SE),
444 // FIXME: AR->getNoWrapFlags()
445 AR->getLoop(), SCEV::FlagAnyWrap),
446 L, Good, Bad, SE);
447 return;
448 }
449
450 // Handle a multiplication by -1 (negation) if it didn't fold.
451 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
452 if (Mul->getOperand(0)->isAllOnesValue()) {
453 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
454 const SCEV *NewMul = SE.getMulExpr(Ops);
455
456 SmallVector<const SCEV *, 4> MyGood;
457 SmallVector<const SCEV *, 4> MyBad;
458 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
459 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
460 SE.getEffectiveSCEVType(NewMul->getType())));
461 for (const SCEV *S : MyGood)
462 Good.push_back(SE.getMulExpr(NegOne, S));
463 for (const SCEV *S : MyBad)
464 Bad.push_back(SE.getMulExpr(NegOne, S));
465 return;
466 }
467
468 // Ok, we can't do anything interesting. Just stuff the whole thing into a
469 // register and hope for the best.
470 Bad.push_back(S);
471}
472
473/// Incorporate loop-variant parts of S into this Formula, attempting to keep
474/// all loop-invariant and loop-computable values in a single base register.
475void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
476 SmallVector<const SCEV *, 4> Good;
477 SmallVector<const SCEV *, 4> Bad;
478 DoInitialMatch(S, L, Good, Bad, SE);
479 if (!Good.empty()) {
480 const SCEV *Sum = SE.getAddExpr(Good);
481 if (!Sum->isZero())
482 BaseRegs.push_back(Sum);
483 HasBaseReg = true;
484 }
485 if (!Bad.empty()) {
486 const SCEV *Sum = SE.getAddExpr(Bad);
487 if (!Sum->isZero())
488 BaseRegs.push_back(Sum);
489 HasBaseReg = true;
490 }
491 canonicalize(*L);
492}
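// Worked example (hypothetical SCEVs): matching S = {%base,+,4}<%L> visits the
// start %base, which properly dominates the loop header and is collected as
// "good", and the remaining recurrence {0,+,4}<%L>, which is collected as
// "bad". Both sums become base registers here, and canonicalize() then moves
// the loop-recurrent one into ScaledReg with Scale = 1.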
493
494static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
495 return SCEVExprContains(S, [&L](const SCEV *S) {
496 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
497 });
498}
499
500/// Check whether or not this formula satisfies the canonical
501/// representation.
502/// \see Formula::BaseRegs.
503bool Formula::isCanonical(const Loop &L) const {
504 if (!ScaledReg)
505 return BaseRegs.size() <= 1;
506
507 if (Scale != 1)
508 return true;
509
510 if (Scale == 1 && BaseRegs.empty())
511 return false;
512
513 if (containsAddRecDependentOnLoop(ScaledReg, L))
514 return true;
515
516 // If ScaledReg is not a recurrent expr, or it is one but its loop is not the
517 // current loop, while BaseRegs contains a recurrent expr register related to
518 // the current loop, we want to swap that register in BaseRegs with ScaledReg.
519 return none_of(BaseRegs, [&L](const SCEV *S) {
520 return containsAddRecDependentOnLoop(S, L);
521 });
522}
523
524/// Helper method to morph a formula into its canonical representation.
525/// \see Formula::BaseRegs.
526/// Every formula having more than one base register must use the ScaledReg
527/// field. Otherwise, we would have to do special cases everywhere in LSR
528/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
529/// On the other hand, 1*reg should be canonicalized into reg.
530void Formula::canonicalize(const Loop &L) {
531 if (isCanonical(L))
532 return;
533
534 if (BaseRegs.empty()) {
535 // No base reg? Use scale reg with scale = 1 as such.
536 assert(ScaledReg && "Expected 1*reg => reg");
537 assert(Scale == 1 && "Expected 1*reg => reg");
538 BaseRegs.push_back(ScaledReg);
539 Scale = 0;
540 ScaledReg = nullptr;
541 return;
542 }
543
544 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
545 if (!ScaledReg) {
546 ScaledReg = BaseRegs.pop_back_val();
547 Scale = 1;
548 }
549
550 // If ScaledReg is an invariant with respect to L, find the reg from
551 // BaseRegs containing the recurrent expr related with Loop L. Swap the
552 // reg with ScaledReg.
553 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
554 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
555 return containsAddRecDependentOnLoop(S, L);
556 });
557 if (I != BaseRegs.end())
558 std::swap(ScaledReg, *I);
559 }
560 assert(isCanonical(L) && "Failed to canonicalize?");
561}
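// Worked example (hypothetical registers): a formula with
// BaseRegs = { %inv, {0,+,4}<%L> }, no ScaledReg and Scale == 0 is not
// canonical. After canonicalize(L) the loop-recurrent register ends up as
// ScaledReg with Scale == 1 and %inv remains the only base register, i.e.
// reg1 + reg2 is represented as reg1 + 1*reg2 with the recurrence scaled.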
562
563/// Get rid of the scale in the formula.
564/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
565/// \return true if it was possible to get rid of the scale, false otherwise.
566/// \note After this operation the formula may not be in the canonical form.
567bool Formula::unscale() {
568 if (Scale != 1)
569 return false;
570 Scale = 0;
571 BaseRegs.push_back(ScaledReg);
572 ScaledReg = nullptr;
573 return true;
574}
575
576bool Formula::hasZeroEnd() const {
577 if (UnfoldedOffset || BaseOffset)
578 return false;
579 if (BaseRegs.size() != 1 || ScaledReg)
580 return false;
581 return true;
582}
583
584/// Return the total number of register operands used by this formula. This does
585/// not include register uses implied by non-constant addrec strides.
586size_t Formula::getNumRegs() const {
587 return !!ScaledReg + BaseRegs.size();
588}
589
590/// Return the type of this formula, if it has one, or null otherwise. This type
591/// is meaningless except for the bit size.
592Type *Formula::getType() const {
593 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
594 ScaledReg ? ScaledReg->getType() :
595 BaseGV ? BaseGV->getType() :
596 nullptr;
597}
598
599/// Delete the given base reg from the BaseRegs list.
600void Formula::deleteBaseReg(const SCEV *&S) {
601 if (&S != &BaseRegs.back())
602 std::swap(S, BaseRegs.back());
603 BaseRegs.pop_back();
604}
605
606/// Test if this formula references the given register.
607bool Formula::referencesReg(const SCEV *S) const {
608 return S == ScaledReg || is_contained(BaseRegs, S);
609}
610
611/// Test whether this formula uses registers which are used by uses other than
612/// the use with the given index.
613bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
614 const RegUseTracker &RegUses) const {
615 if (ScaledReg)
616 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
617 return true;
618 for (const SCEV *BaseReg : BaseRegs)
619 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
620 return true;
621 return false;
622}
623
624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
625void Formula::print(raw_ostream &OS) const {
626 bool First = true;
627 if (BaseGV) {
628 if (!First) OS << " + "; else First = false;
629 BaseGV->printAsOperand(OS, /*PrintType=*/false);
630 }
631 if (BaseOffset != 0) {
632 if (!First) OS << " + "; else First = false;
633 OS << BaseOffset;
634 }
635 for (const SCEV *BaseReg : BaseRegs) {
636 if (!First) OS << " + "; else First = false;
637 OS << "reg(" << *BaseReg << ')';
638 }
639 if (HasBaseReg && BaseRegs.empty()) {
640 if (!First) OS << " + "; else First = false;
641 OS << "**error: HasBaseReg**";
642 } else if (!HasBaseReg && !BaseRegs.empty()) {
643 if (!First) OS << " + "; else First = false;
644 OS << "**error: !HasBaseReg**";
645 }
646 if (Scale != 0) {
647 if (!First) OS << " + "; else First = false;
648 OS << Scale << "*reg(";
649 if (ScaledReg)
650 OS << *ScaledReg;
651 else
652 OS << "<unknown>";
653 OS << ')';
654 }
655 if (UnfoldedOffset != 0) {
656 if (!First) OS << " + ";
657 OS << "imm(" << UnfoldedOffset << ')';
658 }
659}
660
661LLVM_DUMP_METHOD void Formula::dump() const {
662 print(errs()); errs() << '\n';
663}
664#endif
665
666/// Return true if the given addrec can be sign-extended without changing its
667/// value.
668static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
669 Type *WideTy =
670 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
671 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
672}
673
674/// Return true if the given add can be sign-extended without changing its
675/// value.
676static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
677 Type *WideTy =
678 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
679 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
680}
681
682/// Return true if the given mul can be sign-extended without changing its
683/// value.
684static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
685 Type *WideTy =
686 IntegerType::get(SE.getContext(),
687 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
688 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
689}
690
691/// Return an expression for LHS /s RHS, if it can be determined and if the
692/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
693/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
694/// the multiplication may overflow, which is useful when the result will be
695/// used in a context where the most significant bits are ignored.
696static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
697 ScalarEvolution &SE,
698 bool IgnoreSignificantBits = false) {
699 // Handle the trivial case, which works for any SCEV type.
700 if (LHS == RHS)
701 return SE.getConstant(LHS->getType(), 1);
702
703 // Handle a few RHS special cases.
704 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
705 if (RC) {
706 const APInt &RA = RC->getAPInt();
707 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
708 // some folding.
709 if (RA.isAllOnes()) {
710 if (LHS->getType()->isPointerTy())
711 return nullptr;
712 return SE.getMulExpr(LHS, RC);
713 }
714 // Handle x /s 1 as x.
715 if (RA == 1)
716 return LHS;
717 }
718
719 // Check for a division of a constant by a constant.
720 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
721 if (!RC)
722 return nullptr;
723 const APInt &LA = C->getAPInt();
724 const APInt &RA = RC->getAPInt();
725 if (LA.srem(RA) != 0)
726 return nullptr;
727 return SE.getConstant(LA.sdiv(RA));
728 }
729
730 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
731 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
732 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
733 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
734 IgnoreSignificantBits);
735 if (!Step) return nullptr;
736 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
737 IgnoreSignificantBits);
738 if (!Start) return nullptr;
739 // FlagNW is independent of the start value, step direction, and is
740 // preserved with smaller magnitude steps.
741 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
742 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
743 }
744 return nullptr;
745 }
746
747 // Distribute the sdiv over add operands, if the add doesn't overflow.
748 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
749 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
750 SmallVector<const SCEV *, 8> Ops;
751 for (const SCEV *S : Add->operands()) {
752 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
753 if (!Op) return nullptr;
754 Ops.push_back(Op);
755 }
756 return SE.getAddExpr(Ops);
757 }
758 return nullptr;
759 }
760
761 // Check for a multiply operand that we can pull RHS out of.
762 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
763 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
764 // Handle special case C1*X*Y /s C2*X*Y.
765 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
766 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
767 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
768 const SCEVConstant *RC =
769 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
770 if (LC && RC) {
771 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
772 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
773 if (LOps == ROps)
774 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
775 }
776 }
777 }
778
779 SmallVector<const SCEV *, 4> Ops;
780 bool Found = false;
781 for (const SCEV *S : Mul->operands()) {
782 if (!Found)
783 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
784 IgnoreSignificantBits)) {
785 S = Q;
786 Found = true;
787 }
788 Ops.push_back(S);
789 }
790 return Found ? SE.getMulExpr(Ops) : nullptr;
791 }
792 return nullptr;
793 }
794
795 // Otherwise we don't know.
796 return nullptr;
797}
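// Worked examples (hypothetical expressions):
//   getExactSDiv(%x, %x, SE)         == 1
//   getExactSDiv(%x, 1, SE)          == %x
//   getExactSDiv(12, 4, SE)          == 3, while 7 /s 2 yields nullptr
//   getExactSDiv({0,+,8}<%L>, 4, SE) == {0,+,2}<%L>, provided the addrec can
//     be sign-extended without changing its value (or IgnoreSignificantBits).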
798
799/// If S involves the addition of a constant integer value, return that integer
800/// value, and mutate S to point to a new SCEV with that value excluded.
801static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
802 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
803 if (C->getAPInt().getSignificantBits() <= 64) {
804 S = SE.getConstant(C->getType(), 0);
805 return C->getValue()->getSExtValue();
806 }
807 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
808 SmallVector<const SCEV *, 8> NewOps(Add->operands());
809 int64_t Result = ExtractImmediate(NewOps.front(), SE);
810 if (Result != 0)
811 S = SE.getAddExpr(NewOps);
812 return Result;
813 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
814 SmallVector<const SCEV *, 8> NewOps(AR->operands());
815 int64_t Result = ExtractImmediate(NewOps.front(), SE);
816 if (Result != 0)
817 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
818 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
819 SCEV::FlagAnyWrap);
820 return Result;
821 }
822 return 0;
823}
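// Worked example (hypothetical expression): for S = (5 + %a + %b) this returns
// 5 and mutates S to (%a + %b); for S = {(5 + %p),+,4}<%L> it returns 5 and
// mutates S to {%p,+,4}<%L>. Only NewOps.front() needs inspecting because
// SCEV's canonical operand order places constants first.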
824
825/// If S involves the addition of a GlobalValue address, return that symbol, and
826/// mutate S to point to a new SCEV with that value excluded.
827static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
828 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
829 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
830 S = SE.getConstant(GV->getType(), 0);
831 return GV;
832 }
833 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
834 SmallVector<const SCEV *, 8> NewOps(Add->operands());
835 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
836 if (Result)
837 S = SE.getAddExpr(NewOps);
838 return Result;
839 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
840 SmallVector<const SCEV *, 8> NewOps(AR->operands());
841 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
842 if (Result)
843 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
844 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
845 SCEV::FlagAnyWrap);
846 return Result;
847 }
848 return nullptr;
849}
850
851/// Returns true if the specified instruction is using the specified value as an
852/// address.
853static bool isAddressUse(const TargetTransformInfo &TTI,
854 Instruction *Inst, Value *OperandVal) {
855 bool isAddress = isa<LoadInst>(Inst);
856 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
857 if (SI->getPointerOperand() == OperandVal)
858 isAddress = true;
859 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
860 // Addressing modes can also be folded into prefetches and a variety
861 // of intrinsics.
862 switch (II->getIntrinsicID()) {
863 case Intrinsic::memset:
864 case Intrinsic::prefetch:
865 case Intrinsic::masked_load:
866 if (II->getArgOperand(0) == OperandVal)
867 isAddress = true;
868 break;
869 case Intrinsic::masked_store:
870 if (II->getArgOperand(1) == OperandVal)
871 isAddress = true;
872 break;
873 case Intrinsic::memmove:
874 case Intrinsic::memcpy:
875 if (II->getArgOperand(0) == OperandVal ||
876 II->getArgOperand(1) == OperandVal)
877 isAddress = true;
878 break;
879 default: {
880 MemIntrinsicInfo IntrInfo;
881 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
882 if (IntrInfo.PtrVal == OperandVal)
883 isAddress = true;
884 }
885 }
886 }
887 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
888 if (RMW->getPointerOperand() == OperandVal)
889 isAddress = true;
890 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
891 if (CmpX->getPointerOperand() == OperandVal)
892 isAddress = true;
893 }
894 return isAddress;
895}
896
897/// Return the type of the memory being accessed.
898static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
899 Instruction *Inst, Value *OperandVal) {
900 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
901
902 // First get the type of memory being accessed.
903 if (Type *Ty = Inst->getAccessType())
904 AccessTy.MemTy = Ty;
905
906 // Then get the pointer address space.
907 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
908 AccessTy.AddrSpace = SI->getPointerAddressSpace();
909 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
910 AccessTy.AddrSpace = LI->getPointerAddressSpace();
911 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
912 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
913 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
914 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
915 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
916 switch (II->getIntrinsicID()) {
917 case Intrinsic::prefetch:
918 case Intrinsic::memset:
919 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
920 AccessTy.MemTy = OperandVal->getType();
921 break;
922 case Intrinsic::memmove:
923 case Intrinsic::memcpy:
924 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
925 AccessTy.MemTy = OperandVal->getType();
926 break;
927 case Intrinsic::masked_load:
928 AccessTy.AddrSpace =
929 II->getArgOperand(0)->getType()->getPointerAddressSpace();
930 break;
931 case Intrinsic::masked_store:
932 AccessTy.AddrSpace =
933 II->getArgOperand(1)->getType()->getPointerAddressSpace();
934 break;
935 default: {
936 MemIntrinsicInfo IntrInfo;
937 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
938 AccessTy.AddrSpace
939 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
940 }
941
942 break;
943 }
944 }
945 }
946
947 return AccessTy;
948}
949
950/// Return true if this AddRec is already a phi in its loop.
951static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
952 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
953 if (SE.isSCEVable(PN.getType()) &&
954 (SE.getEffectiveSCEVType(PN.getType()) ==
955 SE.getEffectiveSCEVType(AR->getType())) &&
956 SE.getSCEV(&PN) == AR)
957 return true;
958 }
959 return false;
960}
961
962/// Check if expanding this expression is likely to incur significant cost. This
963/// is tricky because SCEV doesn't track which expressions are actually computed
964/// by the current IR.
965///
966/// We currently allow expansion of IV increments that involve adds,
967/// multiplication by constants, and AddRecs from existing phis.
968///
969/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
970/// obvious multiple of the UDivExpr.
971static bool isHighCostExpansion(const SCEV *S,
972 SmallPtrSetImpl<const SCEV *> &Processed,
973 ScalarEvolution &SE) {
974 // Zero/One operand expressions
975 switch (S->getSCEVType()) {
976 case scUnknown:
977 case scConstant:
978 case scVScale:
979 return false;
980 case scTruncate:
981 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
982 Processed, SE);
983 case scZeroExtend:
984 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
985 Processed, SE);
986 case scSignExtend:
987 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
988 Processed, SE);
989 default:
990 break;
991 }
992
993 if (!Processed.insert(S).second)
994 return false;
995
996 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
997 for (const SCEV *S : Add->operands()) {
998 if (isHighCostExpansion(S, Processed, SE))
999 return true;
1000 }
1001 return false;
1002 }
1003
1004 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1005 if (Mul->getNumOperands() == 2) {
1006 // Multiplication by a constant is ok
1007 if (isa<SCEVConstant>(Mul->getOperand(0)))
1008 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1009
1010 // If we have the value of one operand, check if an existing
1011 // multiplication already generates this expression.
1012 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1013 Value *UVal = U->getValue();
1014 for (User *UR : UVal->users()) {
1015 // If U is a constant, it may be used by a ConstantExpr.
1016 Instruction *UI = dyn_cast<Instruction>(UR);
1017 if (UI && UI->getOpcode() == Instruction::Mul &&
1018 SE.isSCEVable(UI->getType())) {
1019 return SE.getSCEV(UI) == Mul;
1020 }
1021 }
1022 }
1023 }
1024 }
1025
1026 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1027 if (isExistingPhi(AR, SE))
1028 return false;
1029 }
1030
1031 // For now, consider any other type of expression (div/mul/min/max) high cost.
1032 return true;
1033}
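// For illustration (hypothetical expressions): a constant multiple of an IV
// that already exists as a loop phi, e.g. (4 * {0,+,1}<%L>), is not considered
// high cost, whereas a UDivExpr such as (%n /u 4) always falls through to the
// final "return true" above.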
1034
1035namespace {
1036
1037class LSRUse;
1038
1039} // end anonymous namespace
1040
1041/// Check if the addressing mode defined by \p F is completely
1042/// folded in \p LU at isel time.
1043/// This includes address-mode folding and special icmp tricks.
1044/// This function returns true if \p LU can accommodate what \p F
1045/// defines and up to 1 base + 1 scaled + offset.
1046/// In other words, if \p F has several base registers, this function may
1047/// still return true. Therefore, users still need to account for
1048/// additional base registers and/or unfolded offsets to derive an
1049/// accurate cost model.
1050static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1051 const LSRUse &LU, const Formula &F);
1052
1053// Get the cost of the scaling factor used in F for LU.
1054static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1055 const LSRUse &LU, const Formula &F,
1056 const Loop &L);
1057
1058namespace {
1059
1060/// This class is used to measure and compare candidate formulae.
1061class Cost {
1062 const Loop *L = nullptr;
1063 ScalarEvolution *SE = nullptr;
1064 const TargetTransformInfo *TTI = nullptr;
1065 TargetTransformInfo::LSRCost C;
1066 TTI::AddressingModeKind AMK = TTI::AMK_None;
1067
1068public:
1069 Cost() = delete;
1070 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1071 TTI::AddressingModeKind AMK) :
1072 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1073 C.Insns = 0;
1074 C.NumRegs = 0;
1075 C.AddRecCost = 0;
1076 C.NumIVMuls = 0;
1077 C.NumBaseAdds = 0;
1078 C.ImmCost = 0;
1079 C.SetupCost = 0;
1080 C.ScaleCost = 0;
1081 }
1082
1083 bool isLess(const Cost &Other) const;
1084
1085 void Lose();
1086
1087#ifndef NDEBUG
1088 // Once any of the metrics loses, they must all remain losers.
1089 bool isValid() {
1090 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1091 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1092 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1093 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1094 }
1095#endif
1096
1097 bool isLoser() {
1098 assert(isValid() && "invalid cost");
1099 return C.NumRegs == ~0u;
1100 }
1101
1102 void RateFormula(const Formula &F,
1103 SmallPtrSetImpl<const SCEV *> &Regs,
1104 const DenseSet<const SCEV *> &VisitedRegs,
1105 const LSRUse &LU,
1106 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1107
1108 void print(raw_ostream &OS) const;
1109 void dump() const;
1110
1111private:
1112 void RateRegister(const Formula &F, const SCEV *Reg,
1113 SmallPtrSetImpl<const SCEV *> &Regs);
1114 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1115 SmallPtrSetImpl<const SCEV *> &Regs,
1116 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1117};
1118
1119/// An operand value in an instruction which is to be replaced with some
1120/// equivalent, possibly strength-reduced, replacement.
1121struct LSRFixup {
1122 /// The instruction which will be updated.
1123 Instruction *UserInst = nullptr;
1124
1125 /// The operand of the instruction which will be replaced. The operand may be
1126 /// used more than once; every instance will be replaced.
1127 Value *OperandValToReplace = nullptr;
1128
1129 /// If this user is to use the post-incremented value of an induction
1130 /// variable, this set is non-empty and holds the loops associated with the
1131 /// induction variable.
1132 PostIncLoopSet PostIncLoops;
1133
1134 /// A constant offset to be added to the LSRUse expression. This allows
1135 /// multiple fixups to share the same LSRUse with different offsets, for
1136 /// example in an unrolled loop.
1137 int64_t Offset = 0;
1138
1139 LSRFixup() = default;
1140
1141 bool isUseFullyOutsideLoop(const Loop *L) const;
1142
1143 void print(raw_ostream &OS) const;
1144 void dump() const;
1145};
1146
1147/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1148/// SmallVectors of const SCEV*.
1149struct UniquifierDenseMapInfo {
1150 static SmallVector<const SCEV *, 4> getEmptyKey() {
1151 SmallVector<const SCEV *, 4> V;
1152 V.push_back(reinterpret_cast<const SCEV *>(-1));
1153 return V;
1154 }
1155
1156 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1157 SmallVector<const SCEV *, 4> V;
1158 V.push_back(reinterpret_cast<const SCEV *>(-2));
1159 return V;
1160 }
1161
1162 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1163 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1164 }
1165
1166 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1167 const SmallVector<const SCEV *, 4> &RHS) {
1168 return LHS == RHS;
1169 }
1170};
1171
1172/// This class holds the state that LSR keeps for each use in IVUsers, as well
1173/// as uses invented by LSR itself. It includes information about what kinds of
1174/// things can be folded into the user, information about the user itself, and
1175/// information about how the use may be satisfied. TODO: Represent multiple
1176/// users of the same expression in common?
1177class LSRUse {
1178 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1179
1180public:
1181 /// An enum for a kind of use, indicating what types of scaled and immediate
1182 /// operands it might support.
1183 enum KindType {
1184 Basic, ///< A normal use, with no folding.
1185 Special, ///< A special case of basic, allowing -1 scales.
1186 Address, ///< An address use; folding according to TargetLowering
1187 ICmpZero ///< An equality icmp with both operands folded into one.
1188 // TODO: Add a generic icmp too?
1189 };
1190
1191 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1192
1193 KindType Kind;
1194 MemAccessTy AccessTy;
1195
1196 /// The list of operands which are to be replaced.
1197 SmallVector<LSRFixup, 8> Fixups;
1198
1199 /// Keep track of the min and max offsets of the fixups.
1200 int64_t MinOffset = std::numeric_limits<int64_t>::max();
1201 int64_t MaxOffset = std::numeric_limits<int64_t>::min();
1202
1203 /// This records whether all of the fixups using this LSRUse are outside of
1204 /// the loop, in which case some special-case heuristics may be used.
1205 bool AllFixupsOutsideLoop = true;
1206
1207 /// RigidFormula is set to true to guarantee that this use will be associated
1208 /// with a single formula--the one that initially matched. Some SCEV
1209 /// expressions cannot be expanded. This allows LSR to consider the registers
1210 /// used by those expressions without the need to expand them later after
1211 /// changing the formula.
1212 bool RigidFormula = false;
1213
1214 /// This records the widest use type for any fixup using this
1215 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1216 /// fixup widths to be equivalent, because the narrower one may be relying on
1217 /// the implicit truncation to truncate away bogus bits.
1218 Type *WidestFixupType = nullptr;
1219
1220 /// A list of ways to build a value that can satisfy this user. After the
1221 /// list is populated, one of these is selected heuristically and used to
1222 /// formulate a replacement for OperandValToReplace in UserInst.
1223 SmallVector<Formula, 12> Formulae;
1224
1225 /// The set of register candidates used by all formulae in this LSRUse.
1226 SmallPtrSet<const SCEV *, 4> Regs;
1227
1228 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1229
1230 LSRFixup &getNewFixup() {
1231 Fixups.push_back(LSRFixup());
1232 return Fixups.back();
1233 }
1234
1235 void pushFixup(LSRFixup &f) {
1236 Fixups.push_back(f);
1237 if (f.Offset > MaxOffset)
1238 MaxOffset = f.Offset;
1239 if (f.Offset < MinOffset)
1240 MinOffset = f.Offset;
1241 }
1242
1243 bool HasFormulaWithSameRegs(const Formula &F) const;
1244 float getNotSelectedProbability(const SCEV *Reg) const;
1245 bool InsertFormula(const Formula &F, const Loop &L);
1246 void DeleteFormula(Formula &F);
1247 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1248
1249 void print(raw_ostream &OS) const;
1250 void dump() const;
1251};
1252
1253} // end anonymous namespace
1254
1255static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1256 LSRUse::KindType Kind, MemAccessTy AccessTy,
1257 GlobalValue *BaseGV, int64_t BaseOffset,
1258 bool HasBaseReg, int64_t Scale,
1259 Instruction *Fixup = nullptr);
1260
1261static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1262 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1263 return 1;
1264 if (Depth == 0)
1265 return 0;
1266 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1267 return getSetupCost(S->getStart(), Depth - 1);
1268 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1269 return getSetupCost(S->getOperand(), Depth - 1);
1270 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1271 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1272 [&](unsigned i, const SCEV *Reg) {
1273 return i + getSetupCost(Reg, Depth - 1);
1274 });
1275 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1276 return getSetupCost(S->getLHS(), Depth - 1) +
1277 getSetupCost(S->getRHS(), Depth - 1);
1278 return 0;
1279}
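// Worked example (hypothetical SCEVs): with a sufficiently large Depth,
// getSetupCost((%a + %b), Depth) sums the per-operand costs and yields
// 1 + 1 = 2, and getSetupCost({(%a + %b),+,4}<%L>, Depth) recurses only into
// the start value, also yielding 2. Once Depth hits 0 the walk conservatively
// returns 0, which is what bounds the estimate to SetupCostDepthLimit levels.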
1280
1281/// Tally up interesting quantities from the given register.
1282void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1283 SmallPtrSetImpl<const SCEV *> &Regs) {
1284 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1285 // If this is an addrec for another loop, it should be an invariant
1286 // with respect to L since L is the innermost loop (at least
1287 // for now LSR only handles innermost loops).
1288 if (AR->getLoop() != L) {
1289 // If the AddRec exists, consider its register free and leave it alone.
1290 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1291 return;
1292
1293 // It is bad to allow LSR for current loop to add induction variables
1294 // for its sibling loops.
1295 if (!AR->getLoop()->contains(L)) {
1296 Lose();
1297 return;
1298 }
1299
1300 // Otherwise, it will be an invariant with respect to Loop L.
1301 ++C.NumRegs;
1302 return;
1303 }
1304
1305 unsigned LoopCost = 1;
1306 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1307 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1308
1309 // If the step size matches the base offset, we could use pre-indexed
1310 // addressing.
1311 if (AMK == TTI::AMK_PreIndexed) {
1312 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1313 if (Step->getAPInt() == F.BaseOffset)
1314 LoopCost = 0;
1315 } else if (AMK == TTI::AMK_PostIndexed) {
1316 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1317 if (isa<SCEVConstant>(LoopStep)) {
1318 const SCEV *LoopStart = AR->getStart();
1319 if (!isa<SCEVConstant>(LoopStart) &&
1320 SE->isLoopInvariant(LoopStart, L))
1321 LoopCost = 0;
1322 }
1323 }
1324 }
1325 C.AddRecCost += LoopCost;
1326
1327 // Add the step value register, if it needs one.
1328 // TODO: The non-affine case isn't precisely modeled here.
1329 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1330 if (!Regs.count(AR->getOperand(1))) {
1331 RateRegister(F, AR->getOperand(1), Regs);
1332 if (isLoser())
1333 return;
1334 }
1335 }
1336 }
1337 ++C.NumRegs;
1338
1339 // Rough heuristic; favor registers which don't require extra setup
1340 // instructions in the preheader.
1341 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1342 // Ensure we don't, even with the recursion limit, produce invalid costs.
1343 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1344
1345 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1346 SE->hasComputableLoopEvolution(Reg, L);
1347}
1348
1349/// Record this register in the set. If we haven't seen it before, rate
1350/// it. Optional LoserRegs provides a way to declare any formula that refers to
1351/// one of those regs an instant loser.
1352void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1353 SmallPtrSetImpl<const SCEV *> &Regs,
1354 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1355 if (LoserRegs && LoserRegs->count(Reg)) {
1356 Lose();
1357 return;
1358 }
1359 if (Regs.insert(Reg).second) {
1360 RateRegister(F, Reg, Regs);
1361 if (LoserRegs && isLoser())
1362 LoserRegs->insert(Reg);
1363 }
1364}
1365
1366void Cost::RateFormula(const Formula &F,
1367 SmallPtrSetImpl<const SCEV *> &Regs,
1368 const DenseSet<const SCEV *> &VisitedRegs,
1369 const LSRUse &LU,
1370 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1371 if (isLoser())
1372 return;
1373 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1374 // Tally up the registers.
1375 unsigned PrevAddRecCost = C.AddRecCost;
1376 unsigned PrevNumRegs = C.NumRegs;
1377 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1378 if (const SCEV *ScaledReg = F.ScaledReg) {
1379 if (VisitedRegs.count(ScaledReg)) {
1380 Lose();
1381 return;
1382 }
1383 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1384 if (isLoser())
1385 return;
1386 }
1387 for (const SCEV *BaseReg : F.BaseRegs) {
1388 if (VisitedRegs.count(BaseReg)) {
1389 Lose();
1390 return;
1391 }
1392 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1393 if (isLoser())
1394 return;
1395 }
1396
1397 // Determine how many (unfolded) adds we'll need inside the loop.
1398 size_t NumBaseParts = F.getNumRegs();
1399 if (NumBaseParts > 1)
1400 // Do not count the base and a possible second register if the target
1401 // can fold 2 registers.
1402 C.NumBaseAdds +=
1403 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1404 C.NumBaseAdds += (F.UnfoldedOffset != 0);
1405
1406 // Accumulate non-free scaling amounts.
1407 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1408
1409 // Tally up the non-zero immediates.
1410 for (const LSRFixup &Fixup : LU.Fixups) {
1411 int64_t O = Fixup.Offset;
1412 int64_t Offset = (uint64_t)O + F.BaseOffset;
1413 if (F.BaseGV)
1414 C.ImmCost += 64; // Handle symbolic values conservatively.
1415 // TODO: This should probably be the pointer size.
1416 else if (Offset != 0)
1417 C.ImmCost += APInt(64, Offset, true).getSignificantBits();
1418
1419 // Check with target if this offset with this instruction is
1420 // specifically not supported.
1421 if (LU.Kind == LSRUse::Address && Offset != 0 &&
1422 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1423 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1424 C.NumBaseAdds++;
1425 }
1426
1427 // If we don't count instruction cost exit here.
1428 if (!InsnsCost) {
1429 assert(isValid() && "invalid cost");
1430 return;
1431 }
1432
1433 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1434 // an additional instruction (at least a fill).
1435 // TODO: Do we need to distinguish register classes?
1436 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1437 TTI->getRegisterClassForType(false, F.getType())) - 1;
1438 if (C.NumRegs > TTIRegNum) {
1439 // Cost already exceeded TTIRegNum, so only newly added registers can add
1440 // new instructions.
1441 if (PrevNumRegs > TTIRegNum)
1442 C.Insns += (C.NumRegs - PrevNumRegs);
1443 else
1444 C.Insns += (C.NumRegs - TTIRegNum);
1445 }
1446
1447 // If an ICmpZero formula does not end at 0, it cannot be replaced by just an
1448 // add or sub. We'll need to compare the final result of the AddRec, which
1449 // means we'll need an additional instruction. But if the target can
1450 // macro-fuse a compare with a branch, don't count this extra instruction.
1451 // For -10 + {0, +, 1}:
1452 // i = i + 1;
1453 // cmp i, 10
1454 //
1455 // For {-10, +, 1}:
1456 // i = i + 1;
1457 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1458 !TTI->canMacroFuseCmp())
1459 C.Insns++;
1460 // Each new AddRec adds 1 instruction to calculation.
1461 C.Insns += (C.AddRecCost - PrevAddRecCost);
1462
1463 // BaseAdds adds instructions for unfolded registers.
1464 if (LU.Kind != LSRUse::ICmpZero)
1465 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1466 assert(isValid() && "invalid cost");
1467}
1468
1469/// Set this cost to a losing value.
1470void Cost::Lose() {
1471 C.Insns = std::numeric_limits<unsigned>::max();
1472 C.NumRegs = std::numeric_limits<unsigned>::max();
1473 C.AddRecCost = std::numeric_limits<unsigned>::max();
1474 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1475 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1476 C.ImmCost = std::numeric_limits<unsigned>::max();
1477 C.SetupCost = std::numeric_limits<unsigned>::max();
1478 C.ScaleCost = std::numeric_limits<unsigned>::max();
1479}
1480
1481/// Choose the lower cost.
1482bool Cost::isLess(const Cost &Other) const {
1483 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1484 C.Insns != Other.C.Insns)
1485 return C.Insns < Other.C.Insns;
1486 return TTI->isLSRCostLess(C, Other.C);
1487}
1488
1489#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1490void Cost::print(raw_ostream &OS) const {
1491 if (InsnsCost)
1492 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1493 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1494 if (C.AddRecCost != 0)
1495 OS << ", with addrec cost " << C.AddRecCost;
1496 if (C.NumIVMuls != 0)
1497 OS << ", plus " << C.NumIVMuls << " IV mul"
1498 << (C.NumIVMuls == 1 ? "" : "s");
1499 if (C.NumBaseAdds != 0)
1500 OS << ", plus " << C.NumBaseAdds << " base add"
1501 << (C.NumBaseAdds == 1 ? "" : "s");
1502 if (C.ScaleCost != 0)
1503 OS << ", plus " << C.ScaleCost << " scale cost";
1504 if (C.ImmCost != 0)
1505 OS << ", plus " << C.ImmCost << " imm cost";
1506 if (C.SetupCost != 0)
1507 OS << ", plus " << C.SetupCost << " setup cost";
1508}
1509
1510LLVM_DUMP_METHOD void Cost::dump() const {
1511 print(errs()); errs() << '\n';
1512}
1513#endif
1514
1515/// Test whether this fixup always uses its value outside of the given loop.
1516bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1517 // PHI nodes use their value in their incoming blocks.
1518 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1519 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1520 if (PN->getIncomingValue(i) == OperandValToReplace &&
1521 L->contains(PN->getIncomingBlock(i)))
1522 return false;
1523 return true;
1524 }
1525
1526 return !L->contains(UserInst);
1527}
1528
1529#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1530void LSRFixup::print(raw_ostream &OS) const {
1531 OS << "UserInst=";
1532 // Store is common and interesting enough to be worth special-casing.
1533 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1534 OS << "store ";
1535 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1536 } else if (UserInst->getType()->isVoidTy())
1537 OS << UserInst->getOpcodeName();
1538 else
1539 UserInst->printAsOperand(OS, /*PrintType=*/false);
1540
1541 OS << ", OperandValToReplace=";
1542 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1543
1544 for (const Loop *PIL : PostIncLoops) {
1545 OS << ", PostIncLoop=";
1546 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1547 }
1548
1549 if (Offset != 0)
1550 OS << ", Offset=" << Offset;
1551}
1552
1553LLVM_DUMP_METHOD void LSRFixup::dump() const {
1554 print(errs()); errs() << '\n';
1555}
1556#endif
1557
1558/// Test whether this use has a formula which has the same registers as the given
1559/// formula.
1560bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1561 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1562 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1563 // Unstable sort by host order ok, because this is only used for uniquifying.
1564 llvm::sort(Key);
1565 return Uniquifier.count(Key);
1566}
1567
1568/// The function returns a probability of selecting formula without Reg.
1569float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1570 unsigned FNum = 0;
1571 for (const Formula &F : Formulae)
1572 if (F.referencesReg(Reg))
1573 FNum++;
1574 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1575}
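// Worked example (hypothetical counts): if this use has 4 formulae and exactly
// one of them references Reg, the probability of selecting a formula that does
// not use Reg is (4 - 1) / 4 = 0.75.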
1576
1577/// If the given formula has not yet been inserted, add it to the list, and
1578/// return true. Return false otherwise. The formula must be in canonical form.
1579bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1580 assert(F.isCanonical(L) && "Invalid canonical representation");
1581
1582 if (!Formulae.empty() && RigidFormula)
1583 return false;
1584
1585 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1586 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1587 // Unstable sort by host order ok, because this is only used for uniquifying.
1588 llvm::sort(Key);
1589
1590 if (!Uniquifier.insert(Key).second)
1591 return false;
1592
1593 // Using a register to hold the value of 0 is not profitable.
1594 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1595 "Zero allocated in a scaled register!");
1596#ifndef NDEBUG
1597 for (const SCEV *BaseReg : F.BaseRegs)
1598 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1599#endif
1600
1601 // Add the formula to the list.
1602 Formulae.push_back(F);
1603
1604 // Record registers now being used by this use.
1605 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1606 if (F.ScaledReg)
1607 Regs.insert(F.ScaledReg);
1608
1609 return true;
1610}
1611
1612/// Remove the given formula from this use's list.
1613void LSRUse::DeleteFormula(Formula &F) {
1614 if (&F != &Formulae.back())
1615 std::swap(F, Formulae.back());
1616 Formulae.pop_back();
1617}
1618
1619/// Recompute the Regs field, and update RegUses.
1620void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1621 // Now that we've filtered out some formulae, recompute the Regs set.
1622 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1623 Regs.clear();
1624 for (const Formula &F : Formulae) {
1625 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1626 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1627 }
1628
1629 // Update the RegTracker.
1630 for (const SCEV *S : OldRegs)
1631 if (!Regs.count(S))
1632 RegUses.dropRegister(S, LUIdx);
1633}
1634
1635#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1636void LSRUse::print(raw_ostream &OS) const {
1637 OS << "LSR Use: Kind=";
1638 switch (Kind) {
1639 case Basic: OS << "Basic"; break;
1640 case Special: OS << "Special"; break;
1641 case ICmpZero: OS << "ICmpZero"; break;
1642 case Address:
1643 OS << "Address of ";
1644 if (AccessTy.MemTy->isPointerTy())
1645 OS << "pointer"; // the full pointer type could be really verbose
1646 else {
1647 OS << *AccessTy.MemTy;
1648 }
1649
1650 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1651 }
1652
1653 OS << ", Offsets={";
1654 bool NeedComma = false;
1655 for (const LSRFixup &Fixup : Fixups) {
1656 if (NeedComma) OS << ',';
1657 OS << Fixup.Offset;
1658 NeedComma = true;
1659 }
1660 OS << '}';
1661
1662 if (AllFixupsOutsideLoop)
1663 OS << ", all-fixups-outside-loop";
1664
1665 if (WidestFixupType)
1666 OS << ", widest fixup type: " << *WidestFixupType;
1667}
1668
1669LLVM_DUMP_METHOD void LSRUse::dump() const {
1670 print(errs()); errs() << '\n';
1671}
1672#endif
1673
1674static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1675 LSRUse::KindType Kind, MemAccessTy AccessTy,
1676 GlobalValue *BaseGV, int64_t BaseOffset,
1677 bool HasBaseReg, int64_t Scale,
1678 Instruction *Fixup/*= nullptr*/) {
1679 switch (Kind) {
1680 case LSRUse::Address:
1681 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
1682 HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
1683
1684 case LSRUse::ICmpZero:
1685 // There's not even a target hook for querying whether it would be legal to
1686 // fold a GV into an ICmp.
1687 if (BaseGV)
1688 return false;
1689
1690 // ICmp only has two operands; don't allow more than two non-trivial parts.
1691 if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1692 return false;
1693
1694 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1695 // putting the scaled register in the other operand of the icmp.
1696 if (Scale != 0 && Scale != -1)
1697 return false;
1698
1699 // If we have low-level target information, ask the target if it can fold an
1700 // integer immediate on an icmp.
1701 if (BaseOffset != 0) {
1702 // We have one of:
1703 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1704 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1705 // Offs is the ICmp immediate.
1706 if (Scale == 0)
1707 // The cast does the right thing with
1708 // std::numeric_limits<int64_t>::min().
1709 BaseOffset = -(uint64_t)BaseOffset;
1710 return TTI.isLegalICmpImmediate(BaseOffset);
1711 }
1712
1713 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1714 return true;
1715
1716 case LSRUse::Basic:
1717 // Only handle single-register values.
1718 return !BaseGV && Scale == 0 && BaseOffset == 0;
1719
1720 case LSRUse::Special:
1721 // Special case Basic to handle -1 scales.
1722 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
1723 }
1724
1725 llvm_unreachable("Invalid LSRUse Kind!");
1726}
1727
1728static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1729 int64_t MinOffset, int64_t MaxOffset,
1730 LSRUse::KindType Kind, MemAccessTy AccessTy,
1731 GlobalValue *BaseGV, int64_t BaseOffset,
1732 bool HasBaseReg, int64_t Scale) {
1733 // Check for overflow.
1734 if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1735 (MinOffset > 0))
1736 return false;
1737 MinOffset = (uint64_t)BaseOffset + MinOffset;
1738 if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1739 (MaxOffset > 0))
1740 return false;
1741 MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1742
1743 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1744 HasBaseReg, Scale) &&
1745 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1746 HasBaseReg, Scale);
1747}
1748
1749static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1750 int64_t MinOffset, int64_t MaxOffset,
1751 LSRUse::KindType Kind, MemAccessTy AccessTy,
1752 const Formula &F, const Loop &L) {
1753 // For the purpose of isAMCompletelyFolded either having a canonical formula
1754 // or a scale not equal to zero is correct.
1755 // Problems may arise from non-canonical formulae having a scale == 0.
1756 // Strictly speaking, it would be best to just rely on canonical formulae.
1757 // However, when we generate the scaled formulae, we first check that the
1758 // scaling factor is profitable before computing the actual ScaledReg, for
1759 // compile time's sake.
1760 assert((F.isCanonical(L) || F.Scale != 0));
1761 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1762 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1763}
1764
1765/// Test whether we know how to expand the current formula.
1766static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1767 int64_t MaxOffset, LSRUse::KindType Kind,
1768 MemAccessTy AccessTy, GlobalValue *BaseGV,
1769 int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1770 // We know how to expand completely foldable formulae.
1771 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1772 BaseOffset, HasBaseReg, Scale) ||
1773 // Or formulae that use a base register produced by a sum of base
1774 // registers.
1775 (Scale == 1 &&
1776 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1777 BaseGV, BaseOffset, true, 0));
1778}
1779
1780static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1781 int64_t MaxOffset, LSRUse::KindType Kind,
1782 MemAccessTy AccessTy, const Formula &F) {
1783 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1784 F.BaseOffset, F.HasBaseReg, F.Scale);
1785}
1786
1787static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1788 const LSRUse &LU, const Formula &F) {
1789 // Target may want to look at the user instructions.
1790 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1791 for (const LSRFixup &Fixup : LU.Fixups)
1792 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1793 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1794 F.Scale, Fixup.UserInst))
1795 return false;
1796 return true;
1797 }
1798
1799 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1800 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1801 F.Scale);
1802}
1803
1804static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1805 const LSRUse &LU, const Formula &F,
1806 const Loop &L) {
1807 if (!F.Scale)
1808 return 0;
1809
1810 // If the use is not completely folded in that instruction, we will have to
1811 // pay an extra cost only for scale != 1.
1812 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1813 LU.AccessTy, F, L))
1814 return F.Scale != 1;
1815
1816 switch (LU.Kind) {
1817 case LSRUse::Address: {
1818 // Check the scaling factor cost with both the min and max offsets.
1819 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1820 LU.AccessTy.MemTy, F.BaseGV,
1821 StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg,
1822 F.Scale, LU.AccessTy.AddrSpace);
1823 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1824 LU.AccessTy.MemTy, F.BaseGV,
1825 StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg,
1826 F.Scale, LU.AccessTy.AddrSpace);
1827
1828 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1829 "Legal addressing mode has an illegal cost!");
1830 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1831 }
1832 case LSRUse::ICmpZero:
1833 case LSRUse::Basic:
1834 case LSRUse::Special:
1835 // The use is completely folded, i.e., everything is folded into the
1836 // instruction.
1837 return 0;
1838 }
1839
1840 llvm_unreachable("Invalid LSRUse Kind!");
1841}
1842
1843static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1844 LSRUse::KindType Kind, MemAccessTy AccessTy,
1845 GlobalValue *BaseGV, int64_t BaseOffset,
1846 bool HasBaseReg) {
1847 // Fast-path: zero is always foldable.
1848 if (BaseOffset == 0 && !BaseGV) return true;
1849
1850 // Conservatively, create an address with an immediate and a
1851 // base and a scale.
1852 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1853
1854 // Canonicalize a scale of 1 to a base register if the formula doesn't
1855 // already have a base register.
1856 if (!HasBaseReg && Scale == 1) {
1857 Scale = 0;
1858 HasBaseReg = true;
1859 }
1860
1861 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
1862 HasBaseReg, Scale);
1863}
1864
1864
1865static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1866 ScalarEvolution &SE, int64_t MinOffset,
1867 int64_t MaxOffset, LSRUse::KindType Kind,
1868 MemAccessTy AccessTy, const SCEV *S,
1869 bool HasBaseReg) {
1870 // Fast-path: zero is always foldable.
1871 if (S->isZero()) return true;
1872
1873 // Conservatively, create an address with an immediate and a
1874 // base and a scale.
1875 int64_t BaseOffset = ExtractImmediate(S, SE);
1876 GlobalValue *BaseGV = ExtractSymbol(S, SE);
1877
1878 // If there's anything else involved, it's not foldable.
1879 if (!S->isZero()) return false;
1880
1881 // Fast-path: zero is always foldable.
1882 if (BaseOffset == 0 && !BaseGV) return true;
1883
1884 // Conservatively, create an address with an immediate and a
1885 // base and a scale.
1886 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1887
1888 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1889 BaseOffset, HasBaseReg, Scale);
1890}
1891
1892namespace {
1893
1894/// An individual increment in a Chain of IV increments. Relate an IV user to
1895/// an expression that computes the IV it uses from the IV used by the previous
1896/// link in the Chain.
1897///
1898/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1899/// original IVOperand. The head of the chain's IVOperand is only valid during
1900/// chain collection, before LSR replaces IV users. During chain generation,
1901/// IncExpr can be used to find the new IVOperand that computes the same
1902/// expression.
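///
/// For example (illustrative only): if the previous link's IVOperand computes
/// {%base,+,%stride} and this user's operand computes {(%base + 4),+,%stride},
/// then this link's IncExpr is the constant 4.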
1903struct IVInc {
1904 Instruction *UserInst;
1905 Value* IVOperand;
1906 const SCEV *IncExpr;
1907
1908 IVInc(Instruction *U, Value *O, const SCEV *E)
1909 : UserInst(U), IVOperand(O), IncExpr(E) {}
1910};
1911
1912// The list of IV increments in program order. We typically add the head of a
1913// chain without finding subsequent links.
1914struct IVChain {
1915 SmallVector<IVInc, 1> Incs;
1916 const SCEV *ExprBase = nullptr;
1917
1918 IVChain() = default;
1919 IVChain(const IVInc &Head, const SCEV *Base)
1920 : Incs(1, Head), ExprBase(Base) {}
1921
1922 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
1923
1924 // Return the first increment in the chain.
1925 const_iterator begin() const {
1926 assert(!Incs.empty());
1927 return std::next(Incs.begin());
1928 }
1929 const_iterator end() const {
1930 return Incs.end();
1931 }
1932
1933 // Returns true if this chain contains any increments.
1934 bool hasIncs() const { return Incs.size() >= 2; }
1935
1936 // Add an IVInc to the end of this chain.
1937 void add(const IVInc &X) { Incs.push_back(X); }
1938
1939 // Returns the last UserInst in the chain.
1940 Instruction *tailUserInst() const { return Incs.back().UserInst; }
1941
1942 // Returns true if IncExpr can be profitably added to this chain.
1943 bool isProfitableIncrement(const SCEV *OperExpr,
1944 const SCEV *IncExpr,
1945 ScalarEvolution &SE);
1946};
1947
1948/// Helper for CollectChains to track multiple IV increment uses. Distinguish
1949/// between FarUsers that definitely cross IV increments and NearUsers that may
1950/// be used between IV increments.
1951struct ChainUsers {
1952 SmallPtrSet<Instruction*, 4> FarUsers;
1953 SmallPtrSet<Instruction*, 4> NearUsers;
1954};
1955
1956/// This class holds state for the main loop strength reduction logic.
1957class LSRInstance {
1958 IVUsers &IU;
1959 ScalarEvolution &SE;
1960 DominatorTree &DT;
1961 LoopInfo &LI;
1962 AssumptionCache &AC;
1963 TargetLibraryInfo &TLI;
1964 const TargetTransformInfo &TTI;
1965 Loop *const L;
1966 MemorySSAUpdater *MSSAU;
1968 mutable SCEVExpander Rewriter;
1969 bool Changed = false;
1970
1971 /// This is the insert position that the current loop's induction variable
1972 /// increment should be placed. In simple loops, this is the latch block's
1973 /// terminator. But in more complicated cases, this is a position which will
1974 /// dominate all the in-loop post-increment users.
1975 Instruction *IVIncInsertPos = nullptr;
1976
1977 /// Interesting factors between use strides.
1978 ///
1979 /// We explicitly use a SetVector which contains a SmallSet, instead of the
1980 /// default, a SmallDenseSet, because we need to use the full range of
1981 /// int64_ts, and there's currently no good way of doing that with
1982 /// SmallDenseSet.
1983 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
1984
1985 /// The cost of the current SCEV; the best solution found by LSR will be
1986 /// dropped if it is not profitable compared to this baseline cost.
1987 Cost BaselineCost;
1988
1989 /// Interesting use types, to facilitate truncation reuse.
1990 SmallSetVector<Type *, 4> Types;
1991
1992 /// The list of interesting uses.
1993 SmallVector<LSRUse, 16> Uses;
1994
1995 /// Track which uses use which register candidates.
1996 RegUseTracker RegUses;
1997
1998 // Limit the number of chains to avoid quadratic behavior. We don't expect to
1999 // have more than a few IV increment chains in a loop. Missing a Chain falls
2000 // back to normal LSR behavior for those uses.
2001 static const unsigned MaxChains = 8;
2002
2003 /// IV users can form a chain of IV increments.
2004 SmallVector<IVChain, 8> IVChainVec;
2005
2006 /// IV users that belong to profitable IVChains.
2007 SmallPtrSet<Use*, MaxChains> IVIncSet;
2008
2009 /// Induction variables that were generated and inserted by the SCEV Expander.
2010 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2011
2012 void OptimizeShadowIV();
2013 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2014 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2015 void OptimizeLoopTermCond();
2016
2017 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2018 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2019 void FinalizeChain(IVChain &Chain);
2020 void CollectChains();
2021 void GenerateIVChain(const IVChain &Chain,
2022 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2023
2024 void CollectInterestingTypesAndFactors();
2025 void CollectFixupsAndInitialFormulae();
2026
2027 // Support for sharing of LSRUses between LSRFixups.
2028 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2029 UseMapTy UseMap;
2030
2031 bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
2032 LSRUse::KindType Kind, MemAccessTy AccessTy);
2033
2034 std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2035 MemAccessTy AccessTy);
2036
2037 void DeleteUse(LSRUse &LU, size_t LUIdx);
2038
2039 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2040
2041 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2042 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2043 void CountRegisters(const Formula &F, size_t LUIdx);
2044 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2045
2046 void CollectLoopInvariantFixupsAndFormulae();
2047
2048 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2049 unsigned Depth = 0);
2050
2051 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2052 const Formula &Base, unsigned Depth,
2053 size_t Idx, bool IsScaledReg = false);
2054 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2055 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2056 const Formula &Base, size_t Idx,
2057 bool IsScaledReg = false);
2058 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2059 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2060 const Formula &Base,
2061 const SmallVectorImpl<int64_t> &Worklist,
2062 size_t Idx, bool IsScaledReg = false);
2063 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2064 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2065 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2066 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2067 void GenerateCrossUseConstantOffsets();
2068 void GenerateAllReuseFormulae();
2069
2070 void FilterOutUndesirableDedicatedRegisters();
2071
2072 size_t EstimateSearchSpaceComplexity() const;
2073 void NarrowSearchSpaceByDetectingSupersets();
2074 void NarrowSearchSpaceByCollapsingUnrolledCode();
2075 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2076 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2077 void NarrowSearchSpaceByFilterPostInc();
2078 void NarrowSearchSpaceByDeletingCostlyFormulas();
2079 void NarrowSearchSpaceByPickingWinnerRegs();
2080 void NarrowSearchSpaceUsingHeuristics();
2081
2082 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2083 Cost &SolutionCost,
2084 SmallVectorImpl<const Formula *> &Workspace,
2085 const Cost &CurCost,
2086 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2087 DenseSet<const SCEV *> &VisitedRegs) const;
2088 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2089
2090 BasicBlock::iterator
2091 HoistInsertPosition(BasicBlock::iterator IP,
2092 const SmallVectorImpl<Instruction *> &Inputs) const;
2093 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2094 const LSRFixup &LF,
2095 const LSRUse &LU) const;
2096
2097 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2098 BasicBlock::iterator IP,
2099 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2100 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2101 const Formula &F,
2102 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2103 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2104 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2105 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2106
2107public:
2108 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2109 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2110 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2111
2112 bool getChanged() const { return Changed; }
2113 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2114 return ScalarEvolutionIVs;
2115 }
2116
2117 void print_factors_and_types(raw_ostream &OS) const;
2118 void print_fixups(raw_ostream &OS) const;
2119 void print_uses(raw_ostream &OS) const;
2120 void print(raw_ostream &OS) const;
2121 void dump() const;
2122};
2123
2124} // end anonymous namespace
2125
2126/// If IV is used in an int-to-float cast inside the loop then try to eliminate
2127/// the cast operation.
2128void LSRInstance::OptimizeShadowIV() {
2129 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2130 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2131 return;
2132
2133 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2134 UI != E; /* empty */) {
2135 IVUsers::const_iterator CandidateUI = UI;
2136 ++UI;
2137 Instruction *ShadowUse = CandidateUI->getUser();
2138 Type *DestTy = nullptr;
2139 bool IsSigned = false;
2140
2141 /* If shadow use is an int->float cast then insert a second IV
2142 to eliminate this cast.
2143
2144 for (unsigned i = 0; i < n; ++i)
2145 foo((double)i);
2146
2147 is transformed into
2148
2149 double d = 0.0;
2150 for (unsigned i = 0; i < n; ++i, ++d)
2151 foo(d);
2152 */
2153 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2154 IsSigned = false;
2155 DestTy = UCast->getDestTy();
2156 }
2157 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2158 IsSigned = true;
2159 DestTy = SCast->getDestTy();
2160 }
2161 if (!DestTy) continue;
2162
2163 // If target does not support DestTy natively then do not apply
2164 // this transformation.
2165 if (!TTI.isTypeLegal(DestTy)) continue;
2166
2167 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2168 if (!PH) continue;
2169 if (PH->getNumIncomingValues() != 2) continue;
2170
2171 // If the calculation in integers overflows, the result in FP type will
2172 // differ. So we can only do this transformation if we are guaranteed not to
2173 // deal with overflowing values.
2174 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2175 if (!AR) continue;
2176 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2177 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2178
2179 Type *SrcTy = PH->getType();
2180 int Mantissa = DestTy->getFPMantissaWidth();
2181 if (Mantissa == -1) continue;
2182 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2183 continue;
2184
2185 unsigned Entry, Latch;
2186 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2187 Entry = 0;
2188 Latch = 1;
2189 } else {
2190 Entry = 1;
2191 Latch = 0;
2192 }
2193
2194 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2195 if (!Init) continue;
2196 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2197 (double)Init->getSExtValue() :
2198 (double)Init->getZExtValue());
2199
2200 BinaryOperator *Incr =
2201 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2202 if (!Incr) continue;
2203 if (Incr->getOpcode() != Instruction::Add
2204 && Incr->getOpcode() != Instruction::Sub)
2205 continue;
2206
2207 /* Initialize new IV, double d = 0.0 in above example. */
2208 ConstantInt *C = nullptr;
2209 if (Incr->getOperand(0) == PH)
2210 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2211 else if (Incr->getOperand(1) == PH)
2212 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2213 else
2214 continue;
2215
2216 if (!C) continue;
2217
2218 // Ignore negative constants, as the code below doesn't handle them
2219 // correctly. TODO: Remove this restriction.
2220 if (!C->getValue().isStrictlyPositive())
2221 continue;
2222
2223 /* Add new PHINode. */
2224 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2225
2226 /* create new increment. '++d' in above example. */
2227 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2228 BinaryOperator *NewIncr = BinaryOperator::Create(
2229 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2230 : Instruction::FSub,
2231 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2232
2233 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2234 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2235
2236 /* Remove cast operation */
2237 ShadowUse->replaceAllUsesWith(NewPH);
2238 ShadowUse->eraseFromParent();
2239 Changed = true;
2240 break;
2241 }
2242}
2243
2244/// If Cond has an operand that is an expression of an IV, set the IV user and
2245/// stride information and return true, otherwise return false.
2246bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2247 for (IVStrideUse &U : IU)
2248 if (U.getUser() == Cond) {
2249 // NOTE: we could handle setcc instructions with multiple uses here, but
2250 // InstCombine does it as well for simple uses, and it's not clear that it
2251 // occurs enough in real life to handle.
2252 CondUse = &U;
2253 return true;
2254 }
2255 return false;
2256}
2257
2258/// Rewrite the loop's terminating condition if it uses a max computation.
2259///
2260/// This is a narrow solution to a specific, but acute, problem. For loops
2261/// like this:
2262///
2263/// i = 0;
2264/// do {
2265/// p[i] = 0.0;
2266/// } while (++i < n);
2267///
2268/// the trip count isn't just 'n', because 'n' might not be positive. And
2269/// unfortunately this can come up even for loops where the user didn't use
2270/// a C do-while loop. For example, seemingly well-behaved top-test loops
2271/// will commonly be lowered like this:
2272///
2273/// if (n > 0) {
2274/// i = 0;
2275/// do {
2276/// p[i] = 0.0;
2277/// } while (++i < n);
2278/// }
2279///
2280/// and then it's possible for subsequent optimization to obscure the if
2281/// test in such a way that indvars can't find it.
2282///
2283/// When indvars can't find the if test in loops like this, it creates a
2284/// max expression, which allows it to give the loop a canonical
2285/// induction variable:
2286///
2287/// i = 0;
2288/// max = n < 1 ? 1 : n;
2289/// do {
2290/// p[i] = 0.0;
2291/// } while (++i != max);
2292///
2293/// Canonical induction variables are necessary because the loop passes
2294/// are designed around them. The most obvious example of this is the
2295/// LoopInfo analysis, which doesn't remember trip count values. It
2296/// expects to be able to rediscover the trip count each time it is
2297/// needed, and it does this using a simple analysis that only succeeds if
2298/// the loop has a canonical induction variable.
2299///
2300/// However, when it comes time to generate code, the maximum operation
2301/// can be quite costly, especially if it's inside of an outer loop.
2302///
2303/// This function solves this problem by detecting this type of loop and
2304/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2305/// the instructions for the maximum computation.
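///
/// For example, after this rewrite the loop above effectively becomes:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// and the now-dead "max = n < 1 ? 1 : n" computation is erased.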
2306ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2307 // Check that the loop matches the pattern we're looking for.
2308 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2309 Cond->getPredicate() != CmpInst::ICMP_NE)
2310 return Cond;
2311
2312 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2313 if (!Sel || !Sel->hasOneUse()) return Cond;
2314
2315 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2316 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2317 return Cond;
2318 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2319
2320 // Add one to the backedge-taken count to get the trip count.
2321 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2322 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2323
2324 // Check for a max calculation that matches the pattern. There's no check
2325 // for ICMP_ULE here because the comparison would be with zero, which
2326 // isn't interesting.
2327 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2328 const SCEVNAryExpr *Max = nullptr;
2329 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2330 Pred = ICmpInst::ICMP_SLE;
2331 Max = S;
2332 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2333 Pred = ICmpInst::ICMP_SLT;
2334 Max = S;
2335 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2336 Pred = ICmpInst::ICMP_ULT;
2337 Max = U;
2338 } else {
2339 // No match; bail.
2340 return Cond;
2341 }
2342
2343 // To handle a max with more than two operands, this optimization would
2344 // require additional checking and setup.
2345 if (Max->getNumOperands() != 2)
2346 return Cond;
2347
2348 const SCEV *MaxLHS = Max->getOperand(0);
2349 const SCEV *MaxRHS = Max->getOperand(1);
2350
2351 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2352 // for a comparison with 1. For <= and >=, a comparison with zero.
2353 if (!MaxLHS ||
2354 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2355 return Cond;
2356
2357 // Check the relevant induction variable for conformance to
2358 // the pattern.
2359 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2360 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2361 if (!AR || !AR->isAffine() ||
2362 AR->getStart() != One ||
2363 AR->getStepRecurrence(SE) != One)
2364 return Cond;
2365
2366 assert(AR->getLoop() == L &&
2367 "Loop condition operand is an addrec in a different loop!");
2368
2369 // Check the right operand of the select, and remember it, as it will
2370 // be used in the new comparison instruction.
2371 Value *NewRHS = nullptr;
2372 if (ICmpInst::isTrueWhenEqual(Pred)) {
2373 // Look for n+1, and grab n.
2374 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2375 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2376 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2377 NewRHS = BO->getOperand(0);
2378 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2379 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2380 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2381 NewRHS = BO->getOperand(0);
2382 if (!NewRHS)
2383 return Cond;
2384 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2385 NewRHS = Sel->getOperand(1);
2386 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2387 NewRHS = Sel->getOperand(2);
2388 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2389 NewRHS = SU->getValue();
2390 else
2391 // Max doesn't match expected pattern.
2392 return Cond;
2393
2394 // Determine the new comparison opcode. It may be signed or unsigned,
2395 // and the original comparison may be either equality or inequality.
2396 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2397 Pred = CmpInst::getInversePredicate(Pred);
2398
2399 // Ok, everything looks ok to change the condition into an SLT or SGE and
2400 // delete the max calculation.
2401 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2402 Cond->getOperand(0), NewRHS, "scmp");
2403
2404 // Delete the max calculation instructions.
2405 NewCond->setDebugLoc(Cond->getDebugLoc());
2406 Cond->replaceAllUsesWith(NewCond);
2407 CondUse->setUser(NewCond);
2408 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2409 Cond->eraseFromParent();
2410 Sel->eraseFromParent();
2411 if (Cmp->use_empty())
2412 Cmp->eraseFromParent();
2413 return NewCond;
2414}
2415
2416/// Change loop terminating condition to use the postinc iv when possible.
2417void
2418LSRInstance::OptimizeLoopTermCond() {
2419 SmallPtrSet<Instruction *, 4> PostIncs;
2420
2421 // We need a different set of heuristics for rotated and non-rotated loops.
2422 // If a loop is rotated then the latch is also the backedge, so inserting
2423 // post-inc expressions just before the latch is ideal. To reduce live ranges
2424 // it also makes sense to rewrite terminating conditions to use post-inc
2425 // expressions.
2426 //
2427 // If the loop is not rotated then the latch is not a backedge; the latch
2428 // check is done in the loop head. Adding post-inc expressions before the
2429 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2430 // in the loop body. In this case we do *not* want to use post-inc expressions
2431 // in the latch check, and we want to insert post-inc expressions before
2432 // the backedge.
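 //
 // For example (illustrative IR), in a rotated loop the exit test
 //   %cmp = icmp slt i32 %i, %n
 // can be re-expressed in terms of the incremented value %i.next (with the
 // comparison operand adjusted by LSR), so that only the post-incremented
 // value needs to stay live across the backedge.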
2433 BasicBlock *LatchBlock = L->getLoopLatch();
2434 SmallVector<BasicBlock*, 8> ExitingBlocks;
2435 L->getExitingBlocks(ExitingBlocks);
2436 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2437 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2438 IVIncInsertPos = LatchBlock->getTerminator();
2439 return;
2440 }
2441
2442 // Otherwise treat this as a rotated loop.
2443 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2444 // Get the terminating condition for the loop if possible. If we
2445 // can, we want to change it to use a post-incremented version of its
2446 // induction variable, to allow coalescing the live ranges for the IV into
2447 // one register value.
2448
2449 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2450 if (!TermBr)
2451 continue;
2452 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2453 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2454 continue;
2455
2456 // Search IVUsesByStride to find Cond's IVUse if there is one.
2457 IVStrideUse *CondUse = nullptr;
2458 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2459 if (!FindIVUserForCond(Cond, CondUse))
2460 continue;
2461
2462 // If the trip count is computed in terms of a max (due to ScalarEvolution
2463 // being unable to find a sufficient guard, for example), change the loop
2464 // comparison to use SLT or ULT instead of NE.
2465 // One consequence of doing this now is that it disrupts the count-down
2466 // optimization. That's not always a bad thing though, because in such
2467 // cases it may still be worthwhile to avoid a max.
2468 Cond = OptimizeMax(Cond, CondUse);
2469
2470 // If this exiting block dominates the latch block, it may also use
2471 // the post-inc value if it won't be shared with other uses.
2472 // Check for dominance.
2473 if (!DT.dominates(ExitingBlock, LatchBlock))
2474 continue;
2475
2476 // Conservatively avoid trying to use the post-inc value in non-latch
2477 // exits if there may be pre-inc users in intervening blocks.
2478 if (LatchBlock != ExitingBlock)
2479 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2480 // Test if the use is reachable from the exiting block. This dominator
2481 // query is a conservative approximation of reachability.
2482 if (&*UI != CondUse &&
2483 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2484 // Conservatively assume there may be reuse if the quotient of their
2485 // strides could be a legal scale.
2486 const SCEV *A = IU.getStride(*CondUse, L);
2487 const SCEV *B = IU.getStride(*UI, L);
2488 if (!A || !B) continue;
2489 if (SE.getTypeSizeInBits(A->getType()) !=
2490 SE.getTypeSizeInBits(B->getType())) {
2491 if (SE.getTypeSizeInBits(A->getType()) >
2492 SE.getTypeSizeInBits(B->getType()))
2493 B = SE.getSignExtendExpr(B, A->getType());
2494 else
2495 A = SE.getSignExtendExpr(A, B->getType());
2496 }
2497 if (const SCEVConstant *D =
2498 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2499 const ConstantInt *C = D->getValue();
2500 // Stride of one or negative one can have reuse with non-addresses.
2501 if (C->isOne() || C->isMinusOne())
2502 goto decline_post_inc;
2503 // Avoid weird situations.
2504 if (C->getValue().getSignificantBits() >= 64 ||
2505 C->getValue().isMinSignedValue())
2506 goto decline_post_inc;
2507 // Check for possible scaled-address reuse.
2508 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2509 MemAccessTy AccessTy = getAccessType(
2510 TTI, UI->getUser(), UI->getOperandValToReplace());
2511 int64_t Scale = C->getSExtValue();
2512 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2513 /*BaseOffset=*/0,
2514 /*HasBaseReg=*/true, Scale,
2515 AccessTy.AddrSpace))
2516 goto decline_post_inc;
2517 Scale = -Scale;
2518 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2519 /*BaseOffset=*/0,
2520 /*HasBaseReg=*/true, Scale,
2521 AccessTy.AddrSpace))
2522 goto decline_post_inc;
2523 }
2524 }
2525 }
2526
2527 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2528 << *Cond << '\n');
2529
2530 // It's possible for the setcc instruction to be anywhere in the loop, and
2531 // possible for it to have multiple users. If it is not immediately before
2532 // the exiting block branch, move it.
2533 if (Cond->getNextNonDebugInstruction() != TermBr) {
2534 if (Cond->hasOneUse()) {
2535 Cond->moveBefore(TermBr);
2536 } else {
2537 // Clone the terminating condition and insert into the loopend.
2538 ICmpInst *OldCond = Cond;
2539 Cond = cast<ICmpInst>(Cond->clone());
2540 Cond->setName(L->getHeader()->getName() + ".termcond");
2541 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2542
2543 // Clone the IVUse, as the old use still exists!
2544 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2545 TermBr->replaceUsesOfWith(OldCond, Cond);
2546 }
2547 }
2548
2549 // If we get to here, we know that we can transform the setcc instruction to
2550 // use the post-incremented version of the IV, allowing us to coalesce the
2551 // live ranges for the IV correctly.
2552 CondUse->transformToPostInc(L);
2553 Changed = true;
2554
2555 PostIncs.insert(Cond);
2556 decline_post_inc:;
2557 }
2558
2559 // Determine an insertion point for the loop induction variable increment. It
2560 // must dominate all the post-inc comparisons we just set up, and it must
2561 // dominate the loop latch edge.
2562 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2563 for (Instruction *Inst : PostIncs)
2564 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2565}
2566
2567/// Determine if the given use can accommodate a fixup at the given offset and
2568/// other details. If so, update the use and return true.
2569bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
2570 bool HasBaseReg, LSRUse::KindType Kind,
2571 MemAccessTy AccessTy) {
2572 int64_t NewMinOffset = LU.MinOffset;
2573 int64_t NewMaxOffset = LU.MaxOffset;
2574 MemAccessTy NewAccessTy = AccessTy;
2575
2576 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2577 // something conservative; however, this can pessimize in the case that one of
2578 // the uses has all of its uses outside the loop, for example.
2579 if (LU.Kind != Kind)
2580 return false;
2581
2582 // Check for a mismatched access type, and fall back conservatively as needed.
2583 // TODO: Be less conservative when the type is similar and can use the same
2584 // addressing modes.
2585 if (Kind == LSRUse::Address) {
2586 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2587 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2588 AccessTy.AddrSpace);
2589 }
2590 }
2591
2592 // Conservatively assume HasBaseReg is true for now.
2593 if (NewOffset < LU.MinOffset) {
2594 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2595 LU.MaxOffset - NewOffset, HasBaseReg))
2596 return false;
2597 NewMinOffset = NewOffset;
2598 } else if (NewOffset > LU.MaxOffset) {
2599 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2600 NewOffset - LU.MinOffset, HasBaseReg))
2601 return false;
2602 NewMaxOffset = NewOffset;
2603 }
2604
2605 // Update the use.
2606 LU.MinOffset = NewMinOffset;
2607 LU.MaxOffset = NewMaxOffset;
2608 LU.AccessTy = NewAccessTy;
2609 return true;
2610}
2611
2612/// Return an LSRUse index and an offset value for a fixup which needs the given
2613/// expression, with the given kind and optional access type. Either reuse an
2614/// existing use or create a new one, as needed.
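///
/// For example (hypothetical), two fixups whose expressions differ only by a
/// constant, say {%base,+,4} + 16 and {%base,+,4} + 24, can share a single
/// LSRUse keyed on {%base,+,4}, with the per-fixup offsets 16 and 24 folded
/// into that use's [MinOffset, MaxOffset] range.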
2615std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2616 LSRUse::KindType Kind,
2617 MemAccessTy AccessTy) {
2618 const SCEV *Copy = Expr;
2619 int64_t Offset = ExtractImmediate(Expr, SE);
2620
2621 // Basic uses can't accept any offset, for example.
2622 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2623 Offset, /*HasBaseReg=*/ true)) {
2624 Expr = Copy;
2625 Offset = 0;
2626 }
2627
2628 std::pair<UseMapTy::iterator, bool> P =
2629 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2630 if (!P.second) {
2631 // A use already existed with this base.
2632 size_t LUIdx = P.first->second;
2633 LSRUse &LU = Uses[LUIdx];
2634 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2635 // Reuse this use.
2636 return std::make_pair(LUIdx, Offset);
2637 }
2638
2639 // Create a new use.
2640 size_t LUIdx = Uses.size();
2641 P.first->second = LUIdx;
2642 Uses.push_back(LSRUse(Kind, AccessTy));
2643 LSRUse &LU = Uses[LUIdx];
2644
2645 LU.MinOffset = Offset;
2646 LU.MaxOffset = Offset;
2647 return std::make_pair(LUIdx, Offset);
2648}
2649
2650/// Delete the given use from the Uses list.
2651void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2652 if (&LU != &Uses.back())
2653 std::swap(LU, Uses.back());
2654 Uses.pop_back();
2655
2656 // Update RegUses.
2657 RegUses.swapAndDropUse(LUIdx, Uses.size());
2658}
2659
2660/// Look for a use distinct from OrigLU which has a formula that has the same
2661/// registers as the given formula.
2662LSRUse *
2663LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2664 const LSRUse &OrigLU) {
2665 // Search all uses for the formula. This could be more clever.
2666 for (LSRUse &LU : Uses) {
2667 // Check whether this use is close enough to OrigLU, to see whether it's
2668 // worthwhile looking through its formulae.
2669 // Ignore ICmpZero uses because they may contain formulae generated by
2670 // GenerateICmpZeroScales, in which case adding fixup offsets may
2671 // be invalid.
2672 if (&LU != &OrigLU &&
2673 LU.Kind != LSRUse::ICmpZero &&
2674 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2675 LU.WidestFixupType == OrigLU.WidestFixupType &&
2676 LU.HasFormulaWithSameRegs(OrigF)) {
2677 // Scan through this use's formulae.
2678 for (const Formula &F : LU.Formulae) {
2679 // Check to see if this formula has the same registers and symbols
2680 // as OrigF.
2681 if (F.BaseRegs == OrigF.BaseRegs &&
2682 F.ScaledReg == OrigF.ScaledReg &&
2683 F.BaseGV == OrigF.BaseGV &&
2684 F.Scale == OrigF.Scale &&
2685 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2686 if (F.BaseOffset == 0)
2687 return &LU;
2688 // This is the formula where all the registers and symbols matched;
2689 // there aren't going to be any others. Since we declined it, we
2690 // can skip the rest of the formulae and proceed to the next LSRUse.
2691 break;
2692 }
2693 }
2694 }
2695 }
2696
2697 // Nothing looked good.
2698 return nullptr;
2699}
2700
2701void LSRInstance::CollectInterestingTypesAndFactors() {
2702 SmallSetVector<const SCEV *, 4> Strides;
2703
2704 // Collect interesting types and strides.
2705 SmallVector<const SCEV *, 4> Worklist;
2706 for (const IVStrideUse &U : IU) {
2707 const SCEV *Expr = IU.getExpr(U);
2708 if (!Expr)
2709 continue;
2710
2711 // Collect interesting types.
2712 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2713
2714 // Add strides for mentioned loops.
2715 Worklist.push_back(Expr);
2716 do {
2717 const SCEV *S = Worklist.pop_back_val();
2718 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2719 if (AR->getLoop() == L)
2720 Strides.insert(AR->getStepRecurrence(SE));
2721 Worklist.push_back(AR->getStart());
2722 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2723 append_range(Worklist, Add->operands());
2724 }
2725 } while (!Worklist.empty());
2726 }
2727
2728 // Compute interesting factors from the set of interesting strides.
2729 for (SmallSetVector<const SCEV *, 4>::const_iterator
2730 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2731 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2732 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2733 const SCEV *OldStride = *I;
2734 const SCEV *NewStride = *NewStrideIter;
2735
2736 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2737 SE.getTypeSizeInBits(NewStride->getType())) {
2738 if (SE.getTypeSizeInBits(OldStride->getType()) >
2739 SE.getTypeSizeInBits(NewStride->getType()))
2740 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2741 else
2742 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2743 }
2744 if (const SCEVConstant *Factor =
2745 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2746 SE, true))) {
2747 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2748 Factors.insert(Factor->getAPInt().getSExtValue());
2749 } else if (const SCEVConstant *Factor =
2750 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2751 NewStride,
2752 SE, true))) {
2753 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2754 Factors.insert(Factor->getAPInt().getSExtValue());
2755 }
2756 }
2757
2758 // If all uses use the same type, don't bother looking for truncation-based
2759 // reuse.
2760 if (Types.size() == 1)
2761 Types.clear();
2762
2763 LLVM_DEBUG(print_factors_and_types(dbgs()));
2764}
2765
2766/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2767/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2768/// IVStrideUses, we could partially skip this.
2769static User::op_iterator
2770findIVOperand(User::op_iterator OI, User::op_iterator OE,
2771 Loop *L, ScalarEvolution &SE) {
2772 for(; OI != OE; ++OI) {
2773 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2774 if (!SE.isSCEVable(Oper->getType()))
2775 continue;
2776
2777 if (const SCEVAddRecExpr *AR =
2778 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2779 if (AR->getLoop() == L)
2780 break;
2781 }
2782 }
2783 }
2784 return OI;
2785}
2786
2787/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2788/// a convenient helper.
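/// For example, given "%narrow = trunc i64 %iv to i32", chain formation
/// reasons about %iv rather than %narrow.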
2789static Value *getWideOperand(Value *Oper) {
2790 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2791 return Trunc->getOperand(0);
2792 return Oper;
2793}
2794
2795/// Return an approximation of this SCEV expression's "base", or NULL for any
2796/// constant. Returning the expression itself is conservative. Returning a
2797/// deeper subexpression is more precise and valid as long as it isn't less
2798/// complex than another subexpression. For expressions involving multiple
2799/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2800/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2801/// IVInc==b-a.
2802///
2803/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2804/// SCEVUnknown, we simply return the rightmost SCEV operand.
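///
/// For example (illustrative), for an add expression such as
/// ((4 * %n) + %object) where %object is a pointer, the rightmost unscaled
/// operand %object is returned as the approximate base, while a plain
/// constant yields null.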
2805static const SCEV *getExprBase(const SCEV *S) {
2806 switch (S->getSCEVType()) {
2807 default: // including scUnknown.
2808 return S;
2809 case scConstant:
2810 case scVScale:
2811 return nullptr;
2812 case scTruncate:
2813 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2814 case scZeroExtend:
2815 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2816 case scSignExtend:
2817 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2818 case scAddExpr: {
2819 // Skip over scaled operands (scMulExpr) to follow add operands as long as
2820 // there's nothing more complex.
2821 // FIXME: not sure if we want to recognize negation.
2822 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2823 for (const SCEV *SubExpr : reverse(Add->operands())) {
2824 if (SubExpr->getSCEVType() == scAddExpr)
2825 return getExprBase(SubExpr);
2826
2827 if (SubExpr->getSCEVType() != scMulExpr)
2828 return SubExpr;
2829 }
2830 return S; // all operands are scaled, be conservative.
2831 }
2832 case scAddRecExpr:
2833 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2834 }
2835 llvm_unreachable("Unknown SCEV kind!");
2836}
2837
2838/// Return true if the chain increment is profitable to expand into a loop
2839/// invariant value, which may require its own register. A profitable chain
2840/// increment will be an offset relative to the same base. We allow such offsets
2841/// to potentially be used as the chain increment as long as it's not obviously
2842/// expensive to expand using real instructions.
2843bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2844 const SCEV *IncExpr,
2845 ScalarEvolution &SE) {
2846 // Aggressively form chains when -stress-ivchain.
2847 if (StressIVChain)
2848 return true;
2849
2850 // Do not replace a constant offset from IV head with a nonconstant IV
2851 // increment.
2852 if (!isa<SCEVConstant>(IncExpr)) {
2853 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2854 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2855 return false;
2856 }
2857
2858 SmallPtrSet<const SCEV*, 8> Processed;
2859 return !isHighCostExpansion(IncExpr, Processed, SE);
2860}
2861
2862/// Return true if the number of registers needed for the chain is estimated to
2863/// be less than the number required for the individual IV users. First prohibit
2864/// any IV users that keep the IV live across increments (the Users set should
2865/// be empty). Next count the number and type of increments in the chain.
2866///
2867/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2868/// effectively use postinc addressing modes. Only consider it profitable if the
2869/// increments can be computed in fewer registers when chained.
2870///
2871/// TODO: Consider an IVInc free if it's already used in another chain.
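///
/// For example (hypothetical counts): a chain whose non-constant increment is
/// materialized once and then reused by three later links is costed as
/// 1 (the chain's own register) + 1 (the materialized increment)
/// - 3 (reused increments) = -1, i.e. profitable.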
2872static bool isProfitableChain(IVChain &Chain,
2873 SmallPtrSetImpl<Instruction*> &Users,
2874 ScalarEvolution &SE,
2875 const TargetTransformInfo &TTI) {
2876 if (StressIVChain)
2877 return true;
2878
2879 if (!Chain.hasIncs())
2880 return false;
2881
2882 if (!Users.empty()) {
2883 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2884 for (Instruction *Inst
2885 : Users) { dbgs() << " " << *Inst << "\n"; });
2886 return false;
2887 }
2888 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2889
2890 // The chain itself may require a register, so initialize cost to 1.
2891 int cost = 1;
2892
2893 // A complete chain likely eliminates the need for keeping the original IV in
2894 // a register. LSR does not currently know how to form a complete chain unless
2895 // the header phi already exists.
2896 if (isa<PHINode>(Chain.tailUserInst())
2897 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2898 --cost;
2899 }
2900 const SCEV *LastIncExpr = nullptr;
2901 unsigned NumConstIncrements = 0;
2902 unsigned NumVarIncrements = 0;
2903 unsigned NumReusedIncrements = 0;
2904
2905 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
2906 return true;
2907
2908 for (const IVInc &Inc : Chain) {
2909 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
2910 return true;
2911 if (Inc.IncExpr->isZero())
2912 continue;
2913
2914 // Incrementing by zero or some constant is neutral. We assume constants can
2915 // be folded into an addressing mode or an add's immediate operand.
2916 if (isa<SCEVConstant>(Inc.IncExpr)) {
2917 ++NumConstIncrements;
2918 continue;
2919 }
2920
2921 if (Inc.IncExpr == LastIncExpr)
2922 ++NumReusedIncrements;
2923 else
2924 ++NumVarIncrements;
2925
2926 LastIncExpr = Inc.IncExpr;
2927 }
2928 // An IV chain with a single increment is handled by LSR's postinc
2929 // uses. However, a chain with multiple increments requires keeping the IV's
2930 // value live longer than it needs to be if chained.
2931 if (NumConstIncrements > 1)
2932 --cost;
2933
2934 // Materializing increment expressions in the preheader that didn't exist in
2935 // the original code may cost a register. For example, sign-extended array
2936 // indices can produce ridiculous increments like this:
2937 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2938 cost += NumVarIncrements;
2939
2940 // Reusing variable increments likely saves a register to hold the multiple of
2941 // the stride.
2942 cost -= NumReusedIncrements;
2943
2944 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2945 << "\n");
2946
2947 return cost < 0;
2948}
2949
2950/// Add this IV user to an existing chain or make it the head of a new chain.
2951void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2952 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2953 // When IVs are used as types of varying widths, they are generally converted
2954 // to a wider type with some uses remaining narrow under a (free) trunc.
2955 Value *const NextIV = getWideOperand(IVOper);
2956 const SCEV *const OperExpr = SE.getSCEV(NextIV);
2957 const SCEV *const OperExprBase = getExprBase(OperExpr);
2958
2959 // Visit all existing chains. Check if its IVOper can be computed as a
2960 // profitable loop invariant increment from the last link in the Chain.
2961 unsigned ChainIdx = 0, NChains = IVChainVec.size();
2962 const SCEV *LastIncExpr = nullptr;
2963 for (; ChainIdx < NChains; ++ChainIdx) {
2964 IVChain &Chain = IVChainVec[ChainIdx];
2965
2966 // Prune the solution space aggressively by checking that both IV operands
2967 // are expressions that operate on the same unscaled SCEVUnknown. This
2968 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2969 // first avoids creating extra SCEV expressions.
2970 if (!StressIVChain && Chain.ExprBase != OperExprBase)
2971 continue;
2972
2973 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2974 if (PrevIV->getType() != NextIV->getType())
2975 continue;
2976
2977 // A phi node terminates a chain.
2978 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2979 continue;
2980
2981 // The increment must be loop-invariant so it can be kept in a register.
2982 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2983 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2984 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
2985 continue;
2986
2987 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2988 LastIncExpr = IncExpr;
2989 break;
2990 }
2991 }
2992 // If we haven't found a chain, create a new one, unless we hit the max. Don't
2993 // bother for phi nodes, because they must be last in the chain.
2994 if (ChainIdx == NChains) {
2995 if (isa<PHINode>(UserInst))
2996 return;
2997 if (NChains >= MaxChains && !StressIVChain) {
2998 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
2999 return;
3000 }
3001 LastIncExpr = OperExpr;
3002 // IVUsers may have skipped over sign/zero extensions. We don't currently
3003 // attempt to form chains involving extensions unless they can be hoisted
3004 // into this loop's AddRec.
3005 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3006 return;
3007 ++NChains;
3008 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3009 OperExprBase));
3010 ChainUsersVec.resize(NChains);
3011 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3012 << ") IV=" << *LastIncExpr << "\n");
3013 } else {
3014 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3015 << ") IV+" << *LastIncExpr << "\n");
3016 // Add this IV user to the end of the chain.
3017 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3018 }
3019 IVChain &Chain = IVChainVec[ChainIdx];
3020
3021 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3022 // This chain's NearUsers become FarUsers.
3023 if (!LastIncExpr->isZero()) {
3024 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3025 NearUsers.end());
3026 NearUsers.clear();
3027 }
3028
3029 // All other uses of IVOperand become near uses of the chain.
3030 // We currently ignore intermediate values within SCEV expressions, assuming
3031 // they will eventually be used by the current chain, or can be computed
3032 // from one of the chain increments. To be more precise we could
3033 // transitively follow its user and only add leaf IV users to the set.
3034 for (User *U : IVOper->users()) {
3035 Instruction *OtherUse = dyn_cast<Instruction>(U);
3036 if (!OtherUse)
3037 continue;
3038 // Uses in the chain will no longer be uses if the chain is formed.
3039 // Include the head of the chain in this iteration (not Chain.begin()).
3040 IVChain::const_iterator IncIter = Chain.Incs.begin();
3041 IVChain::const_iterator IncEnd = Chain.Incs.end();
3042 for( ; IncIter != IncEnd; ++IncIter) {
3043 if (IncIter->UserInst == OtherUse)
3044 break;
3045 }
3046 if (IncIter != IncEnd)
3047 continue;
3048
3049 if (SE.isSCEVable(OtherUse->getType())
3050 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3051 && IU.isIVUserOrOperand(OtherUse)) {
3052 continue;
3053 }
3054 NearUsers.insert(OtherUse);
3055 }
3056
3057 // Since this user is part of the chain, it's no longer considered a use
3058 // of the chain.
3059 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3060}
3061
3062/// Populate the vector of Chains.
3063///
3064/// This decreases ILP at the architecture level. Targets with ample registers,
3065/// multiple memory ports, and no register renaming probably don't want
3066/// this. However, such targets should probably disable LSR altogether.
3067///
3068/// The job of LSR is to make a reasonable choice of induction variables across
3069/// the loop. Subsequent passes can easily "unchain" computation exposing more
3070/// ILP *within the loop* if the target wants it.
3071///
3072/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3073/// will not reorder memory operations, it will recognize this as a chain, but
3074/// will generate redundant IV increments. Ideally this would be corrected later
3075/// by a smart scheduler:
3076/// = A[i]
3077/// = A[i+x]
3078/// A[i] =
3079/// A[i+x] =
3080///
3081/// TODO: Walk the entire domtree within this loop, not just the path to the
3082/// loop latch. This will discover chains on side paths, but requires
3083/// maintaining multiple copies of the Chains state.
3084void LSRInstance::CollectChains() {
3085 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3086 SmallVector<ChainUsers, 8> ChainUsersVec;
3087
3088 SmallVector<BasicBlock *,8> LatchPath;
3089 BasicBlock *LoopHeader = L->getHeader();
3090 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3091 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3092 LatchPath.push_back(Rung->getBlock());
3093 }
3094 LatchPath.push_back(LoopHeader);
3095
3096 // Walk the instruction stream from the loop header to the loop latch.
3097 for (BasicBlock *BB : reverse(LatchPath)) {
3098 for (Instruction &I : *BB) {
3099 // Skip instructions that weren't seen by IVUsers analysis.
3100 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3101 continue;
3102
3103 // Ignore users that are part of a SCEV expression. This way we only
3104 // consider leaf IV Users. This effectively rediscovers a portion of
3105 // IVUsers analysis but in program order this time.
3106 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3107 continue;
3108
3109 // Remove this instruction from any NearUsers set it may be in.
3110 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3111 ChainIdx < NChains; ++ChainIdx) {
3112 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3113 }
3114 // Search for operands that can be chained.
3115 SmallPtrSet<Instruction*, 4> UniqueOperands;
3116 User::op_iterator IVOpEnd = I.op_end();
3117 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3118 while (IVOpIter != IVOpEnd) {
3119 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3120 if (UniqueOperands.insert(IVOpInst).second)
3121 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3122 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3123 }
3124 } // Continue walking down the instructions.
3125 } // Continue walking down the domtree.
3126 // Visit phi backedges to determine if the chain can generate the IV postinc.
3127 for (PHINode &PN : L->getHeader()->phis()) {
3128 if (!SE.isSCEVable(PN.getType()))
3129 continue;
3130
3131 Instruction *IncV =
3132 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3133 if (IncV)
3134 ChainInstruction(&PN, IncV, ChainUsersVec);
3135 }
3136 // Remove any unprofitable chains.
3137 unsigned ChainIdx = 0;
3138 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3139 UsersIdx < NChains; ++UsersIdx) {
3140 if (!isProfitableChain(IVChainVec[UsersIdx],
3141 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3142 continue;
3143 // Preserve the chain at UsersIdx.
3144 if (ChainIdx != UsersIdx)
3145 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3146 FinalizeChain(IVChainVec[ChainIdx]);
3147 ++ChainIdx;
3148 }
3149 IVChainVec.resize(ChainIdx);
3150}
3151
3152void LSRInstance::FinalizeChain(IVChain &Chain) {
3153 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3154 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3155
3156 for (const IVInc &Inc : Chain) {
3157 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3158 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3159 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3160 IVIncSet.insert(UseI);
3161 }
3162}
3163
3164/// Return true if the IVInc can be folded into an addressing mode.
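/// For example (illustrative), a constant increment of 8 between two memory
/// accesses off the same base can typically be folded as an immediate offset
/// in the second access's addressing mode, when the target reports such an
/// offset as legal.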
3165static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3166 Value *Operand, const TargetTransformInfo &TTI) {
3167 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3168 if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
3169 return false;
3170
3171 if (IncConst->getAPInt().getSignificantBits() > 64)
3172 return false;
3173
3174 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3175 int64_t IncOffset = IncConst->getValue()->getSExtValue();
3176 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3177 IncOffset, /*HasBaseReg=*/false))
3178 return false;
3179
3180 return true;
3181}
3182
3183/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3184/// user's operand from the previous IV user's operand.
3185void LSRInstance::GenerateIVChain(const IVChain &Chain,
3186 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3187 // Find the new IVOperand for the head of the chain. It may have been replaced
3188 // by LSR.
3189 const IVInc &Head = Chain.Incs[0];
3190 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3191 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3192 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3193 IVOpEnd, L, SE);
3194 Value *IVSrc = nullptr;
3195 while (IVOpIter != IVOpEnd) {
3196 IVSrc = getWideOperand(*IVOpIter);
3197
3198 // If this operand computes the expression that the chain needs, we may use
3199 // it. (Check this after setting IVSrc which is used below.)
3200 //
3201 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3202 // narrow for the chain, so we can no longer use it. We do allow using a
3203 // wider phi, assuming the LSR checked for free truncation. In that case we
3204 // should already have a truncate on this operand such that
3205 // getSCEV(IVSrc) == IncExpr.
3206 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3207 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3208 break;
3209 }
3210 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3211 }
3212 if (IVOpIter == IVOpEnd) {
3213 // Gracefully give up on this chain.
3214 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3215 return;
3216 }
3217 assert(IVSrc && "Failed to find IV chain source");
3218
3219 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3220 Type *IVTy = IVSrc->getType();
3221 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3222 const SCEV *LeftOverExpr = nullptr;
3223 for (const IVInc &Inc : Chain) {
3224 Instruction *InsertPt = Inc.UserInst;
3225 if (isa<PHINode>(InsertPt))
3226 InsertPt = L->getLoopLatch()->getTerminator();
3227
3228 // IVOper will replace the current IV User's operand. IVSrc is the IV
3229 // value currently held in a register.
3230 Value *IVOper = IVSrc;
3231 if (!Inc.IncExpr->isZero()) {
3232 // IncExpr was the result of subtraction of two narrow values, so must
3233 // be signed.
3234 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3235 LeftOverExpr = LeftOverExpr ?
3236 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3237 }
3238 if (LeftOverExpr && !LeftOverExpr->isZero()) {
3239 // Expand the IV increment.
3240 Rewriter.clearPostInc();
3241 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3242 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3243 SE.getUnknown(IncV));
3244 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3245
3246 // If an IV increment can't be folded, use it as the next IV value.
3247 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3248 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3249 IVSrc = IVOper;
3250 LeftOverExpr = nullptr;
3251 }
3252 }
3253 Type *OperTy = Inc.IVOperand->getType();
3254 if (IVTy != OperTy) {
3255 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3256 "cannot extend a chained IV");
3257 IRBuilder<> Builder(InsertPt);
3258 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3259 }
3260 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3261 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3262 DeadInsts.emplace_back(OperandIsInstr);
3263 }
3264 // If LSR created a new, wider phi, we may also replace its postinc. We only
3265 // do this if we also found a wide value for the head of the chain.
3266 if (isa<PHINode>(Chain.tailUserInst())) {
3267 for (PHINode &Phi : L->getHeader()->phis()) {
3268 if (Phi.getType() != IVSrc->getType())
3269 continue;
3270 Instruction *PostIncV = dyn_cast<Instruction>(
3271 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3272 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3273 continue;
3274 Value *IVOper = IVSrc;
3275 Type *PostIncTy = PostIncV->getType();
3276 if (IVTy != PostIncTy) {
3277 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3278 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3279 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3280 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3281 }
3282 Phi.replaceUsesOfWith(PostIncV, IVOper);
3283 DeadInsts.emplace_back(PostIncV);
3284 }
3285 }
3286}
3287
3288void LSRInstance::CollectFixupsAndInitialFormulae() {
3289 BranchInst *ExitBranch = nullptr;
3290 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3291
3292 // For calculating baseline cost
3293 SmallPtrSet<const SCEV *, 16> Regs;
3294 DenseSet<const SCEV *> VisitedRegs;
3295 DenseSet<size_t> VisitedLSRUse;
3296
3297 for (const IVStrideUse &U : IU) {
3298 Instruction *UserInst = U.getUser();
3299 // Skip IV users that are part of profitable IV Chains.
3300 User::op_iterator UseI =
3301 find(UserInst->operands(), U.getOperandValToReplace());
3302 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3303 if (IVIncSet.count(UseI)) {
3304 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3305 continue;
3306 }
3307
3308 LSRUse::KindType Kind = LSRUse::Basic;
3309 MemAccessTy AccessTy;
3310 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3311 Kind = LSRUse::Address;
3312 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3313 }
3314
3315 const SCEV *S = IU.getExpr(U);
3316 if (!S)
3317 continue;
3318 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3319
3320 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3321 // (N - i == 0), and this allows (N - i) to be the expression that we work
3322 // with rather than just N or i, so we can consider the register
3323 // requirements for both N and i at the same time. Limiting this code to
3324 // equality icmps is not a problem because all interesting loops use
3325 // equality icmps, thanks to IndVarSimplify.
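// For example, for "icmp eq %i, %n" where S = {0,+,1}<%L> and %n is loop
// invariant, the use becomes an ICmpZero use on (%n - {0,+,1}<%L>), i.e. the
// single register {%n,+,-1}<%L> counting down to zero.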
3326 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3327 // If CI can be saved on some targets, e.g. replaced by a hardware loop
3328 // on PowerPC, there is no need to generate initial formulae for it.
3329 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3330 continue;
3331 if (CI->isEquality()) {
3332 // Swap the operands if needed to put the OperandValToReplace on the
3333 // left, for consistency.
3334 Value *NV = CI->getOperand(1);
3335 if (NV == U.getOperandValToReplace()) {
3336 CI->setOperand(1, CI->getOperand(0));
3337 CI->setOperand(0, NV);
3338 NV = CI->getOperand(1);
3339 Changed = true;
3340 }
3341
3342 // x == y --> x - y == 0
3343 const SCEV *N = SE.getSCEV(NV);
3344 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3345 (!NV->getType()->isPointerTy() ||
3346 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3347 // S is normalized, so normalize N before folding it into S
3348 // to keep the result normalized.
3349 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3350 if (!N)
3351 continue;
3352 Kind = LSRUse::ICmpZero;
3353 S = SE.getMinusSCEV(N, S);
3354 } else if (L->isLoopInvariant(NV) &&
3355 (!isa<Instruction>(NV) ||
3356 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3357 !NV->getType()->isPointerTy()) {
3358 // If we can't generally expand the expression (e.g. it contains
3359 // a divide), but it is already at a loop invariant point before the
3360 // loop, wrap it in an unknown (to prevent the expander from trying
3361 // to re-expand in a potentially unsafe way.) The restriction to
3362 // integer types is required because the unknown hides the base, and
3363 // SCEV can't compute the difference of two unknown pointers.
3364 N = SE.getUnknown(NV);
3365 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3366 if (!N)
3367 continue;
3368 Kind = LSRUse::ICmpZero;
3369 S = SE.getMinusSCEV(N, S);
3370 assert(!isa<SCEVCouldNotCompute>(S));
3371 }
3372
3373 // -1 and the negations of all interesting strides (except the negation
3374 // of -1) are now also interesting.
3375 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3376 if (Factors[i] != -1)
3377 Factors.insert(-(uint64_t)Factors[i]);
3378 Factors.insert(-1);
3379 }
3380 }
3381
3382 // Get or create an LSRUse.
3383 std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
3384 size_t LUIdx = P.first;
3385 int64_t Offset = P.second;
3386 LSRUse &LU = Uses[LUIdx];
3387
3388 // Record the fixup.
3389 LSRFixup &LF = LU.getNewFixup();
3390 LF.UserInst = UserInst;
3391 LF.OperandValToReplace = U.getOperandValToReplace();
3392 LF.PostIncLoops = TmpPostIncLoops;
3393 LF.Offset = Offset;
3394 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3395
3396 // Create SCEV as Formula for calculating baseline cost
3397 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3398 Formula F;
3399 F.initialMatch(S, L, SE);
3400 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3401 VisitedLSRUse.insert(LUIdx);
3402 }
3403
3404 if (!LU.WidestFixupType ||
3405 SE.getTypeSizeInBits(LU.WidestFixupType) <
3406 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3407 LU.WidestFixupType = LF.OperandValToReplace->getType();
3408
3409 // If this is the first use of this LSRUse, give it a formula.
3410 if (LU.Formulae.empty()) {
3411 InsertInitialFormula(S, LU, LUIdx);
3412 CountRegisters(LU.Formulae.back(), LUIdx);
3413 }
3414 }
3415
3416 LLVM_DEBUG(print_fixups(dbgs()));
3417}
3418
3419/// Insert a formula for the given expression into the given use, separating out
3420/// loop-variant portions from loop-invariant and loop-computable portions.
3421void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3422 size_t LUIdx) {
3423 // Mark uses whose expressions cannot be expanded.
3424 if (!Rewriter.isSafeToExpand(S))
3425 LU.RigidFormula = true;
3426
3427 Formula F;
3428 F.initialMatch(S, L, SE);
3429 bool Inserted = InsertFormula(LU, LUIdx, F);
3430 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3431}
3432
3433/// Insert a simple single-register formula for the given expression into the
3434/// given use.
3435void
3436LSRInstance::InsertSupplementalFormula(const SCEV *S,
3437 LSRUse &LU, size_t LUIdx) {
3438 Formula F;
3439 F.BaseRegs.push_back(S);
3440 F.HasBaseReg = true;
3441 bool Inserted = InsertFormula(LU, LUIdx, F);
3442 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3443}
3444
3445/// Note which registers are used by the given formula, updating RegUses.
3446void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3447 if (F.ScaledReg)
3448 RegUses.countRegister(F.ScaledReg, LUIdx);
3449 for (const SCEV *BaseReg : F.BaseRegs)
3450 RegUses.countRegister(BaseReg, LUIdx);
3451}
3452
3453/// If the given formula has not yet been inserted, add it to the list, and
3454/// return true. Return false otherwise.
3455bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3456 // Do not insert formula that we will not be able to expand.
3457 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3458 "Formula is illegal");
3459
3460 if (!LU.InsertFormula(F, *L))
3461 return false;
3462
3463 CountRegisters(F, LUIdx);
3464 return true;
3465}
3466
3467/// Check for other uses of loop-invariant values which we're tracking. These
3468/// other uses will pin these values in registers, making them less profitable
3469/// for elimination.
3470/// TODO: This currently misses non-constant addrec step registers.
3471/// TODO: Should this give more weight to users inside the loop?
3472void
3473LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3474 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3475 SmallPtrSet<const SCEV *, 32> Visited;
3476
3477 // Don't collect outside uses if we are favoring postinc - the instructions in
3478 // the loop are more important than the ones outside of it.
3479 if (AMK == TTI::AMK_PostIndexed)
3480 return;
3481
3482 while (!Worklist.empty()) {
3483 const SCEV *S = Worklist.pop_back_val();
3484
3485 // Don't process the same SCEV twice
3486 if (!Visited.insert(S).second)
3487 continue;
3488
3489 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3490 append_range(Worklist, N->operands());
3491 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3492 Worklist.push_back(C->getOperand());
3493 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3494 Worklist.push_back(D->getLHS());
3495 Worklist.push_back(D->getRHS());
3496 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3497 const Value *V = US->getValue();
3498 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3499 // Look for instructions defined outside the loop.
3500 if (L->contains(Inst)) continue;
3501 } else if (isa<Constant>(V))
3502 // Constants can be re-materialized.
3503 continue;
3504 for (const Use &U : V->uses()) {
3505 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3506 // Ignore non-instructions.
3507 if (!UserInst)
3508 continue;
3509 // Don't bother if the instruction is an EHPad.
3510 if (UserInst->isEHPad())
3511 continue;
3512 // Ignore instructions in other functions (as can happen with
3513 // Constants).
3514 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3515 continue;
3516 // Ignore instructions not dominated by the loop.
3517 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3518 UserInst->getParent() :
3519 cast<PHINode>(UserInst)->getIncomingBlock(
3520 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3521 if (!DT.dominates(L->getHeader(), UseBB))
3522 continue;
3523 // Don't bother if the instruction is in a BB which ends in an EHPad.
3524 if (UseBB->getTerminator()->isEHPad())
3525 continue;
3526
3527 // Ignore cases in which the currently-examined value could come from
3528 // a basic block terminated with an EHPad. This checks all incoming
3529 // blocks of the phi node since it is possible that the same incoming
3530 // value comes from multiple basic blocks, only some of which may end
3531 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3532 // pass would try to insert instructions into an EHPad, hitting an
3533 // assertion.
3534 if (isa<PHINode>(UserInst)) {
3535 const auto *PhiNode = cast<PHINode>(UserInst);
3536 bool HasIncompatibleEHPTerminatedBlock = false;
3537 llvm::Value *ExpectedValue = U;
3538 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3539 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3540 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3541 HasIncompatibleEHPTerminatedBlock = true;
3542 break;
3543 }
3544 }
3545 }
3546 if (HasIncompatibleEHPTerminatedBlock) {
3547 continue;
3548 }
3549 }
3550
3551 // Don't bother rewriting PHIs in catchswitch blocks.
3552 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3553 continue;
3554 // Ignore uses which are part of other SCEV expressions, to avoid
3555 // analyzing them multiple times.
3556 if (SE.isSCEVable(UserInst->getType())) {
3557 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3558 // If the user is a no-op, look through to its uses.
3559 if (!isa<SCEVUnknown>(UserS))
3560 continue;
3561 if (UserS == US) {
3562 Worklist.push_back(
3563 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3564 continue;
3565 }
3566 }
3567 // Ignore icmp instructions which are already being analyzed.
3568 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3569 unsigned OtherIdx = !U.getOperandNo();
3570 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3571 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3572 continue;
3573 }
3574
3575 std::pair<size_t, int64_t> P = getUse(
3576 S, LSRUse::Basic, MemAccessTy());
3577 size_t LUIdx = P.first;
3578 int64_t Offset = P.second;
3579 LSRUse &LU = Uses[LUIdx];
3580 LSRFixup &LF = LU.getNewFixup();
3581 LF.UserInst = const_cast<Instruction *>(UserInst);
3582 LF.OperandValToReplace = U;
3583 LF.Offset = Offset;
3584 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3585 if (!LU.WidestFixupType ||
3586 SE.getTypeSizeInBits(LU.WidestFixupType) <
3587 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3588 LU.WidestFixupType = LF.OperandValToReplace->getType();
3589 InsertSupplementalFormula(US, LU, LUIdx);
3590 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3591 break;
3592 }
3593 }
3594 }
3595}
3596
3597/// Split S into subexpressions which can be pulled out into separate
3598/// registers. If C is non-null, multiply each subexpression by C.
3599///
3600/// Return remainder expression after factoring the subexpressions captured by
3601/// Ops. If Ops is complete, return NULL.
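/// For example (with C null), {(A + B),+,S}<L> has its start split out: A and
/// B are pushed onto Ops, and the rebased recurrence {0,+,S}<L> is returned as
/// the remainder.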
3602static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3603 SmallVectorImpl<const SCEV *> &Ops,
3604 const Loop *L,
3605 ScalarEvolution &SE,
3606 unsigned Depth = 0) {
3607 // Arbitrarily cap recursion to protect compile time.
3608 if (Depth >= 3)
3609 return S;
3610
3611 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3612 // Break out add operands.
3613 for (const SCEV *S : Add->operands()) {
3614 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3615 if (Remainder)
3616 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3617 }
3618 return nullptr;
3619 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3620 // Split a non-zero base out of an addrec.
3621 if (AR->getStart()->isZero() || !AR->isAffine())
3622 return S;
3623
3624 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3625 C, Ops, L, SE, Depth+1);
3626 // Split the non-zero AddRec unless it is part of a nested recurrence that
3627 // does not pertain to this loop.
3628 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3629 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3630 Remainder = nullptr;
3631 }
3632 if (Remainder != AR->getStart()) {
3633 if (!Remainder)
3634 Remainder = SE.getConstant(AR->getType(), 0);
3635 return SE.getAddRecExpr(Remainder,
3636 AR->getStepRecurrence(SE),
3637 AR->getLoop(),
3638 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3639 SCEV::FlagAnyWrap);
3640 }
3641 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3642 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3643 if (Mul->getNumOperands() != 2)
3644 return S;
3645 if (const SCEVConstant *Op0 =
3646 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3647 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3648 const SCEV *Remainder =
3649 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3650 if (Remainder)
3651 Ops.push_back(SE.getMulExpr(C, Remainder));
3652 return nullptr;
3653 }
3654 }
3655 return S;
3656}
3657
3658/// Return true if the SCEV represents a value that may end up as a
3659/// post-increment operation.
3660 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3661 LSRUse &LU, const SCEV *S, const Loop *L,
3662 ScalarEvolution &SE) {
3663 if (LU.Kind != LSRUse::Address ||
3664 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3665 return false;
3666 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3667 if (!AR)
3668 return false;
3669 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3670 if (!isa<SCEVConstant>(LoopStep))
3671 return false;
3672 // Check if a post-indexed load/store can be used.
3673 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3674 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3675 const SCEV *LoopStart = AR->getStart();
3676 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3677 return true;
3678 }
3679 return false;
3680}
3681
3682/// Helper function for LSRInstance::GenerateReassociations.
3683void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3684 const Formula &Base,
3685 unsigned Depth, size_t Idx,
3686 bool IsScaledReg) {
3687 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3688 // Don't generate reassociations for the base register of a value that
3689 // may generate a post-increment operator. The reason is that the
3690 // reassociations cause extra base+register formulae to be created,
3691 // and possibly chosen, but the post-increment is more efficient.
3692 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3693 return;
3694 SmallVector<const SCEV *, 8> AddOps;
3695 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3696 if (Remainder)
3697 AddOps.push_back(Remainder);
3698
3699 if (AddOps.size() == 1)
3700 return;
3701
3702 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3703 JE = AddOps.end();
3704 J != JE; ++J) {
3705 // Loop-variant "unknown" values are uninteresting; we won't be able to
3706 // do anything meaningful with them.
3707 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3708 continue;
3709
3710 // Don't pull a constant into a register if the constant could be folded
3711 // into an immediate field.
3712 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3713 LU.AccessTy, *J, Base.getNumRegs() > 1))
3714 continue;
3715
3716 // Collect all operands except *J.
3717 SmallVector<const SCEV *, 8> InnerAddOps(
3718 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3719 InnerAddOps.append(std::next(J),
3720 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3721
3722 // Don't leave just a constant behind in a register if the constant could
3723 // be folded into an immediate field.
3724 if (InnerAddOps.size() == 1 &&
3725 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3726 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3727 continue;
3728
3729 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3730 if (InnerSum->isZero())
3731 continue;
3732 Formula F = Base;
3733
3734 // Add the remaining pieces of the add back into the new formula.
3735 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3736 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3737 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3738 InnerSumSC->getValue()->getZExtValue())) {
3739 F.UnfoldedOffset =
3740 (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
3741 if (IsScaledReg)
3742 F.ScaledReg = nullptr;
3743 else
3744 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3745 } else if (IsScaledReg)
3746 F.ScaledReg = InnerSum;
3747 else
3748 F.BaseRegs[Idx] = InnerSum;
3749
3750 // Add J as its own register, or an unfolded immediate.
3751 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3752 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3753 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3754 SC->getValue()->getZExtValue()))
3755 F.UnfoldedOffset =
3756 (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
3757 else
3758 F.BaseRegs.push_back(*J);
3759 // We may have changed the number of registers in the base regs; adjust the
3760 // formula accordingly.
3761 F.canonicalize(*L);
3762
3763 if (InsertFormula(LU, LUIdx, F))
3764 // If that formula hadn't been seen before, recurse to find more like
3765 // it.
3766 // Also add Log16(AddOps.size()) (the same as Log2_32(AddOps.size()) >> 2)
3767 // to the recursion depth, because Depth alone is not enough to bound
3768 // compile time. This means that every time AddOps.size() exceeds 16^x we
3769 // add x to Depth.
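// For example, with AddOps.size() == 20, Log2_32(20) >> 2 == 1, so the
// recursive call below uses Depth + 2 rather than Depth + 1.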
3770 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
3771 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
3772 }
3773}
3774
3775/// Split out subexpressions from adds and the bases of addrecs.
3776void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3777 Formula Base, unsigned Depth) {
3778 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
3779 // Arbitrarily cap recursion to protect compile time.
3780 if (Depth >= 3)
3781 return;
3782
3783 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3784 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
3785
3786 if (Base.Scale == 1)
3787 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3788 /* Idx */ -1, /* IsScaledReg */ true);
3789}
3790
3791/// Generate a formula consisting of all of the loop-dominating registers added
3792/// into a single register.
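/// For example, in a formula reg1 + reg2 + {0,+,1}<L> where reg1 and reg2 are
/// loop-invariant and dominate the loop header, the two invariant values can
/// be combined so the formula becomes (reg1 + reg2) + {0,+,1}<L>, sharing a
/// single register.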
3793void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3794 Formula Base) {
3795 // This method is only interesting on a plurality of registers.
3796 if (Base.BaseRegs.size() + (Base.Scale == 1) +
3797 (Base.UnfoldedOffset != 0) <= 1)
3798 return;
3799
3800 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3801 // processing the formula.
3802 Base.unscale();
3803 SmallVector<const SCEV *, 4> Ops;
3804 Formula NewBase = Base;
3805 NewBase.BaseRegs.clear();
3806 Type *CombinedIntegerType = nullptr;
3807 for (const SCEV *BaseReg : Base.BaseRegs) {
3808 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3809 !SE.hasComputableLoopEvolution(BaseReg, L)) {
3810 if (!CombinedIntegerType)
3811 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
3812 Ops.push_back(BaseReg);
3813 }
3814 else
3815 NewBase.BaseRegs.push_back(BaseReg);
3816 }
3817
3818 // If no register is relevant, we're done.
3819 if (Ops.size() == 0)
3820 return;
3821
3822 // Utility function for generating the required variants of the combined
3823 // registers.
3824 auto GenerateFormula = [&](const SCEV *Sum) {
3825 Formula F = NewBase;
3826
3827 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3828 // opportunity to fold something. For now, just ignore such cases
3829 // rather than proceed with zero in a register.
3830 if (Sum->isZero())
3831 return;
3832
3833 F.BaseRegs.push_back(Sum);
3834 F.canonicalize(*L);
3835 (void)InsertFormula(LU, LUIdx, F);
3836 };
3837
3838 // If we collected at least two registers, generate a formula combining them.
3839 if (Ops.size() > 1) {
3840 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
3841 GenerateFormula(SE.getAddExpr(OpsCopy));
3842 }
3843
3844 // If we have an unfolded offset, generate a formula combining it with the
3845 // registers collected.
3846 if (NewBase.UnfoldedOffset) {
3847 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
3848 Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
3849 true));
3850 NewBase.UnfoldedOffset = 0;
3851 GenerateFormula(SE.getAddExpr(Ops));
3852 }
3853}
3854
3855/// Helper function for LSRInstance::GenerateSymbolicOffsets.
3856void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
3857 const Formula &Base, size_t Idx,
3858 bool IsScaledReg) {
3859 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3860 GlobalValue *GV = ExtractSymbol(G, SE);
3861 if (G->isZero() || !GV)
3862 return;
3863 Formula F = Base;
3864 F.BaseGV = GV;
3865 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3866 return;
3867 if (IsScaledReg)
3868 F.ScaledReg = G;
3869 else
3870 F.BaseRegs[Idx] = G;
3871 (void)InsertFormula(LU, LUIdx, F);
3872}
3873
3874/// Generate reuse formulae using symbolic offsets.
3875void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3876 Formula Base) {
3877 // We can't add a symbolic offset if the address already contains one.
3878 if (Base.BaseGV) return;
3879
3880 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3881 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
3882 if (Base.Scale == 1)
3883 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
3884 /* IsScaledReg */ true);
3885}
3886
3887/// Helper function for LSRInstance::GenerateConstantOffsets.
3888void LSRInstance::GenerateConstantOffsetsImpl(
3889 LSRUse &LU, unsigned LUIdx, const Formula &Base,
3890 const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
3891
3892 auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
3893 Formula F = Base;
3894 F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
3895
3896 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
3897 // Add the offset to the base register.
3898 const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
3899 // If it cancelled out, drop the base register, otherwise update it.
3900 if (NewG->isZero()) {
3901 if (IsScaledReg) {
3902 F.Scale = 0;
3903 F.ScaledReg = nullptr;
3904 } else
3905 F.deleteBaseReg(F.BaseRegs[Idx]);
3906 F.canonicalize(*L);
3907 } else if (IsScaledReg)
3908 F.ScaledReg = NewG;
3909 else
3910 F.BaseRegs[Idx] = NewG;
3911
3912 (void)InsertFormula(LU, LUIdx, F);
3913 }
3914 };
3915
3916 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3917
3918 // With constant offsets and constant steps, we can generate pre-inc
3919 // accesses by having the offset equal the step. So, for access #0 with a
3920 // step of 8, we generate a G - 8 base which would require the first access
3921 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
3922 // for itself and hopefully becomes the base for other accesses. This means
3923 // that a single pre-indexed access can be generated to become the new
3924 // base pointer for each iteration of the loop, resulting in no extra add/sub
3925 // instructions for pointer updating.
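// For example, with worklist offsets {0, 8} and a step of 8, this block also
// generates formulae for offsets {-8, 0} in addition to the plain worklist
// offsets handled below.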
3926 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
3927 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
3928 if (auto *StepRec =
3929 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
3930 const APInt &StepInt = StepRec->getAPInt();
3931 int64_t Step = StepInt.isNegative() ?
3932 StepInt.getSExtValue() : StepInt.getZExtValue();
3933
3934 for (int64_t Offset : Worklist) {
3935 Offset -= Step;
3936 GenerateOffset(G, Offset);
3937 }
3938 }
3939 }
3940 }
3941 for (int64_t Offset : Worklist)
3942 GenerateOffset(G, Offset);
3943
3944 int64_t Imm = ExtractImmediate(G, SE);
3945 if (G->isZero() || Imm == 0)
3946 return;
3947 Formula F = Base;
3948 F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3949 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3950 return;
3951 if (IsScaledReg) {
3952 F.ScaledReg = G;
3953 } else {
3954 F.BaseRegs[Idx] = G;
3955 // We may generate a non-canonical Formula if G is a recurrent expression
3956 // register related to the current loop while F.ScaledReg is not.
3957 F.canonicalize(*L);
3958 }
3959 (void)InsertFormula(LU, LUIdx, F);
3960}
3961
3962 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
3963void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3964 Formula Base) {
3965 // TODO: For now, just add the min and max offset, because it usually isn't
3966 // worthwhile looking at everything in between.
3967 SmallVector<int64_t, 2> Worklist;
3968 Worklist.push_back(LU.MinOffset);
3969 if (LU.MaxOffset != LU.MinOffset)
3970 Worklist.push_back(LU.MaxOffset);
3971
3972 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3973 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
3974 if (Base.Scale == 1)
3975 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
3976 /* IsScaledReg */ true);
3977}
3978
3979/// For ICmpZero, check to see if we can scale up the comparison. For example, x
3980/// == y -> x*c == y*c.
3981void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3982 Formula Base) {
3983 if (LU.Kind != LSRUse::ICmpZero) return;
3984
3985 // Determine the integer type for the base formula.
3986 Type *IntTy = Base.getType();
3987 if (!IntTy) return;
3988 if (SE.getTypeSizeInBits(IntTy) > 64) return;
3989
3990 // Don't do this if there is more than one offset.
3991 if (LU.MinOffset != LU.MaxOffset) return;
3992
3993 // Check that the transformation is valid. It is illegal to multiply a pointer.
3994 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
3995 return;
3996 for (const SCEV *BaseReg : Base.BaseRegs)
3997 if (BaseReg->getType()->isPointerTy())
3998 return;
3999 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4000
4001 // Check each interesting stride.
4002 for (int64_t Factor : Factors) {
4003 // Check that Factor can be represented by IntTy
4004 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4005 continue;
4006 // Check that the multiplication doesn't overflow.
4007 if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
4008 continue;
4009 int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
4010 assert(Factor != 0 && "Zero factor not expected!");
4011 if (NewBaseOffset / Factor != Base.BaseOffset)
4012 continue;
4013 // If the offset will be truncated at this use, check that it is in bounds.
4014 if (!IntTy->isPointerTy() &&
4015 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
4016 continue;
4017
4018 // Check that multiplying with the use offset doesn't overflow.
4019 int64_t Offset = LU.MinOffset;
4020 if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
4021 continue;
4022 Offset = (uint64_t)Offset * Factor;
4023 if (Offset / Factor != LU.MinOffset)
4024 continue;
4025 // If the offset will be truncated at this use, check that it is in bounds.
4026 if (!IntTy->isPointerTy() &&
4027 !ConstantInt::isValueValidForType(IntTy, Offset))
4028 continue;
4029
4030 Formula F = Base;
4031 F.BaseOffset = NewBaseOffset;
4032
4033 // Check that this scale is legal.
4034 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4035 continue;
4036
4037 // Compensate for the use having MinOffset built into it.
4038 F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
4039
4040 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4041
4042 // Check that multiplying with each base register doesn't overflow.
4043 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4044 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4045 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4046 goto next;
4047 }
4048
4049 // Check that multiplying with the scaled register doesn't overflow.
4050 if (F.ScaledReg) {
4051 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4052 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4053 continue;
4054 }
4055
4056 // Check that multiplying with the unfolded offset doesn't overflow.
4057 if (F.UnfoldedOffset != 0) {
4058 if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
4059 Factor == -1)
4060 continue;
4061 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
4062 if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
4063 continue;
4064 // If the offset will be truncated, check that it is in bounds.
4065 if (!IntTy->isPointerTy() &&
4066 !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
4067 continue;
4068 }
4069
4070 // If we make it here and it's legal, add it.
4071 (void)InsertFormula(LU, LUIdx, F);
4072 next:;
4073 }
4074}
4075
4076/// Generate stride factor reuse formulae by making use of scaled-offset address
4077/// modes, for example.
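/// For example, a base register {0,+,4}<L> with an interesting factor of 4 can
/// be rewritten as the scaled register 4 * {0,+,1}<L>, which a target with a
/// base + 4*index addressing mode can fold into the memory access.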
4078void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4079 // Determine the integer type for the base formula.
4080 Type *IntTy = Base.getType();
4081 if (!IntTy) return;
4082
4083 // If this Formula already has a scaled register, we can't add another one.
4084 // Try to unscale the formula to generate a better scale.
4085 if (Base.Scale != 0 && !Base.unscale())
4086 return;
4087
4088 assert(Base.Scale == 0 && "unscale did not do its job!");
4089
4090 // Check each interesting stride.
4091 for (int64_t Factor : Factors) {
4092 Base.Scale = Factor;
4093 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4094 // Check whether this scale is going to be legal.
4095 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4096 Base)) {
4097 // As a special case, handle out-of-loop Basic users by treating them as Special uses.
4098 // TODO: Reconsider this special case.
4099 if (LU.Kind == LSRUse::Basic &&
4100 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4101 LU.AccessTy, Base) &&
4102 LU.AllFixupsOutsideLoop)
4103 LU.Kind = LSRUse::Special;
4104 else
4105 continue;
4106 }
4107 // For an ICmpZero, negating a solitary base register won't lead to
4108 // new solutions.
4109 if (LU.Kind == LSRUse::ICmpZero &&
4110 !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
4111 continue;
4112 // For each addrec base reg, if its loop is current loop, apply the scale.
4113 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4114 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4115 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4116 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4117 if (FactorS->isZero())
4118 continue;
4119 // Divide out the factor, ignoring high bits, since we'll be
4120 // scaling the value back up in the end.
4121 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4122 if (!Quotient->isZero()) {
4123 // TODO: This could be optimized to avoid all the copying.
4124 Formula F = Base;
4125 F.ScaledReg = Quotient;
4126 F.deleteBaseReg(F.BaseRegs[i]);
4127 // The canonical representation of 1*reg is reg, which is already in
4128 // Base. In that case, do not try to insert the formula, it will be
4129 // rejected anyway.
4130 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4131 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4132 continue;
4133 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4134 // a non-canonical Formula whose ScaledReg's loop is not L.
4135 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4136 F.canonicalize(*L);
4137 (void)InsertFormula(LU, LUIdx, F);
4138 }
4139 }
4140 }
4141 }
4142}
4143
4144/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4145/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4146/// perform the extension/truncate and normalize again, as the normalized form
4147/// can result in folds that are not valid in the post-inc use contexts. The
4148/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4149static const SCEV *
4150 getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4151 const SCEV *Expr, Type *ToTy,
4152 ScalarEvolution &SE) {
4153 const SCEV *Result = nullptr;
4154 for (auto &L : Loops) {
4155 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4156 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4157 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4158 if (!New || (Result && New != Result))
4159 return nullptr;
4160 Result = New;
4161 }
4162
4163 assert(Result && "failed to create expression");
4164 return Result;
4165}
4166
4167/// Generate reuse formulae from different IV types.
4168void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4169 // Don't bother truncating symbolic values.
4170 if (Base.BaseGV) return;
4171
4172 // Determine the integer type for the base formula.
4173 Type *DstTy = Base.getType();
4174 if (!DstTy) return;
4175 if (DstTy->isPointerTy())
4176 return;
4177
4178 // It is invalid to extend a pointer type so exit early if ScaledReg or
4179 // any of the BaseRegs are pointers.
4180 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4181 return;
4182 if (any_of(Base.BaseRegs,
4183 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4184 return;
4185
4186 SmallVector<PostIncLoopSet> Loops;
4187 for (auto &LF : LU.Fixups)
4188 Loops.push_back(LF.PostIncLoops);
4189
4190 for (Type *SrcTy : Types) {
4191 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4192 Formula F = Base;
4193
4194 // Sometimes SCEV is able to prove zero during ext transform. It may
4195 // happen if SCEV did not do all possible transforms while creating the
4196 // initial node (maybe due to depth limitations), but it can do them while
4197 // taking ext.
4198 if (F.ScaledReg) {
4199 const SCEV *NewScaledReg =
4200 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4201 if (!NewScaledReg || NewScaledReg->isZero())
4202 continue;
4203 F.ScaledReg = NewScaledReg;
4204 }
4205 bool HasZeroBaseReg = false;
4206 for (const SCEV *&BaseReg : F.BaseRegs) {
4207 const SCEV *NewBaseReg =
4208 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4209 if (!NewBaseReg || NewBaseReg->isZero()) {
4210 HasZeroBaseReg = true;
4211 break;
4212 }
4213 BaseReg = NewBaseReg;
4214 }
4215 if (HasZeroBaseReg)
4216 continue;
4217
4218 // TODO: This assumes we've done basic processing on all uses and
4219 // have an idea what the register usage is.
4220 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4221 continue;
4222
4223 F.canonicalize(*L);
4224 (void)InsertFormula(LU, LUIdx, F);
4225 }
4226 }
4227}
4228
4229namespace {
4230
4231/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4232/// modifications so that the search phase doesn't have to worry about the data
4233/// structures moving underneath it.
4234struct WorkItem {
4235 size_t LUIdx;
4236 int64_t Imm;
4237 const SCEV *OrigReg;
4238
4239 WorkItem(size_t LI, int64_t I, const SCEV *R)
4240 : LUIdx(LI), Imm(I), OrigReg(R) {}
4241
4242 void print(raw_ostream &OS) const;
4243 void dump() const;
4244};
4245
4246} // end anonymous namespace
4247
4248#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4249void WorkItem::print(raw_ostream &OS) const {
4250 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4251 << " , add offset " << Imm;
4252}
4253
4254LLVM_DUMP_METHOD void WorkItem::dump() const {
4255 print(errs()); errs() << '\n';
4256}
4257#endif
4258
4259/// Look for registers which are a constant distance apart and try to form reuse
4260/// opportunities between them.
4261void LSRInstance::GenerateCrossUseConstantOffsets() {
4262 // Group the registers by their value without any added constant offset.
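// For example, the registers A, A+4 and A+8 all group under the base A with
// immediates {0, 4, 8}, making them candidates for sharing a single base
// register.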
4263 using ImmMapTy = std::map<int64_t, const SCEV *>;
4264
4265 DenseMap<const SCEV *, ImmMapTy> Map;
4266 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4267 SmallVector<const SCEV *, 8> Sequence;
4268 for (const SCEV *Use : RegUses) {
4269 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4270 int64_t Imm = ExtractImmediate(Reg, SE);
4271 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4272 if (Pair.second)
4273 Sequence.push_back(Reg);
4274 Pair.first->second.insert(std::make_pair(Imm, Use));
4275 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4276 }
4277
4278 // Now examine each set of registers with the same base value. Build up
4279 // a list of work to do and do the work in a separate step so that we're
4280 // not adding formulae and register counts while we're searching.
4281 SmallVector<WorkItem, 32> WorkItems;
4282 SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
4283 for (const SCEV *Reg : Sequence) {
4284 const ImmMapTy &Imms = Map.find(Reg)->second;
4285
4286 // It's not worthwhile looking for reuse if there's only one offset.
4287 if (Imms.size() == 1)
4288 continue;
4289
4290 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4291 for (const auto &Entry
4292 : Imms) dbgs()
4293 << ' ' << Entry.first;
4294 dbgs() << '\n');
4295
4296 // Examine each offset.
4297 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4298 J != JE; ++J) {
4299 const SCEV *OrigReg = J->second;
4300
4301 int64_t JImm = J->first;
4302 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4303
4304 if (!isa<SCEVConstant>(OrigReg) &&
4305 UsedByIndicesMap[Reg].count() == 1) {
4306 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4307 << '\n');
4308 continue;
4309 }
4310
4311 // Conservatively examine offsets between this orig reg and a few selected
4312 // other orig regs.
4313 int64_t First = Imms.begin()->first;
4314 int64_t Last = std::prev(Imms.end())->first;
4315 // Compute (First + Last) / 2 without overflow using the fact that
4316 // First + Last = 2 * (First & Last) + (First ^ Last).
4317 int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
4318 // If the result is negative and First is odd and Last even (or vice versa),
4319 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4320 Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
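// For example, First = -7, Last = 2: (First & Last) = 0 and (First ^ Last)
// = -5, so Avg starts at -3; the correction adds ((-5) & 1) = 1, giving -2,
// which matches (-7 + 2) / 2 rounded towards zero.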
4321 ImmMapTy::const_iterator OtherImms[] = {
4322 Imms.begin(), std::prev(Imms.end()),
4323 Imms.lower_bound(Avg)};
4324 for (const auto &M : OtherImms) {
4325 if (M == J || M == JE) continue;
4326
4327 // Compute the difference between the two.
4328 int64_t Imm = (uint64_t)JImm - M->first;
4329 for (unsigned LUIdx : UsedByIndices.set_bits())
4330 // Make a memo of this use, offset, and register tuple.
4331 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4332 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4333 }
4334 }
4335 }
4336
4337 Map.clear();
4338 Sequence.clear();
4339 UsedByIndicesMap.clear();
4340 UniqueItems.clear();
4341
4342 // Now iterate through the worklist and add new formulae.
4343 for (const WorkItem &WI : WorkItems) {
4344 size_t LUIdx = WI.LUIdx;
4345 LSRUse &LU = Uses[LUIdx];
4346 int64_t Imm = WI.Imm;
4347 const SCEV *OrigReg = WI.OrigReg;
4348
4349 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4350 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
4351 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4352
4353 // TODO: Use a more targeted data structure.
4354 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4355 Formula F = LU.Formulae[L];
4356 // FIXME: The code for the scaled and unscaled registers looks
4357 // very similar but slightly different. Investigate if they
4358 // could be merged. That way, we would not have to unscale the
4359 // Formula.
4360 F.unscale();
4361 // Use the immediate in the scaled register.
4362 if (F.ScaledReg == OrigReg) {
4363 int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
4364 // Don't create 50 + reg(-50).
4365 if (F.referencesReg(SE.getSCEV(
4366 ConstantInt::get(IntTy, -(uint64_t)Offset))))
4367 continue;
4368 Formula NewF = F;
4369 NewF.BaseOffset = Offset;
4370 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4371 NewF))
4372 continue;
4373 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4374
4375 // If the new scale is a constant in a register, and adding the constant
4376 // value to the immediate would produce a value closer to zero than the
4377 // immediate itself, then the formula isn't worthwhile.
4378 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
4379 if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
4380 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4381 .ule(std::abs(NewF.BaseOffset)))
4382 continue;
4383
4384 // OK, looks good.
4385 NewF.canonicalize(*this->L);
4386 (void)InsertFormula(LU, LUIdx, NewF);
4387 } else {
4388 // Use the immediate in a base register.
4389 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4390 const SCEV *BaseReg = F.BaseRegs[N];
4391 if (BaseReg != OrigReg)
4392 continue;
4393 Formula NewF = F;
4394 NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
4395 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4396 LU.Kind, LU.AccessTy, NewF)) {
4397 if (AMK == TTI::AMK_PostIndexed &&
4398 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4399 continue;
4400 if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
4401 continue;
4402 NewF = F;
4403 NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
4404 }
4405 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4406
4407 // If the new formula has a constant in a register, and adding the
4408 // constant value to the immediate would produce a value closer to
4409 // zero than the immediate itself, then the formula isn't worthwhile.
4410 for (const SCEV *NewReg : NewF.BaseRegs)
4411 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
4412 if ((C->getAPInt() + NewF.BaseOffset)
4413 .abs()
4414 .slt(std::abs(NewF.BaseOffset)) &&
4415 (C->getAPInt() + NewF.BaseOffset).countr_zero() >=
4416 (unsigned)llvm::countr_zero<uint64_t>(NewF.BaseOffset))
4417 goto skip_formula;
4418
4419 // Ok, looks good.
4420 NewF.canonicalize(*this->L);
4421 (void)InsertFormula(LU, LUIdx, NewF);
4422 break;
4423 skip_formula:;
4424 }
4425 }
4426 }
4427 }
4428}
4429
4430/// Generate formulae for each use.
4431void
4432LSRInstance::GenerateAllReuseFormulae() {
4433 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4434 // queries are more precise.
4435 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4436 LSRUse &LU = Uses[LUIdx];
4437 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4438 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4439 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4440 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4441 }
4442 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4443 LSRUse &LU = Uses[LUIdx];
4444 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4445 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4446 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4447 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4448 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4449 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4450 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4451 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4452 }
4453 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4454 LSRUse &LU = Uses[LUIdx];
4455 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4456 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4457 }
4458
4459 GenerateCrossUseConstantOffsets();
4460
4461 LLVM_DEBUG(dbgs() << "\n"
4462 "After generating reuse formulae:\n";
4463 print_uses(dbgs()));
4464}
4465
4466/// If there are multiple formulae with the same set of registers used
4467/// by other uses, pick the best one and delete the others.
4468void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4469 DenseSet<const SCEV *> VisitedRegs;
4470 SmallPtrSet<const SCEV *, 16> Regs;
4471 DenseSet<const SCEV *> LoserRegs;
4472#ifndef NDEBUG
4473 bool ChangedFormulae = false;
4474#endif
4475
4476 // Collect the best formula for each unique set of shared registers. This
4477 // is reset for each use.
4478 using BestFormulaeTy =
4479 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4480
4481 BestFormulaeTy BestFormulae;
4482
4483 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4484 LSRUse &LU = Uses[LUIdx];
4485 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4486 dbgs() << '\n');
4487
4488 bool Any = false;
4489 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4490 FIdx != NumForms; ++FIdx) {
4491 Formula &F = LU.Formulae[FIdx];
4492
4493 // Some formulas are instant losers. For example, they may depend on
4494 // nonexistent AddRecs from other loops. These need to be filtered
4495 // immediately, otherwise heuristics could choose them over others leading
4496 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4497 // avoids the need to recompute this information across formulae using the
4498 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4499 // the corresponding bad register from the Regs set.
4500 Cost CostF(L, SE, TTI, AMK);
4501 Regs.clear();
4502 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4503 if (CostF.isLoser()) {
4504 // During initial formula generation, undesirable formulae are generated
4505 // by uses within other loops that have some non-trivial address mode or
4506 // use the postinc form of the IV. LSR needs to provide these formulae
4507 // as the basis of rediscovering the desired formula that uses an AddRec
4508 // corresponding to the existing phi. Once all formulae have been
4509 // generated, these initial losers may be pruned.
4510 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4511 dbgs() << "\n");
4512 }
4513 else {
4514 SmallVector<const SCEV *, 4> Key;
4515 for (const SCEV *Reg : F.BaseRegs) {
4516 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4517 Key.push_back(Reg);
4518 }
4519 if (F.ScaledReg &&
4520 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4521 Key.push_back(F.ScaledReg);
4522 // An unstable sort by host order is OK, because this is only used for
4523 // uniquifying.
4524 llvm::sort(Key);
4525
4526 std::pair<BestFormulaeTy::const_iterator, bool> P =
4527 BestFormulae.insert(std::make_pair(Key, FIdx));
4528 if (P.second)
4529 continue;
4530
4531 Formula &Best = LU.Formulae[P.first->second];
4532
4533 Cost CostBest(L, SE, TTI, AMK);
4534 Regs.clear();
4535 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4536 if (CostF.isLess(CostBest))
4537 std::swap(F, Best);
4538 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4539 dbgs() << "\n"
4540 " in favor of formula ";
4541 Best.print(dbgs()); dbgs() << '\n');
4542 }
4543#ifndef NDEBUG
4544 ChangedFormulae = true;
4545#endif
4546 LU.DeleteFormula(F);
4547 --FIdx;
4548 --NumForms;
4549 Any = true;
4550 }
4551
4552 // Now that we've filtered out some formulae, recompute the Regs set.
4553 if (Any)
4554 LU.RecomputeRegs(LUIdx, RegUses);
4555
4556 // Reset this to prepare for the next use.
4557 BestFormulae.clear();
4558 }
4559
4560 LLVM_DEBUG(if (ChangedFormulae) {
4561 dbgs() << "\n"
4562 "After filtering out undesirable candidates:\n";
4563 print_uses(dbgs());
4564 });
4565}
4566
4567/// Estimate the worst-case number of solutions the solver might have to
4568 /// consider. It almost never considers this many solutions because it prunes the
4569/// search space, but the pruning isn't always sufficient.
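/// For example, three uses with 3, 4 and 5 formulae give a worst case of
/// 3 * 4 * 5 = 60 candidate solutions; the product stops being multiplied out
/// once it reaches ComplexityLimit.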
4570size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4571 size_t Power = 1;
4572 for (const LSRUse &LU : Uses) {
4573 size_t FSize = LU.Formulae.size();
4574 if (FSize >= ComplexityLimit) {
4575 Power = ComplexityLimit;
4576 break;
4577 }
4578 Power *= FSize;
4579 if (Power >= ComplexityLimit)
4580 break;
4581 }
4582 return Power;
4583}
4584
4585/// When one formula uses a superset of the registers of another formula, it
4586/// won't help reduce register pressure (though it may not necessarily hurt
4587/// register pressure); remove it to simplify the system.
4588void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4589 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4590 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4591
4592 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4593 "which use a superset of registers used by other "
4594 "formulae.\n");
4595
4596 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4597 LSRUse &LU = Uses[LUIdx];
4598 bool Any = false;
4599 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4600 Formula &F = LU.Formulae[i];
4601 // Look for a formula with a constant or GV in a register. If the use
4602 // also has a formula with that same value in an immediate field,
4603 // delete the one that uses a register.
4604 for (SmallVectorImpl<const SCEV *>::const_iterator
4605 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4606 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4607 Formula NewF = F;
4608 //FIXME: Formulas should store bitwidth to do wrapping properly.
4609 // See PR41034.
4610 NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
4611 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4612 (I - F.BaseRegs.begin()));
4613 if (LU.HasFormulaWithSameRegs(NewF)) {
4614 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4615 dbgs() << '\n');
4616 LU.DeleteFormula(F);
4617 --i;
4618 --e;
4619 Any = true;
4620 break;
4621 }
4622 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4623 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4624 if (!F.BaseGV) {
4625 Formula NewF = F;
4626 NewF.BaseGV = GV;
4627 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4628 (I - F.BaseRegs.begin()));
4629 if (LU.HasFormulaWithSameRegs(NewF)) {
4630 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4631 dbgs() << '\n');
4632 LU.DeleteFormula(F);
4633 --i;
4634 --e;
4635 Any = true;
4636 break;
4637 }
4638 }
4639 }
4640 }
4641 }
4642 if (Any)
4643 LU.RecomputeRegs(LUIdx, RegUses);
4644 }
4645
4646 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4647 }
4648}
4649
4650/// When there are many registers for expressions like A, A+1, A+2, etc.,
4651/// allocate a single register for them.
4652void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4653 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4654 return;
4655
4656 LLVM_DEBUG(
4657 dbgs() << "The search space is too complex.\n"
4658 "Narrowing the search space by assuming that uses separated "
4659 "by a constant offset will use the same registers.\n");
4660
4661 // This is especially useful for unrolled loops.
4662
4663 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4664 LSRUse &LU = Uses[LUIdx];
4665 for (const Formula &F : LU.Formulae) {
4666 if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
4667 continue;
4668
4669 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4670 if (!LUThatHas)
4671 continue;
4672
4673 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4674 LU.Kind, LU.AccessTy))
4675 continue;
4676
4677 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4678
4679 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4680
4681 // Transfer the fixups of LU to LUThatHas.
4682 for (LSRFixup &Fixup : LU.Fixups) {
4683 Fixup.Offset += F.BaseOffset;
4684 LUThatHas->pushFixup(Fixup);
4685 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4686 }
4687
4688 // Delete formulae from the new use which are no longer legal.
4689 bool Any = false;
4690 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4691 Formula &F = LUThatHas->Formulae[i];
4692 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4693 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4694 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4695 LUThatHas->DeleteFormula(F);
4696 --i;
4697 --e;
4698 Any = true;
4699 }
4700 }
4701
4702 if (Any)
4703 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4704
4705 // Delete the old use.
4706 DeleteUse(LU, LUIdx);
4707 --LUIdx;
4708 --NumUses;
4709 break;
4710 }
4711 }
4712
4713 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4714}
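
// A sketch of the effect (illustrative offsets only): after unrolling by
// four, LSR may see separate address uses of {%a,+,16}, {%a+4,+,16},
// {%a+8,+,16} and {%a+12,+,16}. The {%a+4,+,16} use can have a formula
// reg({%a,+,16}) with BaseOffset 4; if another use already works on
// reg({%a,+,16}), the fixups of the offset use are transferred to it
// (with their offsets adjusted by 4) and the redundant use is deleted,
// so a single register can serve all four accesses.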
4715
4716/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4717/// we've done more filtering, as it may be able to find more formulae to
4718/// eliminate.
4719void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4720 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4721 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4722
4723 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4724 "undesirable dedicated registers.\n");
4725
4726 FilterOutUndesirableDedicatedRegisters();
4727
4728 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4729 }
4730}
4731
4732/// If an LSRUse has multiple formulae with the same ScaledReg and Scale, pick
4733/// the best one and delete the others.
4734/// This narrowing heuristic keeps as many formulae with distinct
4735/// (Scale, ScaledReg) pairs as possible while narrowing the search space.
4736/// The benefit is that a better solution is more likely to be found in a
4737/// formulae set with more Scale and ScaledReg variations than in one where
4738/// they are all the same. The picking-winner-reg heuristic tends to keep
4739/// formulae with the same Scale and ScaledReg and filter out the others,
4740/// and we want to avoid that if possible.
4741void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
4742 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4743 return;
4744
4745 LLVM_DEBUG(
4746 dbgs() << "The search space is too complex.\n"
4747 "Narrowing the search space by choosing the best Formula "
4748 "from the Formulae with the same Scale and ScaledReg.\n");
4749
4750 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
4751 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
4752
4753 BestFormulaeTy BestFormulae;
4754#ifndef NDEBUG
4755 bool ChangedFormulae = false;
4756#endif
4757 DenseSet<const SCEV *> VisitedRegs;
4758 SmallPtrSet<const SCEV *, 16> Regs;
4759
4760 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4761 LSRUse &LU = Uses[LUIdx];
4762 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4763 dbgs() << '\n');
4764
4765 // Return true if Formula FA is better than Formula FB.
4766 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
4767 // First try to choose the formula with fewer new registers.
4768 // For a register used by the current formula, the more widely the
4769 // register is shared among LSRUses, the less it adds to the formula's
4770 // register-number counter.
4771 size_t FARegNum = 0;
4772 for (const SCEV *Reg : FA.BaseRegs) {
4773 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4774 FARegNum += (NumUses - UsedByIndices.count() + 1);
4775 }
4776 size_t FBRegNum = 0;
4777 for (const SCEV *Reg : FB.BaseRegs) {
4778 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
4779 FBRegNum += (NumUses - UsedByIndices.count() + 1);
4780 }
4781 if (FARegNum != FBRegNum)
4782 return FARegNum < FBRegNum;
4783
4784 // If the new register numbers are the same, choose the Formula with
4785 // less Cost.
4786 Cost CostFA(L, SE, TTI, AMK);
4787 Cost CostFB(L, SE, TTI, AMK);
4788 Regs.clear();
4789 CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
4790 Regs.clear();
4791 CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
4792 return CostFA.isLess(CostFB);
4793 };
4794
4795 bool Any = false;
4796 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
4797 ++FIdx) {
4798 Formula &F = LU.Formulae[FIdx];
4799 if (!F.ScaledReg)
4800 continue;
4801 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
4802 if (P.second)
4803 continue;
4804
4805 Formula &Best = LU.Formulae[P.first->second];
4806 if (IsBetterThan(F, Best))
4807 std::swap(F, Best);
4808 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4809 dbgs() << "\n"
4810 " in favor of formula ";
4811 Best.print(dbgs()); dbgs() << '\n');
4812#ifndef NDEBUG
4813 ChangedFormulae = true;
4814#endif
4815 LU.DeleteFormula(F);
4816 --FIdx;
4817 --NumForms;
4818 Any = true;
4819 }
4820 if (Any)
4821 LU.RecomputeRegs(LUIdx, RegUses);
4822
4823 // Reset this to prepare for the next use.
4824 BestFormulae.clear();
4825 }
4826
4827 LLVM_DEBUG(if (ChangedFormulae) {
4828 dbgs() << "\n"
4829 "After filtering out undesirable candidates:\n";
4830 print_uses(dbgs());
4831 });
4832}
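
// Worked example of the register count used by IsBetterThan above
// (illustrative numbers): with NumUses == 4, a base register shared by
// three uses contributes (4 - 3 + 1) == 2 to FARegNum, while a register
// private to this use contributes (4 - 1 + 1) == 4, so among formulae with
// the same ScaledReg and Scale the ones built from widely shared registers
// win, and the cost comparison only breaks the remaining ties.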
4833
4834/// If we are over the complexity limit, filter the post-inc preferring
4835/// address uses down to only the formulae with the fewest registers.
4836void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
4837 if (AMK != TTI::AMK_PostIndexed)
4838 return;
4839 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4840 return;
4841
4842 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
4843 "Narrowing the search space by choosing the lowest "
4844 "register Formula for PostInc Uses.\n");
4845
4846 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4847 LSRUse &LU = Uses[LUIdx];
4848
4849 if (LU.Kind != LSRUse::Address)
4850 continue;
4851 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
4852 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
4853 continue;
4854
4855 size_t MinRegs = std::numeric_limits<size_t>::max();
4856 for (const Formula &F : LU.Formulae)
4857 MinRegs = std::min(F.getNumRegs(), MinRegs);
4858
4859 bool Any = false;
4860 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
4861 ++FIdx) {
4862 Formula &F = LU.Formulae[FIdx];
4863 if (F.getNumRegs() > MinRegs) {
4864 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4865 dbgs() << "\n");
4866 LU.DeleteFormula(F);
4867 --FIdx;
4868 --NumForms;
4869 Any = true;
4870 }
4871 }
4872 if (Any)
4873 LU.RecomputeRegs(LUIdx, RegUses);
4874
4875 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4876 break;
4877 }
4878
4879 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4880}
4881
4882/// This function deletes formulae with a high expected number of registers.
4883/// Assuming we don't know which formula will be selected (the clearly
4884/// inefficient ones have already been deleted), compute for each register
4885/// the probability of it not being selected.
4886/// For example,
4887/// Use1:
4888/// reg(a) + reg({0,+,1})
4889/// reg(a) + reg({-1,+,1}) + 1
4890/// reg({a,+,1})
4891/// Use2:
4892/// reg(b) + reg({0,+,1})
4893/// reg(b) + reg({-1,+,1}) + 1
4894/// reg({b,+,1})
4895/// Use3:
4896/// reg(c) + reg(b) + reg({0,+,1})
4897/// reg(c) + reg({b,+,1})
4898///
4899/// Probability of not selecting
4900/// Use1 Use2 Use3
4901/// reg(a) (1/3) * 1 * 1
4902/// reg(b) 1 * (1/3) * (1/2)
4903/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
4904/// reg({-1,+,1}) (2/3) * (2/3) * 1
4905/// reg({a,+,1}) (2/3) * 1 * 1
4906/// reg({b,+,1}) 1 * (2/3) * (2/3)
4907/// reg(c) 1 * 1 * 0
4908///
4909/// Now compute the expected number of registers for each formula.
4910/// Note that for each use we exclude the probability of not selecting for
4911/// that use itself. For example, for Use1 the probability for reg(a) is just
4912/// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
4913/// Use1:
4914/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
4915/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
4916/// reg({a,+,1}) 1
4917/// Use2:
4918/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
4919/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
4920/// reg({b,+,1}) 2/3
4921/// Use3:
4922/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
4923/// reg(c) + reg({b,+,1}) 1 + 2/3
4924void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
4925 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4926 return;
4927 // Ok, we have too many formulae on our hands to conveniently handle.
4928 // Use a rough heuristic to thin out the list.
4929
4930 // Set of Regs which are certain to be used in the final solution, i.e.
4931 // used in every formula of a solution (in the example above this is reg(c)).
4932 // We can skip them in calculations.
4933 SmallPtrSet<const SCEV *, 4> UniqRegs;
4934 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4935
4936 // Map each register to the probability of it not being selected.
4937 DenseMap <const SCEV *, float> RegNumMap;
4938 for (const SCEV *Reg : RegUses) {
4939 if (UniqRegs.count(Reg))
4940 continue;
4941 float PNotSel = 1;
4942 for (const LSRUse &LU : Uses) {
4943 if (!LU.Regs.count(Reg))
4944 continue;
4945 float P = LU.getNotSelectedProbability(Reg);
4946 if (P != 0.0)
4947 PNotSel *= P;
4948 else
4949 UniqRegs.insert(Reg);
4950 }
4951 RegNumMap.insert(std::make_pair(Reg, PNotSel));
4952 }
4953
4954 LLVM_DEBUG(
4955 dbgs() << "Narrowing the search space by deleting costly formulas\n");
4956
4957 // Delete formulas where registers number expectation is high.
4958 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4959 LSRUse &LU = Uses[LUIdx];
4960 // If nothing to delete - continue.
4961 if (LU.Formulae.size() < 2)
4962 continue;
4963 // This is a temporary solution to test performance. Float should be
4964 // replaced with a rounding-independent type (based on integers) to avoid
4965 // different results for different compiler builds.
4966 float FMinRegNum = LU.Formulae[0].getNumRegs();
4967 float FMinARegNum = LU.Formulae[0].getNumRegs();
4968 size_t MinIdx = 0;
4969 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4970 Formula &F = LU.Formulae[i];
4971 float FRegNum = 0;
4972 float FARegNum = 0;
4973 for (const SCEV *BaseReg : F.BaseRegs) {
4974 if (UniqRegs.count(BaseReg))
4975 continue;
4976 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
4977 if (isa<SCEVAddRecExpr>(BaseReg))
4978 FARegNum +=
4979 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
4980 }
4981 if (const SCEV *ScaledReg = F.ScaledReg) {
4982 if (!UniqRegs.count(ScaledReg)) {
4983 FRegNum +=
4984 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
4985 if (isa<SCEVAddRecExpr>(ScaledReg))
4986 FARegNum +=
4987 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
4988 }
4989 }
4990 if (FMinRegNum > FRegNum ||
4991 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
4992 FMinRegNum = FRegNum;
4993 FMinARegNum = FARegNum;
4994 MinIdx = i;
4995 }
4996 }
4997 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
4998 dbgs() << " with min reg num " << FMinRegNum << '\n');
4999 if (MinIdx != 0)
5000 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5001 while (LU.Formulae.size() != 1) {
5002 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5003 dbgs() << '\n');
5004 LU.Formulae.pop_back();
5005 }
5006 LU.RecomputeRegs(LUIdx, RegUses);
5007 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5008 Formula &F = LU.Formulae[0];
5009 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5010 // When we choose the formula, the regs become unique.
5011 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5012 if (F.ScaledReg)
5013 UniqRegs.insert(F.ScaledReg);
5014 }
5015 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5016}
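
// Worked detail for the table in the comment above: the entry
//   reg(a) + reg({0,+,1})   1 + 1/3
// for Use1 comes from reg(a) contributing (1/3 * 1 * 1) / (1/3) == 1 and
// reg({0,+,1}) contributing (2/3 * 2/3 * 1/2) / (2/3) == 1/3, i.e. each
// register's product stored in RegNumMap divided by this use's own
// not-selected probability, exactly as computed in the loop above.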
5017
5018// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5019// whether the addressing offset +C would be legal where the negative offset -C
5020// is not.
5021static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5022 ScalarEvolution &SE, const SCEV *Best,
5023 const SCEV *Reg,
5024 MemAccessTy AccessType) {
5025 if (Best->getType() != Reg->getType() ||
5026 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5027 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5028 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5029 return false;
5030 const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Best, Reg));
5031 if (!Diff)
5032 return false;
5033
5034 return TTI.isLegalAddressingMode(
5035 AccessType.MemTy, /*BaseGV=*/nullptr,
5036 /*BaseOffset=*/Diff->getAPInt().getSExtValue(),
5037 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5038 !TTI.isLegalAddressingMode(
5039 AccessType.MemTy, /*BaseGV=*/nullptr,
5040 /*BaseOffset=*/-Diff->getAPInt().getSExtValue(),
5041 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5042}
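
// For example (illustrative): with Best == {(16 + %p),+,4} and
// Reg == {%p,+,4}, the difference is the constant 16. If the target can
// fold a +16 addressing offset but not -16, then Reg is the simpler base:
// picking it leaves the other use expressible as base + 16, whereas
// picking Best would force a base - 16 computation.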
5043
5044/// Pick a register which seems likely to be profitable, and then in any use
5045/// which has any reference to that register, delete all formulae which do not
5046/// reference that register.
5047void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5048 // With all other options exhausted, loop until the system is simple
5049 // enough to handle.
5050 SmallPtrSet<const SCEV *, 4> Taken;
5051 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5052 // Ok, we have too many formulae on our hands to conveniently handle.
5053 // Use a rough heuristic to thin out the list.
5054 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5055
5056 // Pick the register which is used by the most LSRUses, which is likely
5057 // to be a good reuse register candidate.
5058 const SCEV *Best = nullptr;
5059 unsigned BestNum = 0;
5060 for (const SCEV *Reg : RegUses) {
5061 if (Taken.count(Reg))
5062 continue;
5063 if (!Best) {
5064 Best = Reg;
5065 BestNum = RegUses.getUsedByIndices(Reg).count();
5066 } else {
5067 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5068 if (Count > BestNum) {
5069 Best = Reg;
5070 BestNum = Count;
5071 }
5072
5073 // If the scores are the same, but the Reg is simpler for the target
5074 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5075 // handle +C but not -C), opt for the simpler formula.
5076 if (Count == BestNum) {
5077 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5078 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5079 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5080 Uses[LUIdx].AccessTy)) {
5081 Best = Reg;
5082 BestNum = Count;
5083 }
5084 }
5085 }
5086 }
5087 assert(Best && "Failed to find best LSRUse candidate");
5088
5089 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5090 << " will yield profitable reuse.\n");
5091 Taken.insert(Best);
5092
5093 // In any use with formulae which reference this register, delete formulae
5094 // which don't reference it.
5095 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5096 LSRUse &LU = Uses[LUIdx];
5097 if (!LU.Regs.count(Best)) continue;
5098
5099 bool Any = false;
5100 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5101 Formula &F = LU.Formulae[i];
5102 if (!F.referencesReg(Best)) {
5103 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5104 LU.DeleteFormula(F);
5105 --e;
5106 --i;
5107 Any = true;
5108 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5109 continue;
5110 }
5111 }
5112
5113 if (Any)
5114 LU.RecomputeRegs(LUIdx, RegUses);
5115 }
5116
5117 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5118 }
5119}
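
// Illustrative example: if reg({0,+,1}) is referenced by five of six uses
// while every other register is referenced by at most two, it is taken as
// the winner, and each use that references it drops all formulae that do
// not contain it; the loop repeats until the search space is small enough.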
5120
5121/// If there are an extraordinary number of formulae to choose from, use some
5122/// rough heuristics to prune down the number of formulae. This keeps the main
5123/// solver from taking an extraordinary amount of time in some worst-case
5124/// scenarios.
5125void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5126 NarrowSearchSpaceByDetectingSupersets();
5127 NarrowSearchSpaceByCollapsingUnrolledCode();
5128 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5129 if (FilterSameScaledReg)
5130 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5131 NarrowSearchSpaceByFilterPostInc();
5132 if (LSRExpNarrow)
5133 NarrowSearchSpaceByDeletingCostlyFormulas();
5134 else
5135 NarrowSearchSpaceByPickingWinnerRegs();
5136}
5137
5138/// This is the recursive solver.
5139void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5140 Cost &SolutionCost,
5141 SmallVectorImpl<const Formula *> &Workspace,
5142 const Cost &CurCost,
5143 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5144 DenseSet<const SCEV *> &VisitedRegs) const {
5145 // Some ideas:
5146 // - prune more:
5147 // - use more aggressive filtering
5148 // - sort the formula so that the most profitable solutions are found first
5149 // - sort the uses too
5150 // - search faster:
5151 // - don't compute a cost, and then compare. compare while computing a cost
5152 // and bail early.
5153 // - track register sets with SmallBitVector
5154
5155 const LSRUse &LU = Uses[Workspace.size()];
5156
5157 // If this use references any register that's already a part of the
5158 // in-progress solution, consider it a requirement that a formula must
5159 // reference that register in order to be considered. This prunes out
5160 // unprofitable searching.
5161 SmallSetVector<const SCEV *, 4> ReqRegs;
5162 for (const SCEV *S : CurRegs)
5163 if (LU.Regs.count(S))
5164 ReqRegs.insert(S);
5165
5166 SmallPtrSet<const SCEV *, 16> NewRegs;
5167 Cost NewCost(L, SE, TTI, AMK);
5168 for (const Formula &F : LU.Formulae) {
5169 // Ignore formulae which may not be ideal in terms of register reuse of
5170 // ReqRegs. The formula should use all required registers before
5171 // introducing new ones.
5172 // This can sometimes (notably when trying to favour postinc) lead to
5173 // sub-optimal decisions. In those cases it is best left to the cost
5174 // modelling to get right.
5175 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5176 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5177 for (const SCEV *Reg : ReqRegs) {
5178 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5179 is_contained(F.BaseRegs, Reg)) {
5180 --NumReqRegsToFind;
5181 if (NumReqRegsToFind == 0)
5182 break;
5183 }
5184 }
5185 if (NumReqRegsToFind != 0) {
5186 // If none of the formulae satisfied the required registers, then we could
5187 // clear ReqRegs and try again. Currently, we simply give up in this case.
5188 continue;
5189 }
5190 }
5191
5192 // Evaluate the cost of the current formula. If it's already worse than
5193 // the current best, prune the search at that point.
5194 NewCost = CurCost;
5195 NewRegs = CurRegs;
5196 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5197 if (NewCost.isLess(SolutionCost)) {
5198 Workspace.push_back(&F);
5199 if (Workspace.size() != Uses.size()) {
5200 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5201 NewRegs, VisitedRegs);
5202 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5203 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5204 } else {
5205 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5206 dbgs() << ".\nRegs:\n";
5207 for (const SCEV *S : NewRegs) dbgs()
5208 << "- " << *S << "\n";
5209 dbgs() << '\n');
5210
5211 SolutionCost = NewCost;
5212 Solution = Workspace;
5213 }
5214 Workspace.pop_back();
5215 }
5216 }
5217}
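
// To give a feel for the search (illustrative sizes): with three uses
// holding 4, 3 and 2 formulae, an unpruned search would rate
// 4 * 3 * 2 == 24 complete assignments. The SolutionCost bound and the
// ReqRegs filter above typically prune most of these branches before the
// recursion reaches the final use.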
5218
5219/// Choose one formula from each use. Return the results in the given Solution
5220/// vector.
5221void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5222 SmallVector<const Formula *, 8> Workspace;
5223 Cost SolutionCost(L, SE, TTI, AMK);
5224 SolutionCost.Lose();
5225 Cost CurCost(L, SE, TTI, AMK);
5226 SmallPtrSet<const SCEV *, 16> CurRegs;
5227 DenseSet<const SCEV *> VisitedRegs;
5228 Workspace.reserve(Uses.size());
5229
5230 // SolveRecurse does all the work.
5231 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5232 CurRegs, VisitedRegs);
5233 if (Solution.empty()) {
5234 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5235 return;
5236 }
5237
5238 // Ok, we've now made all our decisions.
5239 LLVM_DEBUG(dbgs() << "\n"
5240 "The chosen solution requires ";
5241 SolutionCost.print(dbgs()); dbgs() << ":\n";
5242 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5243 dbgs() << " ";
5244 Uses[i].print(dbgs());
5245 dbgs() << "\n"
5246 " ";
5247 Solution[i]->print(dbgs());
5248 dbgs() << '\n';
5249 });
5250
5251 assert(Solution.size() == Uses.size() && "Malformed solution!");
5252
5253 if (BaselineCost.isLess(SolutionCost)) {
5254 if (!AllowDropSolutionIfLessProfitable)
5255 LLVM_DEBUG(
5256 dbgs() << "Baseline is more profitable than chosen solution, "
5257 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5258 else {
5259 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5260 "solution, dropping LSR solution.\n";);
5261 Solution.clear();
5262 }
5263 }
5264}
5265
5266/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5267/// as we can go while still being dominated by the input positions. This helps
5268/// canonicalize the insert position, which encourages sharing.
5269BasicBlock::iterator
5270LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5271 const SmallVectorImpl<Instruction *> &Inputs)
5272 const {
5273 Instruction *Tentative = &*IP;
5274 while (true) {
5275 bool AllDominate = true;
5276 Instruction *BetterPos = nullptr;
5277 // Don't bother attempting to insert before a catchswitch; its basic block
5278 // cannot have other non-PHI instructions.
5279 if (isa<CatchSwitchInst>(Tentative))
5280 return IP;
5281
5282 for (Instruction *Inst : Inputs) {
5283 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5284 AllDominate = false;
5285 break;
5286 }
5287 // Attempt to find an insert position in the middle of the block,
5288 // instead of at the end, so that it can be used for other expansions.
5289 if (Tentative->getParent() == Inst->getParent() &&
5290 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5291 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5292 }
5293 if (!AllDominate)
5294 break;
5295 if (BetterPos)
5296 IP = BetterPos->getIterator();
5297 else
5298 IP = Tentative->getIterator();
5299
5300 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5301 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5302
5303 BasicBlock *IDom;
5304 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5305 if (!Rung) return IP;
5306 Rung = Rung->getIDom();
5307 if (!Rung) return IP;
5308 IDom = Rung->getBlock();
5309
5310 // Don't climb into a loop though.
5311 const Loop *IDomLoop = LI.getLoopFor(IDom);
5312 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5313 if (IDomDepth <= IPLoopDepth &&
5314 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5315 break;
5316 }
5317
5318 Tentative = IDom->getTerminator();
5319 }
5320
5321 return IP;
5322}
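
// Illustrative example: if every input of the expansion is defined in the
// loop preheader, an insert position chosen deep inside a block can be
// hoisted up the dominator tree toward the earliest block still dominated
// by all inputs (e.g. the preheader's terminator), so later expansions of
// related expressions land at the same canonical point and can be shared.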
5323
5324/// Determine an insert position which will be dominated by the operands and
5325/// which will dominate the result.
5326BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5327 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5328 // Collect some instructions which must be dominated by the
5329 // expanding replacement. These must be dominated by any operands that
5330 // will be required in the expansion.
5331 SmallVector<Instruction *, 4> Inputs;
5332 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5333 Inputs.push_back(I);
5334 if (LU.Kind == LSRUse::ICmpZero)
5335 if (Instruction *I =
5336 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5337 Inputs.push_back(I);
5338 if (LF.PostIncLoops.count(L)) {
5339 if (LF.isUseFullyOutsideLoop(L))
5340 Inputs.push_back(L->getLoopLatch()->getTerminator());
5341 else
5342 Inputs.push_back(IVIncInsertPos);
5343 }
5344 // The expansion must also be dominated by the increment positions of any
5345 // loops for which it is using post-inc mode.
5346 for (const Loop *PIL : LF.PostIncLoops) {
5347 if (PIL == L) continue;
5348
5349 // Be dominated by the loop exit.
5350 SmallVector<BasicBlock *, 4> ExitingBlocks;
5351 PIL->getExitingBlocks(ExitingBlocks);
5352 if (!ExitingBlocks.empty()) {
5353 BasicBlock *BB = ExitingBlocks[0];
5354 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5355 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5356 Inputs.push_back(BB->getTerminator());
5357 }
5358 }
5359
5360 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5361 && !isa<DbgInfoIntrinsic>(LowestIP) &&
5362 "Insertion point must be a normal instruction");
5363
5364 // Then, climb up the immediate dominator tree as far as we can go while
5365 // still being dominated by the input positions.
5366 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5367
5368 // Don't insert instructions before PHI nodes.
5369 while (isa<PHINode>(IP)) ++IP;
5370
5371 // Ignore landingpad instructions.
5372 while (IP->isEHPad()) ++IP;
5373
5374 // Ignore debug intrinsics.
5375 while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5376
5377 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5378 // IP consistent across expansions and allows the previously inserted
5379 // instructions to be reused by subsequent expansion.
5380 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5381 ++IP;
5382
5383 return IP;
5384}
5385
5386/// Emit instructions for the leading candidate expression for this LSRUse (this
5387/// is called "expanding").
5388Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5389 const Formula &F, BasicBlock::iterator IP,
5390 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5391 if (LU.RigidFormula)
5392 return LF.OperandValToReplace;
5393
5394 // Determine an input position which will be dominated by the operands and
5395 // which will dominate the result.
5396 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5397 Rewriter.setInsertPoint(&*IP);
5398
5399 // Inform the Rewriter if we have a post-increment use, so that it can
5400 // perform an advantageous expansion.
5401 Rewriter.setPostInc(LF.PostIncLoops);
5402
5403 // This is the type that the user actually needs.
5404 Type *OpTy = LF.OperandValToReplace->getType();
5405 // This will be the type that we'll initially expand to.
5406 Type *Ty = F.getType();
5407 if (!Ty)
5408 // No type known; just expand directly to the ultimate type.
5409 Ty = OpTy;
5410 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5411 // Expand directly to the ultimate type if it's the right size.
5412 Ty = OpTy;
5413 // This is the type to do integer arithmetic in.
5414 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5415
5416 // Build up a list of operands to add together to form the full base.
5417 SmallVector<const SCEV *, 8> Ops;
5418
5419 // Expand the BaseRegs portion.
5420 for (const SCEV *Reg : F.BaseRegs) {
5421 assert(!Reg->isZero() && "Zero allocated in a base register!");
5422
5423 // If we're expanding for a post-inc user, make the post-inc adjustment.
5424 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5425 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5426 }
5427
5428 // Expand the ScaledReg portion.
5429 Value *ICmpScaledV = nullptr;
5430 if (F.Scale != 0) {
5431 const SCEV *ScaledS = F.ScaledReg;
5432
5433 // If we're expanding for a post-inc user, make the post-inc adjustment.
5434 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5435 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5436
5437 if (LU.Kind == LSRUse::ICmpZero) {
5438 // Expand ScaleReg as if it was part of the base regs.
5439 if (F.Scale == 1)
5440 Ops.push_back(
5441 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5442 else {
5443 // An interesting way of "folding" with an icmp is to use a negated
5444 // scale, which we'll implement by inserting it into the other operand
5445 // of the icmp.
5446 assert(F.Scale == -1 &&
5447 "The only scale supported by ICmpZero uses is -1!");
5448 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5449 }
5450 } else {
5451 // Otherwise just expand the scaled register and an explicit scale,
5452 // which is expected to be matched as part of the address.
5453
5454 // Flush the operand list to suppress SCEVExpander hoisting of address
5455 // modes, unless the addressing mode will not be folded anyway.
5456 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5457 isAMCompletelyFolded(TTI, LU, F)) {
5458 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5459 Ops.clear();
5460 Ops.push_back(SE.getUnknown(FullV));
5461 }
5462 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5463 if (F.Scale != 1)
5464 ScaledS =
5465 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5466 Ops.push_back(ScaledS);
5467 }
5468 }
5469
5470 // Expand the GV portion.
5471 if (F.BaseGV) {
5472 // Flush the operand list to suppress SCEVExpander hoisting.
5473 if (!Ops.empty()) {
5474 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5475 Ops.clear();
5476 Ops.push_back(SE.getUnknown(FullV));
5477 }
5478 Ops.push_back(SE.getUnknown(F.BaseGV));
5479 }
5480
5481 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5482 // unfolded offsets. LSR assumes they both live next to their uses.
5483 if (!Ops.empty()) {
5484 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5485 Ops.clear();
5486 Ops.push_back(SE.getUnknown(FullV));
5487 }
5488
5489 // Expand the immediate portion.
5490 int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
5491 if (Offset != 0) {
5492 if (LU.Kind == LSRUse::ICmpZero) {
5493 // The other interesting way of "folding" with an ICmpZero is to use a
5494 // negated immediate.
5495 if (!ICmpScaledV)
5496 ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
5497 else {
5498 Ops.push_back(SE.getUnknown(ICmpScaledV));
5499 ICmpScaledV = ConstantInt::get(IntTy, Offset);
5500 }
5501 } else {
5502 // Just add the immediate values. These again are expected to be matched
5503 // as part of the address.
5504 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
5505 }
5506 }
5507
5508 // Expand the unfolded offset portion.
5509 int64_t UnfoldedOffset = F.UnfoldedOffset;
5510 if (UnfoldedOffset != 0) {
5511 // Just add the immediate values.
5512 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
5513 UnfoldedOffset)));
5514 }
5515
5516 // Emit instructions summing all the operands.
5517 const SCEV *FullS = Ops.empty() ?
5518 SE.getConstant(IntTy, 0) :
5519 SE.getAddExpr(Ops);
5520 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5521
5522 // We're done expanding now, so reset the rewriter.
5523 Rewriter.clearPostInc();
5524
5525 // An ICmpZero Formula represents an ICmp which we're handling as a
5526 // comparison against zero. Now that we've expanded an expression for that
5527 // form, update the ICmp's other operand.
5528 if (LU.Kind == LSRUse::ICmpZero) {
5529 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5530 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5531 DeadInsts.emplace_back(OperandIsInstr);
5532 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5533 "a scale at the same time!");
5534 if (F.Scale == -1) {
5535 if (ICmpScaledV->getType() != OpTy) {
5536 Instruction *Cast = CastInst::Create(
5537 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5538 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5539 ICmpScaledV = Cast;
5540 }
5541 CI->setOperand(1, ICmpScaledV);
5542 } else {
5543 // A scale of 1 means that the scale has been expanded as part of the
5544 // base regs.
5545 assert((F.Scale == 0 || F.Scale == 1) &&
5546 "ICmp does not support folding a global value and "
5547 "a scale at the same time!");
5548 Constant *C = ConstantInt::getSigned(SE.getContext(),
5549 -(uint64_t)Offset);
5550 if (C->getType() != OpTy) {
5551 C = ConstantFoldCastOperand(
5552 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5553 CI->getModule()->getDataLayout());
5554 assert(C && "Cast of ConstantInt should have folded");
5555 }
5556
5557 CI->setOperand(1, C);
5558 }
5559 }
5560
5561 return FullV;
5562}
5563
5564/// Helper for Rewrite. PHI nodes are special because the use of their operands
5565/// effectively happens in their predecessor blocks, so the expression may need
5566/// to be expanded in multiple places.
5567void LSRInstance::RewriteForPHI(
5568 PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5569 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5570 DenseMap<BasicBlock *, Value *> Inserted;
5571
5572 // Inserting instructions in the loop and using them as PHI's input could
5573 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
5574 // corresponding incoming block is not loop-exiting). So collect all such
5575 // instructions to form LCSSA for them later.
5576 SmallVector<Instruction *, 4> InsertedNonLCSSAInsts;
5577
5578 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5579 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5580 bool needUpdateFixups = false;
5581 BasicBlock *BB = PN->getIncomingBlock(i);
5582
5583 // If this is a critical edge, split the edge so that we do not insert
5584 // the code on all predecessor/successor paths. We do this unless this
5585 // is the canonical backedge for this loop, which complicates post-inc
5586 // users.
5587 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5588 !isa<IndirectBrInst>(BB->getTerminator()) &&
5589 !isa<CatchSwitchInst>(BB->getTerminator())) {
5590 BasicBlock *Parent = PN->getParent();
5591 Loop *PNLoop = LI.getLoopFor(Parent);
5592 if (!PNLoop || Parent != PNLoop->getHeader()) {
5593 // Split the critical edge.
5594 BasicBlock *NewBB = nullptr;
5595 if (!Parent->isLandingPad()) {
5596 NewBB =
5597 SplitCriticalEdge(BB, Parent,
5598 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5599 .setMergeIdenticalEdges()
5600 .setKeepOneInputPHIs());
5601 } else {
5602 SmallVector<BasicBlock*, 2> NewBBs;
5603 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5604 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5605 NewBB = NewBBs[0];
5606 }
5607 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5608 // phi predecessors are identical. The simple thing to do is skip
5609 // splitting in this case rather than complicate the API.
5610 if (NewBB) {
5611 // If PN is outside of the loop and BB is in the loop, we want to
5612 // move the block to be immediately before the PHI block, not
5613 // immediately after BB.
5614 if (L->contains(BB) && !L->contains(PN))
5615 NewBB->moveBefore(PN->getParent());
5616
5617 // Splitting the edge can reduce the number of PHI entries we have.
5618 e = PN->getNumIncomingValues();
5619 BB = NewBB;
5620 i = PN->getBasicBlockIndex(BB);
5621
5622 needUpdateFixups = true;
5623 }
5624 }
5625 }
5626
5627 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5628 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5629 if (!Pair.second)
5630 PN->setIncomingValue(i, Pair.first->second);
5631 else {
5632 Value *FullV =
5633 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5634
5635 // If this is reuse-by-noop-cast, insert the noop cast.
5636 Type *OpTy = LF.OperandValToReplace->getType();
5637 if (FullV->getType() != OpTy)
5638 FullV = CastInst::Create(
5639 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5640 LF.OperandValToReplace->getType(), "tmp",
5641 BB->getTerminator()->getIterator());
5642
5643 // If the incoming block for this value is not in the loop, it means the
5644 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5645 // the inserted value.
5646 if (auto *I = dyn_cast<Instruction>(FullV))
5647 if (L->contains(I) && !L->contains(BB))
5648 InsertedNonLCSSAInsts.push_back(I);
5649
5650 PN->setIncomingValue(i, FullV);
5651 Pair.first->second = FullV;
5652 }
5653
5654 // If LSR splits a critical edge and the phi node has other pending
5655 // fixup operands, we need to update those pending fixups. Otherwise
5656 // formulae will not be implemented completely and some instructions
5657 // will not be eliminated.
5658 if (needUpdateFixups) {
5659 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
5660 for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
5661 // If fixup is supposed to rewrite some operand in the phi
5662 // that was just updated, it may be already moved to
5663 // another phi node. Such fixup requires update.
5664 if (Fixup.UserInst == PN) {
5665 // Check if the operand we try to replace still exists in the
5666 // original phi.
5667 bool foundInOriginalPHI = false;
5668 for (const auto &val : PN->incoming_values())
5669 if (val == Fixup.OperandValToReplace) {
5670 foundInOriginalPHI = true;
5671 break;
5672 }
5673
5674 // If fixup operand found in original PHI - nothing to do.
5675 if (foundInOriginalPHI)
5676 continue;
5677
5678 // Otherwise it might be moved to another PHI and requires update.
5679 // If fixup operand not found in any of the incoming blocks that
5680 // means we have already rewritten it - nothing to do.
5681 for (const auto &Block : PN->blocks())
5682 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5683 ++I) {
5684 PHINode *NewPN = cast<PHINode>(I);
5685 for (const auto &val : NewPN->incoming_values())
5686 if (val == Fixup.OperandValToReplace)
5687 Fixup.UserInst = NewPN;
5688 }
5689 }
5690 }
5691 }
5692
5693 formLCSSAForInstructions(InsertedNonLCSSAInsts, DT, LI, &SE);
5694}
5695
5696/// Emit instructions for the leading candidate expression for this LSRUse (this
5697/// is called "expanding"), and update the UserInst to reference the newly
5698/// expanded value.
5699void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5700 const Formula &F,
5701 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5702 // First, find an insertion point that dominates UserInst. For PHI nodes,
5703 // find the nearest block which dominates all the relevant uses.
5704 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5705 RewriteForPHI(PN, LU, LF, F, DeadInsts);
5706 } else {
5707 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
5708
5709 // If this is reuse-by-noop-cast, insert the noop cast.
5710 Type *OpTy = LF.OperandValToReplace->getType();
5711 if (FullV->getType() != OpTy) {
5712 Instruction *Cast =
5713 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
5714 FullV, OpTy, "tmp", LF.UserInst->getIterator());
5715 FullV = Cast;
5716 }
5717
5718 // Update the user. ICmpZero is handled specially here (for now) because
5719 // Expand may have updated one of the operands of the icmp already, and
5720 // its new value may happen to be equal to LF.OperandValToReplace, in
5721 // which case doing replaceUsesOfWith leads to replacing both operands
5722 // with the same value. TODO: Reorganize this.
5723 if (LU.Kind == LSRUse::ICmpZero)
5724 LF.UserInst->setOperand(0, FullV);
5725 else
5726 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
5727 }
5728
5729 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
5730 DeadInsts.emplace_back(OperandIsInstr);
5731}
5732
5733// Try to hoist the IVInc to the loop header if all IVInc users are in
5734// the loop header. This helps the backend generate post-index loads/stores
5735// when the latch block is different from the loop header block.
5736static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
5737 const LSRUse &LU, Instruction *IVIncInsertPos,
5738 Loop *L) {
5739 if (LU.Kind != LSRUse::Address)
5740 return false;
5741
5742 // For now this code does the conservative optimization and only works for
5743 // the header block. Later we can hoist the IVInc to the block that
5744 // post-dominates all users.
5745 BasicBlock *LHeader = L->getHeader();
5746 if (IVIncInsertPos->getParent() == LHeader)
5747 return false;
5748
5749 if (!Fixup.OperandValToReplace ||
5750 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
5751 Instruction *UI = cast<Instruction>(U);
5752 return UI->getParent() != LHeader;
5753 }))
5754 return false;
5755
5756 Instruction *I = Fixup.UserInst;
5757 Type *Ty = I->getType();
5758 return Ty->isIntegerTy() &&
5759 ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
5760 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
5761}
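
// Illustrative AArch64-flavoured example of the hoisting this enables: with
// a latch block separate from the header,
//   header: %v = load i64, ptr %p      ; the only users of %p are here
//   latch:  br label %header
// moving the IV increment's insertion point to the header terminator lets
// the backend fold the increment into a post-indexed access such as
// "ldr x0, [x1], #8" instead of keeping a separate add in the latch.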
5762
5763/// Rewrite all the fixup locations with new values, following the chosen
5764/// solution.
5765void LSRInstance::ImplementSolution(
5766 const SmallVectorImpl<const Formula *> &Solution) {
5767 // Keep track of instructions we may have made dead, so that
5768 // we can remove them after we are done working.
5769 SmallVector<WeakTrackingVH, 16> DeadInsts;
5770
5771 // Mark phi nodes that terminate chains so the expander tries to reuse them.
5772 for (const IVChain &Chain : IVChainVec) {
5773 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
5774 Rewriter.setChainedPhi(PN);
5775 }
5776
5777 // Expand the new value definitions and update the users.
5778 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
5779 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
5780 Instruction *InsertPos =
5781 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
5782 ? L->getHeader()->getTerminator()
5783 : IVIncInsertPos;
5784 Rewriter.setIVIncInsertPos(L, InsertPos);
5785 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
5786 Changed = true;
5787 }
5788
5789 for (const IVChain &Chain : IVChainVec) {
5790 GenerateIVChain(Chain, DeadInsts);
5791 Changed = true;
5792 }
5793
5794 for (const WeakVH &IV : Rewriter.getInsertedIVs())
5795 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
5796 ScalarEvolutionIVs.push_back(IV);
5797
5798 // Clean up after ourselves. This must be done before deleting any
5799 // instructions.
5800 Rewriter.clear();
5801
5802 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
5803 &TLI, MSSAU);
5804
5805 // In our cost analysis above, we assume that each addrec consumes exactly
5806 // one register, and arrange to have increments inserted just before the
5807 // latch to maximize the chance this is true. However, if we reused
5808 // existing IVs, we now need to move the increments to match our
5809 // expectations. Otherwise, our cost modeling results in us having
5810 // chosen a non-optimal result for the actual schedule. (And yes, this
5811 // scheduling decision does impact later codegen.)
5812 for (PHINode &PN : L->getHeader()->phis()) {
5813 BinaryOperator *BO = nullptr;
5814 Value *Start = nullptr, *Step = nullptr;
5815 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
5816 continue;
5817
5818 switch (BO->getOpcode()) {
5819 case Instruction::Sub:
5820 if (BO->getOperand(0) != &PN)
5821 // sub is non-commutative - match handling elsewhere in LSR
5822 continue;
5823 break;
5824 case Instruction::Add:
5825 break;
5826 default:
5827 continue;
5828 };
5829
5830 if (!isa<Constant>(Step))
5831 // If not a constant step, might increase register pressure
5832 // (We assume constants have been canonicalized to RHS)
5833 continue;
5834
5835 if (BO->getParent() == IVIncInsertPos->getParent())
5836 // Only bother moving across blocks. Isel can handle block local case.
5837 continue;
5838
5839 // Can we legally schedule inc at the desired point?
5840 if (!llvm::all_of(BO->uses(),
5841 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
5842 continue;
5843 BO->moveBefore(IVIncInsertPos);
5844 Changed = true;
5845 }
5846
5847
5848}
5849
5850LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
5851 DominatorTree &DT, LoopInfo &LI,
5852 const TargetTransformInfo &TTI, AssumptionCache &AC,
5853 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
5854 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
5855 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
5856 ? PreferredAddresingMode
5857 : TTI.getPreferredAddressingMode(L, &SE)),
5858 Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr", false),
5859 BaselineCost(L, SE, TTI, AMK) {
5860 // If LoopSimplify form is not available, stay out of trouble.
5861 if (!L->isLoopSimplifyForm())
5862 return;
5863
5864 // If there's no interesting work to be done, bail early.
5865 if (IU.empty()) return;
5866
5867 // If there's too much analysis to be done, bail early. We won't be able to
5868 // model the problem anyway.
5869 unsigned NumUsers = 0;
5870 for (const IVStrideUse &U : IU) {
5871 if (++NumUsers > MaxIVUsers) {
5872 (void)U;
5873 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
5874 << "\n");
5875 return;
5876 }
5877 // Bail out if we have a PHI on an EHPad that gets a value from a
5878 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
5879 // no good place to stick any instructions.
5880 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
5881 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
5882 if (isa<FuncletPadInst>(FirstNonPHI) ||
5883 isa<CatchSwitchInst>(FirstNonPHI))
5884 for (BasicBlock *PredBB : PN->blocks())
5885 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
5886 return;
5887 }
5888 }
5889
5890 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
5891 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
5892 dbgs() << ":\n");
5893
5894 // Configure SCEVExpander already now, so the correct mode is used for
5895 // isSafeToExpand() checks.
5896#ifndef NDEBUG
5897 Rewriter.setDebugType(DEBUG_TYPE);
5898#endif
5899 Rewriter.disableCanonicalMode();
5900 Rewriter.enableLSRMode();
5901
5902 // First, perform some low-level loop optimizations.
5903 OptimizeShadowIV();
5904 OptimizeLoopTermCond();
5905
5906 // If loop preparation eliminates all interesting IV users, bail.
5907 if (IU.empty()) return;
5908
5909 // Skip nested loops until we can model them better with formulae.
5910 if (!L->isInnermost()) {
5911 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
5912 return;
5913 }
5914
5915 // Start collecting data and preparing for the solver.
5916 // If the number of registers is not the major cost, we cannot benefit from
5917 // the current profitable chain optimization, which is based on the number
5918 // of registers.
5919 // FIXME: add profitable chain optimization for other kinds of major cost,
5920 // for example the number of instructions.
5921 if (TTI.isNumRegsMajorCostOfLSR())
5922 CollectChains();
5923 CollectInterestingTypesAndFactors();
5924 CollectFixupsAndInitialFormulae();
5925 CollectLoopInvariantFixupsAndFormulae();
5926
5927 if (Uses.empty())
5928 return;
5929
5930 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
5931 print_uses(dbgs()));
5932 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
5933 BaselineCost.print(dbgs()); dbgs() << "\n");
5934
5935 // Now use the reuse data to generate a bunch of interesting ways
5936 // to formulate the values needed for the uses.
5937 GenerateAllReuseFormulae();
5938
5939 FilterOutUndesirableDedicatedRegisters();
5940 NarrowSearchSpaceUsingHeuristics();
5941
5942 SmallVector<const Formula *, 8> Solution;
5943 Solve(Solution);
5944
5945 // Release memory that is no longer needed.
5946 Factors.clear();
5947 Types.clear();
5948 RegUses.clear();
5949
5950 if (Solution.empty())
5951 return;
5952
5953#ifndef NDEBUG
5954 // Formulae should be legal.
5955 for (const LSRUse &LU : Uses) {
5956 for (const Formula &F : LU.Formulae)
5957 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
5958 F) && "Illegal formula generated!");
5959 };
5960#endif
5961
5962 // Now that we've decided what we want, make it so.
5963 ImplementSolution(Solution);
5964}
5965
5966#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5967void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
5968 if (Factors.empty() && Types.empty()) return;
5969
5970 OS << "LSR has identified the following interesting factors and types: ";
5971 bool First = true;
5972
5973 for (int64_t Factor : Factors) {
5974 if (!First) OS << ", ";
5975 First = false;
5976 OS << '*' << Factor;
5977 }
5978
5979 for (Type *Ty : Types) {
5980 if (!First) OS << ", ";
5981 First = false;
5982 OS << '(' << *Ty << ')';
5983 }
5984 OS << '\n';
5985}
5986
5987void LSRInstance::print_fixups(raw_ostream &OS) const {
5988 OS << "LSR is examining the following fixup sites:\n";
5989 for (const LSRUse &LU : Uses)
5990 for (const LSRFixup &LF : LU.Fixups) {
5991 dbgs() << " ";
5992 LF.print(OS);
5993 OS << '\n';
5994 }
5995}
5996
5997void LSRInstance::print_uses(raw_ostream &OS) const {
5998 OS << "LSR is examining the following uses:\n";
5999 for (const LSRUse &LU : Uses) {
6000 dbgs() << " ";
6001 LU.print(OS);
6002 OS << '\n';
6003 for (const Formula &F : LU.Formulae) {
6004 OS << " ";
6005 F.print(OS);
6006 OS << '\n';
6007 }
6008 }
6009}
6010
6011void LSRInstance::print(raw_ostream &OS) const {
6012 print_factors_and_types(OS);
6013 print_fixups(OS);
6014 print_uses(OS);
6015}
6016
6017LLVM_DUMP_METHOD void LSRInstance::dump() const {
6018 print(errs()); errs() << '\n';
6019}
6020#endif
6021
6022namespace {
6023
6024class LoopStrengthReduce : public LoopPass {
6025public:
6026 static char ID; // Pass ID, replacement for typeid
6027
6028 LoopStrengthReduce();
6029
6030private:
6031 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6032 void getAnalysisUsage(AnalysisUsage &AU) const override;
6033};
6034
6035} // end anonymous namespace
6036
6037LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6038 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6039}
6040
6041void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6042 // We split critical edges, so we change the CFG. However, we do update
6043 // many analyses if they are around.
6044 AU.addPreservedID(LoopSimplifyID);
6045
6046 AU.addRequired<LoopInfoWrapperPass>();
6047 AU.addPreserved<LoopInfoWrapperPass>();
6048 AU.addRequiredID(LoopSimplifyID);
6049 AU.addRequired<DominatorTreeWrapperPass>();
6050 AU.addPreserved<DominatorTreeWrapperPass>();
6051 AU.addRequired<ScalarEvolutionWrapperPass>();
6052 AU.addPreserved<ScalarEvolutionWrapperPass>();
6053 AU.addRequired<AssumptionCacheTracker>();
6054 AU.addRequired<TargetLibraryInfoWrapperPass>();
6055 // Requiring LoopSimplify a second time here prevents IVUsers from running
6056 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6057 AU.addRequiredID(LoopSimplifyID);
6058 AU.addRequired<IVUsersWrapperPass>();
6059 AU.addPreserved<IVUsersWrapperPass>();
6060 AU.addRequired<TargetTransformInfoWrapperPass>();
6061 AU.addPreserved<MemorySSAWrapperPass>();
6062}
6063
6064namespace {
6065
6066/// Enables more convenient iteration over a DWARF expression vector.
6067static iterator_range<llvm::DIExpression::expr_op_iterator>
6068ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6069 llvm::DIExpression::expr_op_iterator Begin =
6070 llvm::DIExpression::expr_op_iterator(Expr.begin());
6071 llvm::DIExpression::expr_op_iterator End =
6072 llvm::DIExpression::expr_op_iterator(Expr.end());
6073 return {Begin, End};
6074}
6075
6076struct SCEVDbgValueBuilder {
6077 SCEVDbgValueBuilder() = default;
6078 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6079
6080 void clone(const SCEVDbgValueBuilder &Base) {
6081 LocationOps = Base.LocationOps;
6082 Expr = Base.Expr;
6083 }
6084
6085 void clear() {
6086 LocationOps.clear();
6087 Expr.clear();
6088 }
6089
6090 /// The DIExpression as we translate the SCEV.
6091 SmallVector<uint64_t, 6> Expr;
6092 /// The location ops of the DIExpression.
6093 SmallVector<Value *, 2> LocationOps;
6094
6095 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6096 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6097
6098 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6099 /// in the set of values referenced by the expression.
6100 void pushLocation(llvm::Value *V) {
6101 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6102 auto *It = llvm::find(LocationOps, V);
6103 unsigned ArgIndex = 0;
6104 if (It != LocationOps.end()) {
6105 ArgIndex = std::distance(LocationOps.begin(), It);
6106 } else {
6107 ArgIndex = LocationOps.size();
6108 LocationOps.push_back(V);
6109 }
6110 Expr.push_back(ArgIndex);
6111 }
6112
6113 void pushValue(const SCEVUnknown *U) {
6114 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6115 pushLocation(V);
6116 }
6117
6118 bool pushConst(const SCEVConstant *C) {
6119 if (C->getAPInt().getSignificantBits() > 64)
6120 return false;
6121 Expr.push_back(llvm::dwarf::DW_OP_consts);
6122 Expr.push_back(C->getAPInt().getSExtValue());
6123 return true;
6124 }
6125
6126 // Iterating the expression as DWARF ops is convenient when updating
6127 // DWARF_OP_LLVM_args.
6128 iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6129 return ToDwarfOpIter(Expr);
6130 }
6131
6132 /// Several SCEV types are sequences of the same arithmetic operator applied
6133 /// to constants and values that may be extended or truncated.
6134 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6135 uint64_t DwarfOp) {
6136 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6137 "Expected arithmetic SCEV type");
6138 bool Success = true;
6139 unsigned EmitOperator = 0;
6140 for (const auto &Op : CommExpr->operands()) {
6141 Success &= pushSCEV(Op);
6142
6143 if (EmitOperator >= 1)
6144 pushOperator(DwarfOp);
6145 ++EmitOperator;
6146 }
6147 return Success;
6148 }
6149
6150 // TODO: Identify and omit noop casts.
6151 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6152 const llvm::SCEV *Inner = C->getOperand(0);
6153 const llvm::Type *Type = C->getType();
6154 uint64_t ToWidth = Type->getIntegerBitWidth();
6155 bool Success = pushSCEV(Inner);
6156 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6157 IsSigned ? llvm::dwarf::DW_ATE_signed
6158 : llvm::dwarf::DW_ATE_unsigned};
6159 for (const auto &Op : CastOps)
6160 pushOperator(Op);
6161 return Success;
6162 }
6163
6164 // TODO: MinMax - although these haven't been encountered in the test suite.
6165 bool pushSCEV(const llvm::SCEV *S) {
6166 bool Success = true;
6167 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6168 Success &= pushConst(StartInt);
6169
6170 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6171 if (!U->getValue())
6172 return false;
6173 pushLocation(U->getValue());
6174
6175 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6176 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6177
6178 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6179 Success &= pushSCEV(UDiv->getLHS());
6180 Success &= pushSCEV(UDiv->getRHS());
6181 pushOperator(llvm::dwarf::DW_OP_div);
6182
6183 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6184 // Assert if a new and unknown SCEVCastExpr type is encountered.
6185 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6186 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6187 "Unexpected cast type in SCEV.");
6188 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6189
6190 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6191 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6192
6193 } else if (isa<SCEVAddRecExpr>(S)) {
6194 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6195 // unsupported.
6196 return false;
6197
6198 } else {
6199 return false;
6200 }
6201 return Success;
6202 }
6203
6204 /// Return true if the combination of arithmetic operator and underlying
6205 /// SCEV constant value is an identity function.
6206 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6207 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6208 if (C->getAPInt().getSignificantBits() > 64)
6209 return false;
6210 int64_t I = C->getAPInt().getSExtValue();
6211 switch (Op) {
6212 case llvm::dwarf::DW_OP_plus:
6213 case llvm::dwarf::DW_OP_minus:
6214 return I == 0;
6215 case llvm::dwarf::DW_OP_mul:
6216 case llvm::dwarf::DW_OP_div:
6217 return I == 1;
6218 }
6219 }
6220 return false;
6221 }
6222
6223 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6224 /// builder's expression stack. The stack should already contain an
6225 /// expression for the iteration count, so that it can be multiplied by
6226 /// the stride and added to the start.
6227 /// Components of the expression are omitted if they are an identity function.
6228 /// Chain (non-affine) SCEVs are not supported.
6229 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6230 assert(SAR.isAffine() && "Expected affine SCEV");
6231 // TODO: Is this check needed?
6232 if (isa<SCEVAddRecExpr>(SAR.getStart()))
6233 return false;
6234
6235 const SCEV *Start = SAR.getStart();
6236 const SCEV *Stride = SAR.getStepRecurrence(SE);
6237
6238 // Skip pushing arithmetic noops.
6239 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6240 if (!pushSCEV(Stride))
6241 return false;
6242 pushOperator(llvm::dwarf::DW_OP_mul);
6243 }
6244 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6245 if (!pushSCEV(Start))
6246 return false;
6247 pushOperator(llvm::dwarf::DW_OP_plus);
6248 }
6249 return true;
6250 }
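
// Illustrative example: for SAR == {%start,+,4} with the iteration-count
// expression already on the stack, this appends roughly
//   DW_OP_consts 4, DW_OP_mul, DW_OP_LLVM_arg <start>, DW_OP_plus
// so the salvaged location recomputes itercount * stride + start.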
6251
6252 /// Create an expression that is an offset from a value (usually the IV).
6253 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6254 pushLocation(OffsetValue);
6255 DIExpression::appendOffset(Expr, Offset);
6256 LLVM_DEBUG(
6257 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6258 << std::to_string(Offset) << "\n");
6259 }
6260
6261 /// Combine a translation of the SCEV and the IV to create an expression that
6262 /// recovers a location's value.
6263 /// returns true if an expression was created.
6264 bool createIterCountExpr(const SCEV *S,
6265 const SCEVDbgValueBuilder &IterationCount,
6266 ScalarEvolution &SE) {
6267 // SCEVs for SSA values are most frequently of the form
6268 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6269 // This is because %a is a PHI node that is not the IV. However, these
6270 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6271 // so it's not expected that this point will be reached.
6272 if (!isa<SCEVAddRecExpr>(S))
6273 return false;
6274
6275 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6276 << '\n');
6277
6278 const auto *Rec = cast<SCEVAddRecExpr>(S);
6279 if (!Rec->isAffine())
6280 return false;
6281
6282 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6283 return false;
6284
6285 // Initialise a new builder with the iteration count expression. In
6286 // combination with the value's SCEV this enables recovery.
6287 clone(IterationCount);
6288 if (!SCEVToValueExpr(*Rec, SE))
6289 return false;
6290
6291 return true;
6292 }
6293
6294 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6295 /// builder's expression stack. The stack should already contain an
6296 /// expression for the iteration count, so that it can be multiplied by
6297 /// the stride and added to the start.
6298 /// Components of the expression are omitted if they are an identity function.
6299 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6300 ScalarEvolution &SE) {
6301 assert(SAR.isAffine() && "Expected affine SCEV");
6302 if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6303 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6304 << SAR << '\n');
6305 return false;
6306 }
6307 const SCEV *Start = SAR.getStart();
6308 const SCEV *Stride = SAR.getStepRecurrence(SE);
6309
6310 // Skip pushing arithmetic noops.
6311 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6312 if (!pushSCEV(Start))
6313 return false;
6314 pushOperator(llvm::dwarf::DW_OP_minus);
6315 }
6316 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6317 if (!pushSCEV(Stride))
6318 return false;
6319 pushOperator(llvm::dwarf::DW_OP_div);
6320 }
6321 return true;
6322 }
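  // Illustrative example (hypothetical SCEV): for an IV described by
  // {%start,+,4} and the IV location already on the stack, the appended ops
  // are roughly
  //   <ops for %start>, DW_OP_minus, DW_OP_consts 4, DW_OP_div
  // i.e. itercount = (IV - start) / stride. The subtraction is skipped when
  // the start is 0 and the division when the stride is 1.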
6323
6324 // Append the current expression and locations to a location list and an
6325 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6326 // the locations already present in the destination list.
6327 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6328 SmallVectorImpl<Value *> &DestLocations) {
6329 assert(!DestLocations.empty() &&
6330 "Expected the locations vector to contain the IV");
6331 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6332 // modified to account for the locations already in the destination vector.
6333 // All builders contain the IV as the first location op.
6334 assert(!LocationOps.empty() &&
6335 "Expected the location ops to contain the IV.");
6336 // DestIndexMap[n] contains the index in DestLocations for the nth
6337 // location in this SCEVDbgValueBuilder.
6338 SmallVector<uint64_t, 2> DestIndexMap;
6339 for (const auto &Op : LocationOps) {
6340 auto It = find(DestLocations, Op);
6341 if (It != DestLocations.end()) {
6342 // Location already exists in DestLocations, reuse existing ArgIndex.
6343 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6344 continue;
6345 }
6346 // Location is not in DestLocations, add it.
6347 DestIndexMap.push_back(DestLocations.size());
6348 DestLocations.push_back(Op);
6349 }
6350
6351 for (const auto &Op : expr_ops()) {
6352 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6353 Op.appendToVector(DestExpr);
6354 continue;
6355 }
6356
6357 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6358 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6359 // DestIndexMap[n] contains its new index in DestLocations.
6360 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6361 DestExpr.push_back(NewIndex);
6362 }
6363 }
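  // Illustrative example (hypothetical values): if DestLocations already
  // holds [%iv, %b] and this builder's LocationOps are [%iv, %a], then
  // DestIndexMap becomes [0, 2], %a is appended to DestLocations, and a
  // DW_OP_LLVM_arg 1 in this expression is emitted as DW_OP_LLVM_arg 2 in
  // DestExpr.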
6364};
6365
6366/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6367/// and DIExpression.
6368struct DVIRecoveryRec {
6369 DVIRecoveryRec(DbgValueInst *DbgValue)
6370 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6371 HadLocationArgList(false) {}
6372 DVIRecoveryRec(DbgVariableRecord *DVR)
6373 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6374
6375 PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6376 DIExpression *Expr;
6377 bool HadLocationArgList;
6378 SmallVector<WeakVH, 2> LocationOps;
6379 SmallVector<const llvm::SCEV *, 2> SCEVs;
6380 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6381
6382 void clear() {
6383 for (auto &RE : RecoveryExprs)
6384 RE.reset();
6385 RecoveryExprs.clear();
6386 }
6387
6388 ~DVIRecoveryRec() { clear(); }
6389};
6390} // namespace
6391
6392/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6393/// This helps in determining if a DIArglist is necessary or can be omitted from
6394/// the dbg.value.
6395 static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6396 auto expr_ops = ToDwarfOpIter(Expr);
6397 unsigned Count = 0;
6398 for (auto Op : expr_ops)
6399 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6400 Count++;
6401 return Count;
6402}
6403
6404/// Overwrites DVI with the location and Ops as the DIExpression. This will
6405/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6406/// because a DIArglist is not created for the first argument of the dbg.value.
6407template <typename T>
6408static void updateDVIWithLocation(T &DbgVal, Value *Location,
6409 SmallVectorImpl<uint64_t> &Ops) {
6410 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6411 "contain any DW_OP_llvm_arg operands.");
6412 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6413 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6415}
6416
6417/// Overwrite DVI with locations placed into a DIArglist.
6418template <typename T>
6419static void updateDVIWithLocations(T &DbgVal,
6420 SmallVectorImpl<Value *> &Locations,
6421 SmallVectorImpl<uint64_t> &Ops) {
6422 assert(numLLVMArgOps(Ops) != 0 &&
6423 "Expected expression that references DIArglist locations using "
6424 "DW_OP_llvm_arg operands.");
6425 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6426 for (Value *V : Locations)
6427 MetadataLocs.push_back(ValueAsMetadata::get(V));
6428 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6429 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6430 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6431}
6432
6433/// Write the new expression and new location ops for the dbg.value. If possible
6434 /// reduce the size of the dbg.value intrinsic by omitting DIArglist. This
6435 /// can be omitted if:
6436 /// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6437/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6438static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6439 SmallVectorImpl<Value *> &NewLocationOps,
6440 SmallVectorImpl<uint64_t> &NewExpr) {
6441 auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6442 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6443 if (NumLLVMArgs == 0) {
6444 // Location assumed to be on the stack.
6445 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6446 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6447 // There is only a single DW_OP_llvm_arg at the start of the expression,
6448 // so it can be omitted along with DIArglist.
6449 assert(NewExpr[1] == 0 &&
6450 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6451 llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6452 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6453 } else {
6454 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6455 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6456 }
6457
6458 // If the DIExpression was previously empty then add the stack terminator.
6459 // Non-empty expressions have only had elements inserted into them and so
6460 // the terminator should already be present e.g. stack_value or fragment.
6461 DIExpression *SalvageExpr = DbgVal->getExpression();
6462 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6463 SalvageExpr =
6464 DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6465 DbgVal->setExpression(SalvageExpr);
6466 }
6467 };
6468 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6469 UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6470 else
6471 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6472}
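// Illustrative example of the two output shapes (hypothetical IR, shown
// roughly): a single location can stay outside a DIArgList, e.g.
//   call void @llvm.dbg.value(metadata i64 %iv, metadata !var,
//     metadata !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value))
// whereas an expression with several DW_OP_LLVM_arg operands needs one, e.g.
//   call void @llvm.dbg.value(metadata !DIArgList(i64 %iv, i64 %a), metadata !var,
//     metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1,
//                            DW_OP_plus, DW_OP_stack_value))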
6473
6474/// Cached location ops may be erased during LSR, in which case a poison is
6475/// required when restoring from the cache. The type of that location is no
6476/// longer available, so just use int8. The poison will be replaced by one or
6477/// more locations later when a SCEVDbgValueBuilder selects alternative
6478/// locations to use for the salvage.
6479 static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6480 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6481}
6482
6483 /// Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
6484static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6485 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6486 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6487 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6488 assert(DVIRec.Expr && "Expected an expression");
6489 DbgVal->setExpression(DVIRec.Expr);
6490
6491 // Even a single location-op may be inside a DIArgList and referenced with
6492 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6493 if (!DVIRec.HadLocationArgList) {
6494 assert(DVIRec.LocationOps.size() == 1 &&
6495 "Unexpected number of location ops.");
6496 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6497 // this case was not present before, so force the location back to a
6498 // single uncontained Value.
6499 Value *CachedValue =
6500 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6501 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6502 } else {
6503 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6504 for (WeakVH VH : DVIRec.LocationOps) {
6505 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6506 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6507 }
6508 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6509 DbgVal->setRawLocation(
6510 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6511 }
6512 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6513 };
6514 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6515 RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6516 else
6517 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6518}
6519
6520 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6521 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6522 const SCEV *SCEVInductionVar,
6523 SCEVDbgValueBuilder IterCountExpr) {
6524
6525 if (isa<DbgValueInst *>(DVIRec.DbgRef)
6526 ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6527 : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6528 return false;
6529
6530 // LSR may have caused several changes to the dbg.value in the failed salvage
6531 // attempt. So restore the DIExpression, the location ops and also the
6532 // location ops format, which is always DIArglist for multiple ops, but only
6533 // sometimes for a single op.
6534 restorePreTransformState(DVIRec);
6535
6536 // LocationOpIndexMap[i] will store the post-LSR location index of
6537 // the non-optimised out location at pre-LSR index i.
6538 SmallVector<int64_t, 2> LocationOpIndexMap;
6539 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6540 SmallVector<Value *, 2> NewLocationOps;
6541 NewLocationOps.push_back(LSRInductionVar);
6542
6543 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6544 WeakVH VH = DVIRec.LocationOps[i];
6545 // Place the locations not optimised out in the list first, avoiding
6546 // inserts later. The map is used to update the DIExpression's
6547 // DW_OP_LLVM_arg arguments as the expression is updated.
6548 if (VH && !isa<UndefValue>(VH)) {
6549 NewLocationOps.push_back(VH);
6550 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6551 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6552 << " now at index " << LocationOpIndexMap[i] << "\n");
6553 continue;
6554 }
6555
6556 // It's possible that a value referred to in the SCEV may have been
6557 // optimised out by LSR.
6558 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6559 SE.containsUndefs(DVIRec.SCEVs[i])) {
6560 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6561 << " refers to a location that is now undef or erased. "
6562 "Salvage abandoned.\n");
6563 return false;
6564 }
6565
6566 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6567 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6568
6569 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6570 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6571
6572 // Create an offset-based salvage expression if possible, as it requires
6573 // fewer DWARF ops than an iteration count-based expression.
6574 if (std::optional<APInt> Offset =
6575 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6576 if (Offset->getSignificantBits() <= 64)
6577 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6578 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6579 SE))
6580 return false;
6581 }
6582
6583 // Merge the DbgValueBuilder generated expressions and the original
6584 // DIExpression, place the result into a new vector.
6585 SmallVector<uint64_t, 6> NewExpr;
6586 if (DVIRec.Expr->getNumElements() == 0) {
6587 assert(DVIRec.RecoveryExprs.size() == 1 &&
6588 "Expected only a single recovery expression for an empty "
6589 "DIExpression.");
6590 assert(DVIRec.RecoveryExprs[0] &&
6591 "Expected a SCEVDbgSalvageBuilder for location 0");
6592 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6593 B->appendToVectors(NewExpr, NewLocationOps);
6594 }
6595 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6596 // Most Ops needn't be updated.
6597 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6598 Op.appendToVector(NewExpr);
6599 continue;
6600 }
6601
6602 uint64_t LocationArgIndex = Op.getArg(0);
6603 SCEVDbgValueBuilder *DbgBuilder =
6604 DVIRec.RecoveryExprs[LocationArgIndex].get();
6605 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6606 // optimise it away. Just translate the argument to the updated
6607 // location index.
6608 if (!DbgBuilder) {
6609 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6610 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6611 "Expected a positive index for the location-op position.");
6612 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6613 continue;
6614 }
6615 // The location has a recovery expression.
6616 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6617 }
6618
6619 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6620 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6621 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6622 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6623 else
6624 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6625 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6626 return true;
6627}
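// Illustrative end-to-end example (hypothetical values): suppose LSR erased
// %p whose pre-LSR SCEV was {%base,+,8} and kept an IV whose SCEV is
// {0,+,1}. The iteration-count builder reduces to the IV itself (start 0,
// stride 1), so the recovery expression for %p becomes, conceptually,
//   DW_OP_LLVM_arg 0, DW_OP_consts 8, DW_OP_mul, DW_OP_LLVM_arg 1, DW_OP_plus
// with the LSR IV as location-op 0 and %base as location-op 1,
// i.e. base + 8 * itercount.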
6628
6629/// Obtain an expression for the iteration count, then attempt to salvage the
6630/// dbg.value intrinsics.
6631 static void DbgRewriteSalvageableDVIs(
6632 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6633 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6634 if (DVIToUpdate.empty())
6635 return;
6636
6637 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6638 assert(SCEVInductionVar &&
6639 "Anticipated a SCEV for the post-LSR induction variable");
6640
6641 if (const SCEVAddRecExpr *IVAddRec =
6642 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6643 if (!IVAddRec->isAffine())
6644 return;
6645
6646 // Prevent translation using excessive resources.
6647 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6648 return;
6649
6650 // The iteration count is required to recover location values.
6651 SCEVDbgValueBuilder IterCountExpr;
6652 IterCountExpr.pushLocation(LSRInductionVar);
6653 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6654 return;
6655
6656 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6657 << '\n');
6658
6659 for (auto &DVIRec : DVIToUpdate) {
6660 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6661 IterCountExpr);
6662 }
6663 }
6664}
6665
6666/// Identify and cache salvageable DVI locations and expressions along with the
6667/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6668 /// caching and salvaging.
6669 static void DbgGatherSalvagableDVI(
6670 Loop *L, ScalarEvolution &SE,
6671 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6672 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6673 for (const auto &B : L->getBlocks()) {
6674 for (auto &I : *B) {
6675 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6676 // Ensure that if any location op is undef the dbg.value is not
6677 // cached.
6678 if (DbgVal->isKillLocation())
6679 return false;
6680
6681 // Check that the location op SCEVs are suitable for translation to
6682 // DIExpression.
6683 const auto &HasTranslatableLocationOps =
6684 [&](const auto *DbgValToTranslate) -> bool {
6685 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6686 if (!LocOp)
6687 return false;
6688
6689 if (!SE.isSCEVable(LocOp->getType()))
6690 return false;
6691
6692 const SCEV *S = SE.getSCEV(LocOp);
6693 if (SE.containsUndefs(S))
6694 return false;
6695 }
6696 return true;
6697 };
6698
6699 if (!HasTranslatableLocationOps(DbgVal))
6700 return false;
6701
6702 std::unique_ptr<DVIRecoveryRec> NewRec =
6703 std::make_unique<DVIRecoveryRec>(DbgVal);
6704 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6705 // it. Pre-allocating a vector will enable quick lookups of the builder
6706 // later during the salvage.
6707 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
6708 for (const auto LocOp : DbgVal->location_ops()) {
6709 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6710 NewRec->LocationOps.push_back(LocOp);
6711 NewRec->HadLocationArgList = DbgVal->hasArgList();
6712 }
6713 SalvageableDVISCEVs.push_back(std::move(NewRec));
6714 return true;
6715 };
6716 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
6717 if (DVR.isDbgValue() || DVR.isDbgAssign())
6718 ProcessDbgValue(&DVR);
6719 }
6720 auto DVI = dyn_cast<DbgValueInst>(&I);
6721 if (!DVI)
6722 continue;
6723 if (ProcessDbgValue(DVI))
6724 DVIHandles.insert(DVI);
6725 }
6726 }
6727}
6728
6729/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6730/// any PHi from the loop header is usable, but may have less chance of
6731/// surviving subsequent transforms.
6733 const LSRInstance &LSR) {
6734
6735 auto IsSuitableIV = [&](PHINode *P) {
6736 if (!SE.isSCEVable(P->getType()))
6737 return false;
6738 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
6739 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
6740 return false;
6741 };
6742
6743 // For now, just pick the first IV that was generated and inserted by
6744 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
6745 // by subsequent transforms.
6746 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
6747 if (!IV)
6748 continue;
6749
6750 // There should only be PHI node IVs.
6751 PHINode *P = cast<PHINode>(&*IV);
6752
6753 if (IsSuitableIV(P))
6754 return P;
6755 }
6756
6757 for (PHINode &P : L.getHeader()->phis()) {
6758 if (IsSuitableIV(&P))
6759 return &P;
6760 }
6761 return nullptr;
6762}
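// Illustrative note: a suitable IV here is a loop-header PHI whose SCEV is an
// affine add recurrence free of undef, e.g. a simple counter with the SCEV
// {0,+,1}<%loop>; non-affine (chained) recurrences are rejected by
// IsSuitableIV above.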
6763
6764static std::optional<std::tuple<PHINode *, PHINode *, const SCEV *, bool>>
6765canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
6766 const LoopInfo &LI, const TargetTransformInfo &TTI) {
6767 if (!L->isInnermost()) {
6768 LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n");
6769 return std::nullopt;
6770 }
6771 // Only inspect on simple loop structure
6772 if (!L->isLoopSimplifyForm()) {
6773 LLVM_DEBUG(dbgs() << "Cannot fold on non-simple loop\n");
6774 return std::nullopt;
6775 }
6776
6778 LLVM_DEBUG(dbgs() << "Cannot fold on backedge that is loop variant\n");
6779 return std::nullopt;
6780 }
6781
6782 BasicBlock *LoopLatch = L->getLoopLatch();
6783 BranchInst *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
6784 if (!BI || BI->isUnconditional())
6785 return std::nullopt;
6786 auto *TermCond = dyn_cast<ICmpInst>(BI->getCondition());
6787 if (!TermCond) {
6788 LLVM_DEBUG(
6789 dbgs() << "Cannot fold on branching condition that is not an ICmpInst");
6790 return std::nullopt;
6791 }
6792 if (!TermCond->hasOneUse()) {
6793 LLVM_DEBUG(
6794 dbgs()
6795 << "Cannot replace terminating condition with more than one use\n");
6796 return std::nullopt;
6797 }
6798
6799 BinaryOperator *LHS = dyn_cast<BinaryOperator>(TermCond->getOperand(0));
6800 Value *RHS = TermCond->getOperand(1);
6801 if (!LHS || !L->isLoopInvariant(RHS))
6802 // We could pattern match the inverse form of the icmp, but that is
6803 // non-canonical, and this pass is running *very* late in the pipeline.
6804 return std::nullopt;
6805
6806 // Find the IV used by the current exit condition.
6807 PHINode *ToFold;
6808 Value *ToFoldStart, *ToFoldStep;
6809 if (!matchSimpleRecurrence(LHS, ToFold, ToFoldStart, ToFoldStep))
6810 return std::nullopt;
6811
6812 // Ensure the simple recurrence is a part of the current loop.
6813 if (ToFold->getParent() != L->getHeader())
6814 return std::nullopt;
6815
6816 // If that IV isn't dead after we rewrite the exit condition in terms of
6817 // another IV, there's no point in doing the transform.
6818 if (!isAlmostDeadIV(ToFold, LoopLatch, TermCond))
6819 return std::nullopt;
6820
6821 // Inserting instructions in the preheader has a runtime cost, scale
6822 // the allowed cost with the loop's trip count as best we can.
6823 const unsigned ExpansionBudget = [&]() {
6824 unsigned Budget = 2 * SCEVCheapExpansionBudget;
6825 if (unsigned SmallTC = SE.getSmallConstantMaxTripCount(L))
6826 return std::min(Budget, SmallTC);
6827 if (std::optional<unsigned> SmallTC = getLoopEstimatedTripCount(L))
6828 return std::min(Budget, *SmallTC);
6829 // Unknown trip count, assume long running by default.
6830 return Budget;
6831 }();
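  // Illustrative example: with SCEVCheapExpansionBudget at its usual default
  // of 4, the budget starts at 8 expansion instructions and is clamped to a
  // known or profile-estimated small trip count, so a loop known to run at
  // most 3 times only tolerates a 3-instruction expansion in the preheader.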
6832
6833 const SCEV *BECount = SE.getBackedgeTakenCount(L);
6834 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
6835 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
6836
6837 PHINode *ToHelpFold = nullptr;
6838 const SCEV *TermValueS = nullptr;
6839 bool MustDropPoison = false;
6840 auto InsertPt = L->getLoopPreheader()->getTerminator();
6841 for (PHINode &PN : L->getHeader()->phis()) {
6842 if (ToFold == &PN)
6843 continue;
6844
6845 if (!SE.isSCEVable(PN.getType())) {
6846 LLVM_DEBUG(dbgs() << "IV of phi '" << PN
6847 << "' is not SCEV-able, not qualified for the "
6848 "terminating condition folding.\n");
6849 continue;
6850 }
6851 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
6852 // Only speculate on affine AddRec
6853 if (!AddRec || !AddRec->isAffine()) {
6854 LLVM_DEBUG(dbgs() << "SCEV of phi '" << PN
6855 << "' is not an affine add recursion, not qualified "
6856 "for the terminating condition folding.\n");
6857 continue;
6858 }
6859
6860 // Check that we can compute the value of AddRec on the exiting iteration
6861 // without soundness problems. evaluateAtIteration internally needs
6862 // to multiply the stride by the iteration number, which may wrap around.
6863 // The issue here is subtle because computing the result accounting for
6864 // wrap is insufficient. In order to use the result in an exit test, we
6865 // must also know that AddRec doesn't take the same value on any previous
6866 // iteration. The simplest case to consider is a candidate IV which is
6867 // narrower than the trip count (and thus original IV), but this can
6868 // also happen due to non-unit strides on the candidate IVs.
6869 if (!AddRec->hasNoSelfWrap() ||
6870 !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
6871 continue;
6872
6873 const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);
6874 const SCEV *TermValueSLocal = PostInc->evaluateAtIteration(BECount, SE);
6875 if (!Expander.isSafeToExpand(TermValueSLocal)) {
6876 LLVM_DEBUG(
6877 dbgs() << "Is not safe to expand terminating value for phi node" << PN
6878 << "\n");
6879 continue;
6880 }
6881
6882 if (Expander.isHighCostExpansion(TermValueSLocal, L, ExpansionBudget,
6883 &TTI, InsertPt)) {
6884 LLVM_DEBUG(
6885 dbgs() << "Is too expensive to expand terminating value for phi node"
6886 << PN << "\n");
6887 continue;
6888 }
6889
6890 // The candidate IV may have been otherwise dead and poison from the
6891 // very first iteration. If we can't disprove that, we can't use the IV.
6892 if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) {
6893 LLVM_DEBUG(dbgs() << "Can not prove poison safety for IV "
6894 << PN << "\n");
6895 continue;
6896 }
6897
6898 // The candidate IV may become poison on the last iteration. If this
6899 // value is not branched on, this is a well defined program. We're
6900 // about to add a new use to this IV, and we have to ensure we don't
6901 // insert UB which didn't previously exist.
6902 bool MustDropPoisonLocal = false;
6903 Instruction *PostIncV =
6904 cast<Instruction>(PN.getIncomingValueForBlock(LoopLatch));
6905 if (!mustExecuteUBIfPoisonOnPathTo(PostIncV, LoopLatch->getTerminator(),
6906 &DT)) {
6907 LLVM_DEBUG(dbgs() << "Can not prove poison safety to insert use"
6908 << PN << "\n");
6909
6910 // If this is a complex recurrence with multiple instructions computing
6911 // the backedge value, we might need to strip poison flags from all of
6912 // them.
6913 if (PostIncV->getOperand(0) != &PN)
6914 continue;
6915
6916 // In order to perform the transform, we need to drop the poison generating
6917 // flags on this instruction (if any).
6918 MustDropPoisonLocal = PostIncV->hasPoisonGeneratingFlags();
6919 }
6920
6921 // We pick the last legal alternate IV. We could explore choosing an optimal
6922 // alternate IV if we had a decent heuristic to do so.
6923 ToHelpFold = &PN;
6924 TermValueS = TermValueSLocal;
6925 MustDropPoison = MustDropPoisonLocal;
6926 }
6927
6928 LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs()
6929 << "Cannot find other AddRec IV to help folding\n";);
6930
6931 LLVM_DEBUG(if (ToFold && ToHelpFold) dbgs()
6932 << "\nFound loop that can fold terminating condition\n"
6933 << " BECount (SCEV): " << *SE.getBackedgeTakenCount(L) << "\n"
6934 << " TermCond: " << *TermCond << "\n"
6935 << " BrandInst: " << *BI << "\n"
6936 << " ToFold: " << *ToFold << "\n"
6937 << " ToHelpFold: " << *ToHelpFold << "\n");
6938
6939 if (!ToFold || !ToHelpFold)
6940 return std::nullopt;
6941 return std::make_tuple(ToFold, ToHelpFold, TermValueS, MustDropPoison);
6942}
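// Illustrative example (hypothetical loop): in
//   for (i = 0, p = a; i != n; ++i, ++p) *p = 0;
// the counter i may feed only the exit compare. canFoldTermCondOfLoop would
// then return ToFold = i, ToHelpFold = p, and TermValueS describing the value
// p holds after its final increment (conceptually a + n), allowing the caller
// to rewrite the latch branch to compare p against that value and let i die.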
6943
6944 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6945 DominatorTree &DT, LoopInfo &LI,
6946 const TargetTransformInfo &TTI,
6947 AssumptionCache &AC, TargetLibraryInfo &TLI,
6948 MemorySSA *MSSA) {
6949
6950 // Debug preservation - before we start removing anything identify which DVIs
6951 // meet the salvageable criteria and store their DIExpression and SCEVs.
6952 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
6953 SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
6954 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
6955
6956 bool Changed = false;
6957 std::unique_ptr<MemorySSAUpdater> MSSAU;
6958 if (MSSA)
6959 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
6960
6961 // Run the main LSR transformation.
6962 const LSRInstance &Reducer =
6963 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
6964 Changed |= Reducer.getChanged();
6965
6966 // Remove any extra phis created by processing inner loops.
6967 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
6968 if (EnablePhiElim && L->isLoopSimplifyForm()) {
6969 SmallVector<WeakTrackingVH, 16> DeadInsts;
6970 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
6971 SCEVExpander Rewriter(SE, DL, "lsr", false);
6972#ifndef NDEBUG
6973 Rewriter.setDebugType(DEBUG_TYPE);
6974#endif
6975 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
6976 Rewriter.clear();
6977 if (numFolded) {
6978 Changed = true;
6979 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
6980 MSSAU.get());
6981 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
6982 }
6983 }
6984 // LSR may at times remove all uses of an induction variable from a loop.
6985 // The only remaining use is the PHI in the exit block.
6986 // When this is the case, if the exit value of the IV can be calculated using
6987 // SCEV, we can replace the exit block PHI with the final value of the IV and
6988 // skip the updates in each loop iteration.
6989 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
6990 SmallVector<WeakTrackingVH, 16> DeadInsts;
6991 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
6992 SCEVExpander Rewriter(SE, DL, "lsr", true);
6993 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
6994 UnusedIndVarInLoop, DeadInsts);
6995 Rewriter.clear();
6996 if (Rewrites) {
6997 Changed = true;
6998 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
6999 MSSAU.get());
7000 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7001 }
7002 }
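  // Illustrative example: if the only remaining use of a counter with SCEV
  // {0,+,1} is an LCSSA PHI in the exit block, rewriteLoopExitValues can
  // replace that PHI with the IV's final value (here the trip count) computed
  // outside the loop, after which the in-loop PHI and increment become dead
  // and are cleaned up by the calls above.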
7003
7004 const bool EnableFormTerm = [&] {
7005 switch (AllowTerminatingConditionFoldingAfterLSR) {
7006 case cl::BOU_TRUE:
7007 return true;
7008 case cl::BOU_FALSE:
7009 return false;
7010 case cl::BOU_UNSET:
7011 return TTI.shouldFoldTerminatingConditionAfterLSR();
7012 }
7013 llvm_unreachable("Unhandled cl::boolOrDefault enum");
7014 }();
7015
7016 if (EnableFormTerm) {
7017 if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI, TTI)) {
7018 auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
7019
7020 Changed = true;
7021 NumTermFold++;
7022
7023 BasicBlock *LoopPreheader = L->getLoopPreheader();
7024 BasicBlock *LoopLatch = L->getLoopLatch();
7025
7026 (void)ToFold;
7027 LLVM_DEBUG(dbgs() << "To fold phi-node:\n"
7028 << *ToFold << "\n"
7029 << "New term-cond phi-node:\n"
7030 << *ToHelpFold << "\n");
7031
7032 Value *StartValue = ToHelpFold->getIncomingValueForBlock(LoopPreheader);
7033 (void)StartValue;
7034 Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch);
7035
7036 // See comment in canFoldTermCondOfLoop on why this is sufficient.
7037 if (MustDrop)
7038 cast<Instruction>(LoopValue)->dropPoisonGeneratingFlags();
7039
7040 // SCEVExpander for use in both the preheader and the latch
7041 const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
7042 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7043
7044 assert(Expander.isSafeToExpand(TermValueS) &&
7045 "Terminating value was checked safe in canFoldTerminatingCondition");
7046
7047 // Create new terminating value at loop preheader
7048 Value *TermValue = Expander.expandCodeFor(TermValueS, ToHelpFold->getType(),
7049 LoopPreheader->getTerminator());
7050
7051 LLVM_DEBUG(dbgs() << "Start value of new term-cond phi-node:\n"
7052 << *StartValue << "\n"
7053 << "Terminating value of new term-cond phi-node:\n"
7054 << *TermValue << "\n");
7055
7056 // Create new terminating condition at loop latch
7057 BranchInst *BI = cast<BranchInst>(LoopLatch->getTerminator());
7058 ICmpInst *OldTermCond = cast<ICmpInst>(BI->getCondition());
7059 IRBuilder<> LatchBuilder(LoopLatch->getTerminator());
7060 Value *NewTermCond =
7061 LatchBuilder.CreateICmp(CmpInst::ICMP_EQ, LoopValue, TermValue,
7062 "lsr_fold_term_cond.replaced_term_cond");
7063 // Swap successors to exit the loop body if the IV equals the new TermValue
7064 if (BI->getSuccessor(0) == L->getHeader())
7065 BI->swapSuccessors();
7066
7067 LLVM_DEBUG(dbgs() << "Old term-cond:\n"
7068 << *OldTermCond << "\n"
7069 << "New term-cond:\n" << *NewTermCond << "\n");
7070
7071 BI->setCondition(NewTermCond);
7072
7073 Expander.clear();
7074 OldTermCond->eraseFromParent();
7075 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7076 }
7077 }
7078
7079 if (SalvageableDVIRecords.empty())
7080 return Changed;
7081
7082 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7083 // expressions composed using the derived iteration count.
7084 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7085 for (const auto &L : LI) {
7086 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7087 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7088 else {
7089 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7090 "could not be identified.\n");
7091 }
7092 }
7093
7094 for (auto &Rec : SalvageableDVIRecords)
7095 Rec->clear();
7096 SalvageableDVIRecords.clear();
7097 DVIHandles.clear();
7098 return Changed;
7099}
7100
7101bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7102 if (skipLoop(L))
7103 return false;
7104
7105 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7106 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7107 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7108 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7109 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7110 *L->getHeader()->getParent());
7111 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7112 *L->getHeader()->getParent());
7113 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7114 *L->getHeader()->getParent());
7115 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7116 MemorySSA *MSSA = nullptr;
7117 if (MSSAAnalysis)
7118 MSSA = &MSSAAnalysis->getMSSA();
7119 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7120}
7121
7122 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7123 LoopStandardAnalysisResults &AR,
7124 LPMUpdater &) {
7125 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7126 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7127 return PreservedAnalyses::all();
7128
7129 auto PA = getLoopPassPreservedAnalyses();
7130 if (AR.MSSA)
7131 PA.preserve<MemorySSAAnalysis>();
7132 return PA;
7133}
7134
7135char LoopStrengthReduce::ID = 0;
7136
7137INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7138 "Loop Strength Reduction", false, false)
7139INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7140INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7141INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7142INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
7143INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7144INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7145INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7146 "Loop Strength Reduction", false, false)
7147
7148Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:148
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1291
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
iv Induction Variable Users
Definition: IVUsers.cpp:48
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static std::optional< std::tuple< PHINode *, PHINode *, const SCEV *, bool > > canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, const LoopInfo &LI, const TargetTransformInfo &TTI)
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static cl::opt< cl::boolOrDefault > AllowTerminatingConditionFoldingAfterLSR("lsr-term-fold", cl::Hidden, cl::desc("Attempt to replace primary IV with other IV."))
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static cl::opt< bool > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::init(false), cl::desc("Attempt to drop solution if it is less profitable"))
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg)
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs, SmallSet< AssertingVH< DbgValueInst >, 2 > &DVIHandles)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(VerifyEach)
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
This defines the Use class.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:237
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class recording the (high level) value of a variable.
Class for arbitrary precision integers.
Definition: APInt.h:76
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1498
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1614
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1489
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1706
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:283
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Value handle that asserts if the Value is deleted.
Definition: ValueHandle.h:264
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:499
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:358
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:672
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore)
Construct a binary instruction, given the opcode and the two operands.
BinaryOps getOpcode() const
Definition: InstrTypes.h:513
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
void swapSuccessors()
Swap the successors of this branch instruction.
BasicBlock * getSuccessor(unsigned i) const
bool isUnconditional() const
Value * getCondition() const
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name, BasicBlock::iterator InsertBefore)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1588
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:123
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:160
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
This is an important base class in LLVM.
Definition: Constant.h:41
static DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
This represents the llvm.dbg.value instruction.
Record of a variable value-assignment, aka a non instruction representation of the dbg....
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
NodeT * getBlock() const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
This instruction compares its operands according to the predicate given to the constructor.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2351
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:34
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:367
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:53
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:47
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:183
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:141
bool empty() const
Definition: IVUsers.h:146
void print(raw_ostream &OS) const
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:83
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:812
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
bool hasPoisonGeneratingFlags() const LLVM_READONLY
Return true if this operator has flags which may cause this instruction to evaluate to poison despite...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:184
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:593
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:985
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, BasicBlock::iterator InsertBefore)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
void setIncomingValue(unsigned i, Value *V)
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PointerIntPair - This class implements a pair of a pointer and small integer.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
const SCEVAddRecExpr * getPostIncExpr(ScalarEvolution &SE) const
Return an expression representing the value of this expression one iteration of the loop ahead.
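A hedged usage sketch (stepIfAffine is a hypothetical helper; AR and SE are assumed to come from an existing analysis) showing how an affine recurrence is usually queried:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;
  // Sketch: for an affine recurrence {Start,+,Step}<L>, extract the step and
  // the post-increment form {Start+Step,+,Step}<L>.
  static const SCEV *stepIfAffine(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
    if (!AR->isAffine())
      return nullptr;                                        // only A + B*x recurrences
    const SCEVAddRecExpr *PostInc = AR->getPostIncExpr(SE);  // value after the add
    (void)PostInc;
    return AR->getStepRecurrence(SE);                        // the per-iteration stride
  }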
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
bool isSafeToExpand(const SCEV *S) const
Return true if the given expression is safe to expand in the sense that all materialized values are s...
bool isHighCostExpansion(ArrayRef< const SCEV * > Exprs, Loop *L, unsigned Budget, const TargetTransformInfo *TTI, const Instruction *At)
Return true for expressions that can't be evaluated at runtime within the given Budget.
void clear()
Erase the contents of the InsertedExpressions map so that users trying to expand the same expression ...
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
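A hedged sketch (tryExpand, the budget of 4, and the parameter plumbing are all placeholders) of how expansion is normally guarded by the safety and cost queries above:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  using namespace llvm;
  // Sketch: materialize S as IR of type Ty immediately before IP, but only when
  // it is safe to expand and not prohibitively expensive.
  static Value *tryExpand(const SCEV *S, Type *Ty, Instruction *IP, Loop *L,
                          ScalarEvolution &SE, const DataLayout &DL,
                          const TargetTransformInfo *TTI) {
    SCEVExpander Rewriter(SE, DL, "sketch");
    if (!Rewriter.isSafeToExpand(S))
      return nullptr;
    if (Rewriter.isHighCostExpansion({S}, L, /*Budget=*/4, TTI, IP))
      return nullptr;
    return Rewriter.expandCodeFor(S, Ty, IP->getIterator());
  }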
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUnknown(Value *V)
std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
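As a hedged sketch (classifyValue is a hypothetical helper), the ScalarEvolution queries above compose like this when examining a value relative to a loop:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;
  // Sketch: classify V with respect to loop L using ScalarEvolution.
  static void classifyValue(Value *V, const Loop *L, ScalarEvolution &SE) {
    if (!SE.isSCEVable(V->getType()))
      return;                                    // SCEV cannot model this type
    const SCEV *S = SE.getSCEV(V);
    bool Invariant = SE.isLoopInvariant(S, L);
    bool Evolves = SE.hasComputableLoopEvolution(S, L);
    const SCEV *BTC = SE.getBackedgeTakenCount(L); // SCEVCouldNotCompute if unknown
    (void)Invariant; (void)Evolves; (void)BTC;
  }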
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
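A brief, hedged example (countUnique is hypothetical) of the set-with-order semantics described above:

  #include "llvm/ADT/SetVector.h"
  // Sketch: SetVector rejects duplicates but preserves insertion order.
  static unsigned countUnique() {
    llvm::SetVector<int> SV;
    SV.insert(3);
    SV.insert(1);
    SV.insert(3);          // duplicate; insert() returns false
    unsigned N = 0;
    for (int X : SV) {     // visits 3 then 1, in insertion order
      (void)X;
      ++N;
    }
    return N;              // 2
  }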
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
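A short, hedged example (sumSetBitIndices is hypothetical) of walking the set bits with find_first()/find_next(), as the entries above describe:

  #include "llvm/ADT/SmallBitVector.h"
  // Sketch: visit every set bit of a small bitvector.
  static unsigned sumSetBitIndices() {
    llvm::SmallBitVector BV;
    BV.resize(16);
    BV.set(2);
    BV.set(5);
    unsigned Sum = 0;
    for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
      Sum += I;
    // Equivalent: for (unsigned I : BV.set_bits()) Sum += I;
    return Sum;            // 7
  }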
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
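A hedged one-liner (visitOnce is hypothetical) showing the insert() idiom for visit-once traversal:

  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;
  // Sketch: insert() reports whether the pointer was newly added, which makes
  // SmallPtrSet a natural "visited" set for worklist algorithms.
  static bool visitOnce(SmallPtrSetImpl<Instruction *> &Visited, Instruction *I) {
    return Visited.insert(I).second;   // true only the first time I is seen
  }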
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
void clear()
Definition: SmallSet.h:218
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getFixed() const
Returns the fixed component of the stack offset.
Definition: TypeSize.h:49
An instruction for storing to memory.
Definition: Instructions.h:317
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
bool shouldFoldTerminatingConditionAfterLSR() const
Return true if LSR should attempt to replace a use of an otherwise dead primary IV in the latch cond...
bool isProfitableLSRChainElement(Instruction *I) const
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isNumRegsMajorCostOfLSR() const
Return true if LSR's major cost is the number of registers.
@ MIM_PostInc
Post-incrementing.
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
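As a hedged illustration (isScaledFormLegal, the offset 16, and the scale 4 are arbitrary placeholders), isLegalAddressingMode is usually asked about one concrete base/scale/offset combination for a given access type:

  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;
  // Sketch: is "BaseReg + 4*IndexReg + 16" a legal addressing mode for
  // loads/stores of AccessTy in address space 0 on this target?
  static bool isScaledFormLegal(const TargetTransformInfo &TTI, Type *AccessTy) {
    return TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr,
                                     /*BaseOffset=*/16, /*HasBaseReg=*/true,
                                     /*Scale=*/4, /*AddrSpace=*/0);
  }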
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static IntegerType * getInt8Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
This class represents a cast from unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
op_iterator op_end()
Definition: User.h:236
static ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:495
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
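A minimal, hedged sketch (replaceIfSameType is hypothetical) of the replaceAllUsesWith contract noted above:

  #include "llvm/IR/Value.h"
  using namespace llvm;
  // Sketch: redirect every use of From to To; RAUW requires matching types.
  static void replaceIfSameType(Value *From, Value *To) {
    if (From != To && From->getType() == To->getType())
      From->replaceAllUsesWith(To);
  }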
A Value handle that may be null.
Definition: ValueHandle.h:144
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:109
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Key
PAL metadata keys.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Reg
All possible values of the reg field in the ModR/M byte.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:146
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:142
constexpr double e
Definition: MathExtras.h:31
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root, Instruction *OnPathTo, DominatorTree *DT)
Return true if undefined behavior would provably be executed on the path to OnPathTo if Root produced...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:456
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
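A hedged example (allPositive is hypothetical) of the range wrappers such as all_of described above:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  // Sketch: the range form avoids spelling out begin()/end() by hand.
  static bool allPositive(const llvm::SmallVectorImpl<int> &Vals) {
    return llvm::all_of(Vals, [](int V) { return V > 0; });
  }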
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1381
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2050
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
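A hedged sketch (getRecurrenceParts is hypothetical) of how the matcher above is typically called on an induction PHI:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;
  // Sketch: decompose iv = phi [Start, preheader], [iv op Step, latch] into its
  // start and step values; returns false if IV is not such a recurrence.
  static bool getRecurrenceParts(const PHINode *IV, Value *&Start, Value *&Step) {
    BinaryOperator *BO = nullptr;
    return matchSimpleRecurrence(IV, BO, Start, Step);
  }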
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< unsigned > SCEVCheapExpansionBudget
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
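A hedged sketch (roundTrip is hypothetical) of pairing the normalize/denormalize helpers above for a single loop:

  #include "llvm/Analysis/ScalarEvolutionNormalization.h"
  using namespace llvm;
  // Sketch: translate S into its post-increment form for loop L and back.
  // The guard covers the case where normalization is not invertible and the
  // helper yields no result.
  static const SCEV *roundTrip(const SCEV *S, const Loop *L, ScalarEvolution &SE) {
    PostIncLoopSet Loops;
    Loops.insert(L);
    const SCEV *Normalized = normalizeForPostIncUse(S, Loops, SE);
    return Normalized ? denormalizeForPostIncUse(Normalized, Loops, SE) : nullptr;
  }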
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
Pass * createLoopStrengthReducePass()
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:555
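A hedged sketch (cleanUp is hypothetical) of the usual dead-instruction cleanup pattern built on the helper above:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/ValueHandle.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;
  // Sketch: DeadInsts holds WeakTrackingVH entries queued during rewriting;
  // the helper deletes whichever of them are actually trivially dead.
  static bool cleanUp(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
    return RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  }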
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of the innermost containing loop.
Definition: LCSSA.cpp:77
void initializeLoopStrengthReducePass(PassRegistry &)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isAlmostDeadIV(PHINode *IV, BasicBlock *LatchBlock, Value *Cond)
Return true if the induction variable IV in a Loop whose latch is LatchBlock would become dead if the...
Definition: LoopUtils.cpp:469
int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1404
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
@ UnusedIndVarInLoop
Definition: LoopUtils.h:456
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Option class for critical edge splitting.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.