LLVM 23.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
237/// This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  /// Print a summary of this entry (the number of uses) to \p OS.
  void print(raw_ostream &OS) const;
  /// Debug convenience: print to errs() followed by a newline.
  void dump() const;
};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
  // Constructors are private: clients must use the named factories below so
  // the fixed-vs-scalable distinction is always explicit at the call site.
  constexpr Immediate(ScalarTy MinVal, bool Scalable)
      : FixedOrScalableQuantity(MinVal, Scalable) {}

  constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
      : FixedOrScalableQuantity(V) {}

public:
  constexpr Immediate() = delete;

  /// A plain fixed integer offset.
  static constexpr Immediate getFixed(ScalarTy MinVal) {
    return {MinVal, false};
  }
  /// An offset implicitly multiplied by the runtime vscale.
  static constexpr Immediate getScalable(ScalarTy MinVal) {
    return {MinVal, true};
  }
  static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
    return {MinVal, Scalable};
  }
  static constexpr Immediate getZero() { return {0, false}; }
  static constexpr Immediate getFixedMin() {
    return {std::numeric_limits<int64_t>::min(), false};
  }
  static constexpr Immediate getFixedMax() {
    return {std::numeric_limits<int64_t>::max(), false};
  }
  static constexpr Immediate getScalableMin() {
    return {std::numeric_limits<int64_t>::min(), true};
  }
  static constexpr Immediate getScalableMax() {
    return {std::numeric_limits<int64_t>::max(), true};
  }

  constexpr bool isLessThanZero() const { return Quantity < 0; }

  constexpr bool isGreaterThanZero() const { return Quantity > 0; }

  /// Two immediates may be combined when either is zero or both have the
  /// same scalability.
  constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
    return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
  }

  constexpr bool isMin() const {
    return Quantity == std::numeric_limits<ScalarTy>::min();
  }

  constexpr bool isMax() const {
    return Quantity == std::numeric_limits<ScalarTy>::max();
  }

  // Arithmetic 'operators' that cast to unsigned types first.
  // (Unsigned arithmetic sidesteps signed-overflow UB; the result is
  // scalable if either operand is.)
  constexpr Immediate addUnsigned(const Immediate &RHS) const {
    assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
    ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
    return {Value, Scalable || RHS.isScalable()};
  }

  constexpr Immediate subUnsigned(const Immediate &RHS) const {
    assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
    ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
    return {Value, Scalable || RHS.isScalable()};
  }

  // Scale the quantity by a constant without caring about runtime scalability.
  constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
    ScalarTy Value = (uint64_t)Quantity * RHS;
    return {Value, Scalable};
  }

  // Helpers for generating SCEVs with vscale terms where needed.
  // Each builds a constant of type \p Ty, multiplied by vscale when scalable.
  const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
    const SCEV *S = SE.getConstant(Ty, Quantity);
    if (Scalable)
      S = SE.getMulExpr(S, SE.getVScale(S->getType()));
    return S;
  }

  /// As getSCEV, but for the negated quantity.
  const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
    const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
    if (Scalable)
      NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
    return NegS;
  }

  /// Wrap the quantity as an opaque SCEVUnknown (times vscale if scalable).
  const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
    // TODO: Avoid implicit trunc?
    // See https://github.com/llvm/llvm-project/issues/112510.
    const SCEV *SU = SE.getUnknown(
        ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
    if (Scalable)
      SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
    return SU;
  }
};
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing
474/// satisfying a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty. The
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing recurrent expr related with currect loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch.
543static void DoInitialMatch(const SCEV *S, Loop *L,
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
601 DoInitialMatch(S, L, Good, Bad, SE);
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
614 canonicalize(*L);
615}
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
633 if (Scale != 1)
634 return true;
635
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
643 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
644 // loop, we want to swap the reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
656void Formula::canonicalize(const Loop &L) {
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
void Formula::deleteBaseReg(const SCEV *&S) {
  // S is a reference to an element of BaseRegs itself; swap the doomed
  // element to the back (unless it already is the back) and pop it.
  // Note this reorders BaseRegs.
  if (&S != &BaseRegs.back())
    std::swap(S, BaseRegs.back());
  BaseRegs.pop_back();
}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
761void Formula::print(raw_ostream &OS) const {
762 ListSeparator Plus(" + ");
763 if (BaseGV) {
764 OS << Plus;
765 BaseGV->printAsOperand(OS, /*PrintType=*/false);
766 }
767 if (BaseOffset.isNonZero())
768 OS << Plus << BaseOffset;
769
770 for (const SCEV *BaseReg : BaseRegs)
771 OS << Plus << "reg(" << *BaseReg << ')';
772
773 if (HasBaseReg && BaseRegs.empty())
774 OS << Plus << "**error: HasBaseReg**";
775 else if (!HasBaseReg && !BaseRegs.empty())
776 OS << Plus << "**error: !HasBaseReg**";
777
778 if (Scale != 0) {
779 OS << Plus << Scale << "*reg(";
780 if (ScaledReg)
781 OS << *ScaledReg;
782 else
783 OS << "<unknown>";
784 OS << ')';
785 }
786 if (UnfoldedOffset.isNonZero())
787 OS << Plus << "imm(" << UnfoldedOffset << ')';
788}
789
790LLVM_DUMP_METHOD void Formula::dump() const {
791 print(errs()); errs() << '\n';
792}
793#endif
794
795/// Return true if the given addrec can be sign-extended without changing its
796/// value.
798 Type *WideTy =
800 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
801}
802
803/// Return true if the given add can be sign-extended without changing its
804/// value.
805static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
808 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
809}
810
811/// Return true if the given mul can be sign-extended without changing its
812/// value.
813static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
814 Type *WideTy =
816 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
817 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
818}
819
820/// Return an expression for LHS /s RHS, if it can be determined and if the
821/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
822/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
823/// the multiplication may overflow, which is useful when the result will be
824/// used in a context where the most significant bits are ignored.
825static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
826 ScalarEvolution &SE,
827 bool IgnoreSignificantBits = false) {
828 // Handle the trivial case, which works for any SCEV type.
829 if (LHS == RHS)
830 return SE.getConstant(LHS->getType(), 1);
831
832 // Handle a few RHS special cases.
834 if (RC) {
835 const APInt &RA = RC->getAPInt();
836 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
837 // some folding.
838 if (RA.isAllOnes()) {
839 if (LHS->getType()->isPointerTy())
840 return nullptr;
841 return SE.getMulExpr(LHS, RC);
842 }
843 // Handle x /s 1 as x.
844 if (RA == 1)
845 return LHS;
846 }
847
848 // Check for a division of a constant by a constant.
850 if (!RC)
851 return nullptr;
852 const APInt &LA = C->getAPInt();
853 const APInt &RA = RC->getAPInt();
854 if (LA.srem(RA) != 0)
855 return nullptr;
856 return SE.getConstant(LA.sdiv(RA));
857 }
858
859 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
861 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
862 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
863 IgnoreSignificantBits);
864 if (!Step) return nullptr;
865 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
866 IgnoreSignificantBits);
867 if (!Start) return nullptr;
868 // FlagNW is independent of the start value, step direction, and is
869 // preserved with smaller magnitude steps.
870 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
871 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
872 }
873 return nullptr;
874 }
875
876 // Distribute the sdiv over add operands, if the add doesn't overflow.
878 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
880 for (const SCEV *S : Add->operands()) {
881 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
882 if (!Op) return nullptr;
883 Ops.push_back(Op);
884 }
885 return SE.getAddExpr(Ops);
886 }
887 return nullptr;
888 }
889
890 // Check for a multiply operand that we can pull RHS out of.
892 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
893 // Handle special case C1*X*Y /s C2*X*Y.
894 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
895 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
896 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
897 const SCEVConstant *RC =
898 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
899 if (LC && RC) {
901 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
902 if (LOps == ROps)
903 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
904 }
905 }
906 }
907
909 bool Found = false;
910 for (const SCEV *S : Mul->operands()) {
911 if (!Found)
912 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
913 IgnoreSignificantBits)) {
914 S = Q;
915 Found = true;
916 }
917 Ops.push_back(S);
918 }
919 return Found ? SE.getMulExpr(Ops) : nullptr;
920 }
921 return nullptr;
922 }
923
924 // Otherwise we don't know.
925 return nullptr;
926}
927
928/// If S involves the addition of a constant integer value, return that integer
929/// value, and mutate S to point to a new SCEV with that value excluded.
930static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE) {
931 const APInt *C;
932 if (match(S, m_scev_APInt(C))) {
933 if (C->getSignificantBits() <= 64) {
934 S = SE.getConstant(S->getType(), 0);
935 return Immediate::getFixed(C->getSExtValue());
936 }
937 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
938 SmallVector<SCEVUse, 8> NewOps(Add->operands());
939 Immediate Result = ExtractImmediate(NewOps.front(), SE);
940 if (Result.isNonZero())
941 S = SE.getAddExpr(NewOps);
942 return Result;
943 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
944 SmallVector<SCEVUse, 8> NewOps(AR->operands());
945 Immediate Result = ExtractImmediate(NewOps.front(), SE);
946 if (Result.isNonZero())
947 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
948 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
950 return Result;
951 } else if (EnableVScaleImmediates &&
953 S = SE.getConstant(S->getType(), 0);
954 return Immediate::getScalable(C->getSExtValue());
955 }
956 return Immediate::getZero();
957}
958
959/// If S involves the addition of a GlobalValue address, return that symbol, and
960/// mutate S to point to a new SCEV with that value excluded.
962 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
963 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
964 S = SE.getConstant(GV->getType(), 0);
965 return GV;
966 }
967 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
968 SmallVector<SCEVUse, 8> NewOps(Add->operands());
969 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
970 if (Result)
971 S = SE.getAddExpr(NewOps);
972 return Result;
973 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
974 SmallVector<SCEVUse, 8> NewOps(AR->operands());
975 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
976 if (Result)
977 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
978 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
980 return Result;
981 }
982 return nullptr;
983}
984
985/// Returns true if the specified instruction is using the specified value as an
986/// address.
988 Instruction *Inst, Value *OperandVal) {
989 bool isAddress = isa<LoadInst>(Inst);
990 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
991 if (SI->getPointerOperand() == OperandVal)
992 isAddress = true;
993 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
994 // Addressing modes can also be folded into prefetches and a variety
995 // of intrinsics.
996 switch (II->getIntrinsicID()) {
997 case Intrinsic::memset:
998 case Intrinsic::prefetch:
999 case Intrinsic::masked_load:
1000 if (II->getArgOperand(0) == OperandVal)
1001 isAddress = true;
1002 break;
1003 case Intrinsic::masked_store:
1004 if (II->getArgOperand(1) == OperandVal)
1005 isAddress = true;
1006 break;
1007 case Intrinsic::memmove:
1008 case Intrinsic::memcpy:
1009 if (II->getArgOperand(0) == OperandVal ||
1010 II->getArgOperand(1) == OperandVal)
1011 isAddress = true;
1012 break;
1013 default: {
1014 MemIntrinsicInfo IntrInfo;
1015 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1016 if (IntrInfo.PtrVal == OperandVal)
1017 isAddress = true;
1018 }
1019 }
1020 }
1021 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1022 if (RMW->getPointerOperand() == OperandVal)
1023 isAddress = true;
1024 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1025 if (CmpX->getPointerOperand() == OperandVal)
1026 isAddress = true;
1027 }
1028 return isAddress;
1029}
1030
/// Return the type of the memory being accessed.
static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
                                 Instruction *Inst, Value *OperandVal) {
  // Start from an unknown access type in the default address space.
  MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());

  // First get the type of memory being accessed.
  if (Type *Ty = Inst->getAccessType())
    AccessTy.MemTy = Ty;

  // Then get the pointer address space.
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    AccessTy.AddrSpace = SI->getPointerAddressSpace();
  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
    AccessTy.AddrSpace = LI->getPointerAddressSpace();
  } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
    AccessTy.AddrSpace = RMW->getPointerAddressSpace();
  } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // For intrinsics, the pointer operand position varies per intrinsic;
    // these indices mirror the cases handled in isAddressUse above.
    switch (II->getIntrinsicID()) {
    case Intrinsic::prefetch:
    case Intrinsic::memset:
      // Pointer is the first argument; the operand being replaced gives the
      // value type.
      AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
      AccessTy.MemTy = OperandVal->getType();
      break;
    case Intrinsic::memmove:
    case Intrinsic::memcpy:
      // OperandVal may be either the source or destination pointer here.
      AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
      AccessTy.MemTy = OperandVal->getType();
      break;
    case Intrinsic::masked_load:
      AccessTy.AddrSpace =
          II->getArgOperand(0)->getType()->getPointerAddressSpace();
      break;
    case Intrinsic::masked_store:
      // masked.store's pointer is the second argument.
      AccessTy.AddrSpace =
          II->getArgOperand(1)->getType()->getPointerAddressSpace();
      break;
    default: {
      // Fall back to target knowledge of the intrinsic's pointer operand.
      MemIntrinsicInfo IntrInfo;
      if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
        AccessTy.AddrSpace
          = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
      }

      break;
    }
    }
  }

  return AccessTy;
}
1083
1084/// Return true if this AddRec is already a phi in its loop.
1085static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1086 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1087 if (SE.isSCEVable(PN.getType()) &&
1088 (SE.getEffectiveSCEVType(PN.getType()) ==
1089 SE.getEffectiveSCEVType(AR->getType())) &&
1090 SE.getSCEV(&PN) == AR)
1091 return true;
1092 }
1093 return false;
1094}
1095
1096/// Check if expanding this expression is likely to incur significant cost. This
1097/// is tricky because SCEV doesn't track which expressions are actually computed
1098/// by the current IR.
1099///
1100/// We currently allow expansion of IV increments that involve adds,
1101/// multiplication by constants, and AddRecs from existing phis.
1102///
1103/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1104/// obvious multiple of the UDivExpr.
1105static bool isHighCostExpansion(const SCEV *S,
1107 ScalarEvolution &SE) {
1108 // Zero/One operand expressions
1109 switch (S->getSCEVType()) {
1110 case scUnknown:
1111 case scConstant:
1112 case scVScale:
1113 return false;
1114 case scTruncate:
1115 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1116 Processed, SE);
1117 case scZeroExtend:
1118 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1119 Processed, SE);
1120 case scSignExtend:
1121 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1122 Processed, SE);
1123 default:
1124 break;
1125 }
1126
1127 if (!Processed.insert(S).second)
1128 return false;
1129
1130 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1131 for (const SCEV *S : Add->operands()) {
1132 if (isHighCostExpansion(S, Processed, SE))
1133 return true;
1134 }
1135 return false;
1136 }
1137
1138 const SCEV *Op0, *Op1;
1139 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1140 // Multiplication by a constant is ok
1141 if (isa<SCEVConstant>(Op0))
1142 return isHighCostExpansion(Op1, Processed, SE);
1143
1144 // If we have the value of one operand, check if an existing
1145 // multiplication already generates this expression.
1146 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1147 Value *UVal = U->getValue();
1148 for (User *UR : UVal->users()) {
1149 // If U is a constant, it may be used by a ConstantExpr.
1151 if (UI && UI->getOpcode() == Instruction::Mul &&
1152 SE.isSCEVable(UI->getType())) {
1153 return SE.getSCEV(UI) == S;
1154 }
1155 }
1156 }
1157 }
1158
1159 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1160 if (isExistingPhi(AR, SE))
1161 return false;
1162 }
1163
1164 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1165 return true;
1166}
1167
1168namespace {
1169
1170class LSRUse;
1171
1172} // end anonymous namespace
1173
1174/// Check if the addressing mode defined by \p F is completely
1175/// folded in \p LU at isel time.
1176/// This includes address-mode folding and special icmp tricks.
1177/// This function returns true if \p LU can accommodate what \p F
1178/// defines and up to 1 base + 1 scaled + offset.
1179/// In other words, if \p F has several base registers, this function may
1180/// still return true. Therefore, users still need to account for
1181/// additional base registers and/or unfolded offsets to derive an
1182/// accurate cost model.
1183static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1184 const LSRUse &LU, const Formula &F);
1185
1186// Get the cost of the scaling factor used in F for LU.
1187static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1188 const LSRUse &LU, const Formula &F,
1189 const Loop &L);
1190
1191namespace {
1192
1193/// This class is used to measure and compare candidate formulae.
1194class Cost {
1195 const Loop *L = nullptr;
1196 ScalarEvolution *SE = nullptr;
1197 const TargetTransformInfo *TTI = nullptr;
1198 TargetTransformInfo::LSRCost C;
1200
1201public:
1202 Cost() = delete;
1203 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1205 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1206 C.Insns = 0;
1207 C.NumRegs = 0;
1208 C.AddRecCost = 0;
1209 C.NumIVMuls = 0;
1210 C.NumBaseAdds = 0;
1211 C.ImmCost = 0;
1212 C.SetupCost = 0;
1213 C.ScaleCost = 0;
1214 }
1215
1216 bool isLess(const Cost &Other) const;
1217
1218 void Lose();
1219
1220#ifndef NDEBUG
1221 // Once any of the metrics loses, they must all remain losers.
1222 bool isValid() {
1223 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1224 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1225 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1226 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1227 }
1228#endif
1229
1230 bool isLoser() {
1231 assert(isValid() && "invalid cost");
1232 return C.NumRegs == ~0u;
1233 }
1234
1235 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1236 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1237 bool HardwareLoopProfitable,
1238 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1239
1240 void print(raw_ostream &OS) const;
1241 void dump() const;
1242
1243private:
1244 void RateRegister(const Formula &F, const SCEV *Reg,
1245 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1246 bool HardwareLoopProfitable);
1247 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1248 SmallPtrSetImpl<const SCEV *> &Regs,
1249 const LSRUse &LU, bool HardwareLoopProfitable,
1250 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1251};
1252
/// An operand value in an instruction which is to be replaced with some
/// equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// The instruction which will be updated.
  Instruction *UserInst = nullptr;

  /// The operand of the instruction which will be replaced. The operand may be
  /// used more than once; every instance will be replaced.
  Value *OperandValToReplace = nullptr;

  /// If this user is to use the post-incremented value of an induction
  /// variable, this set is non-empty and holds the loops associated with the
  /// induction variable.
  PostIncLoopSet PostIncLoops;

  /// A constant offset to be added to the LSRUse expression. This allows
  /// multiple fixups to share the same LSRUse with different offsets, for
  /// example in an unrolled loop.
  Immediate Offset = Immediate::getZero();

  LSRFixup() = default;

  /// Test whether this fixup always uses its value outside of the given loop.
  bool isUseFullyOutsideLoop(const Loop *L) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};
1280
1281/// This class holds the state that LSR keeps for each use in IVUsers, as well
1282/// as uses invented by LSR itself. It includes information about what kinds of
1283/// things can be folded into the user, information about the user itself, and
1284/// information about how the use may be satisfied. TODO: Represent multiple
1285/// users of the same expression in common?
1286class LSRUse {
1287 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1288
1289public:
1290 /// An enum for a kind of use, indicating what types of scaled and immediate
1291 /// operands it might support.
1292 enum KindType {
1293 Basic, ///< A normal use, with no folding.
1294 Special, ///< A special case of basic, allowing -1 scales.
1295 Address, ///< An address use; folding according to TargetLowering
1296 ICmpZero ///< An equality icmp with both operands folded into one.
1297 // TODO: Add a generic icmp too?
1298 };
1299
1300 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1301
1302 KindType Kind;
1303 MemAccessTy AccessTy;
1304
1305 /// The list of operands which are to be replaced.
1307
1308 /// Keep track of the min and max offsets of the fixups.
1309 Immediate MinOffset = Immediate::getFixedMax();
1310 Immediate MaxOffset = Immediate::getFixedMin();
1311
1312 /// This records whether all of the fixups using this LSRUse are outside of
1313 /// the loop, in which case some special-case heuristics may be used.
1314 bool AllFixupsOutsideLoop = true;
1315
1316 /// This records whether all of the fixups using this LSRUse are unconditional
1317 /// within the loop, meaning they will be executed on every path to the loop
1318 /// latch. This includes fixups before early exits.
1319 bool AllFixupsUnconditional = true;
1320
1321 /// RigidFormula is set to true to guarantee that this use will be associated
1322 /// with a single formula--the one that initially matched. Some SCEV
1323 /// expressions cannot be expanded. This allows LSR to consider the registers
1324 /// used by those expressions without the need to expand them later after
1325 /// changing the formula.
1326 bool RigidFormula = false;
1327
1328 /// This records the widest use type for any fixup using this
1329 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1330 /// fixup widths to be equivalent, because the narrower one may be relying on
1331 /// the implicit truncation to truncate away bogus bits.
1332 Type *WidestFixupType = nullptr;
1333
1334 /// A list of ways to build a value that can satisfy this user. After the
1335 /// list is populated, one of these is selected heuristically and used to
1336 /// formulate a replacement for OperandValToReplace in UserInst.
1337 SmallVector<Formula, 12> Formulae;
1338
1339 /// The set of register candidates used by all formulae in this LSRUse.
1340 SmallPtrSet<const SCEV *, 4> Regs;
1341
1342 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1343
1344 LSRFixup &getNewFixup() {
1345 Fixups.push_back(LSRFixup());
1346 return Fixups.back();
1347 }
1348
1349 void pushFixup(LSRFixup &f) {
1350 Fixups.push_back(f);
1351 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1352 MaxOffset = f.Offset;
1353 if (Immediate::isKnownLT(f.Offset, MinOffset))
1354 MinOffset = f.Offset;
1355 }
1356
1357 bool HasFormulaWithSameRegs(const Formula &F) const;
1358 float getNotSelectedProbability(const SCEV *Reg) const;
1359 bool InsertFormula(const Formula &F, const Loop &L);
1360 void DeleteFormula(Formula &F);
1361 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1362
1363 void print(raw_ostream &OS) const;
1364 void dump() const;
1365};
1366
1367} // end anonymous namespace
1368
1369static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1370 LSRUse::KindType Kind, MemAccessTy AccessTy,
1371 GlobalValue *BaseGV, Immediate BaseOffset,
1372 bool HasBaseReg, int64_t Scale,
1373 Instruction *Fixup = nullptr);
1374
1375static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1377 return 1;
1378 if (Depth == 0)
1379 return 0;
1380 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1381 return getSetupCost(S->getStart(), Depth - 1);
1382 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1383 return getSetupCost(S->getOperand(), Depth - 1);
1384 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1385 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1386 [&](unsigned i, const SCEV *Reg) {
1387 return i + getSetupCost(Reg, Depth - 1);
1388 });
1389 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1390 return getSetupCost(S->getLHS(), Depth - 1) +
1391 getSetupCost(S->getRHS(), Depth - 1);
1392 return 0;
1393}
1394
1395/// Tally up interesting quantities from the given register.
1396void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1397 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1398 bool HardwareLoopProfitable) {
1399 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1400 // If this is an addrec for another loop, it should be an invariant
1401 // with respect to L since L is the innermost loop (at least
1402 // for now LSR only handles innermost loops).
1403 if (AR->getLoop() != L) {
1404 // If the AddRec exists, consider it's register free and leave it alone.
1405 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1406 return;
1407
1408 // It is bad to allow LSR for current loop to add induction variables
1409 // for its sibling loops.
1410 if (!AR->getLoop()->contains(L)) {
1411 Lose();
1412 return;
1413 }
1414
1415 // Otherwise, it will be an invariant with respect to Loop L.
1416 ++C.NumRegs;
1417 return;
1418 }
1419
1420 unsigned LoopCost = 1;
1421 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1422 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1423 const SCEV *Start;
1424 const APInt *Step;
1425 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1426 // If the step size matches the base offset, we could use pre-indexed
1427 // addressing.
1428 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1429 F.BaseOffset.isFixed() &&
1430 *Step == F.BaseOffset.getFixedValue();
1431 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1432 !isa<SCEVConstant>(Start) &&
1433 SE->isLoopInvariant(Start, L);
1434 // We can only pre or post index when the load/store is unconditional.
1435 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1436 LoopCost = 0;
1437 }
1438 }
1439
1440 // If the loop counts down to zero and we'll be using a hardware loop then
1441 // the addrec will be combined into the hardware loop instruction.
1442 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1443 HardwareLoopProfitable)
1444 LoopCost = 0;
1445 C.AddRecCost += LoopCost;
1446
1447 // Add the step value register, if it needs one.
1448 // TODO: The non-affine case isn't precisely modeled here.
1449 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1450 if (!Regs.count(AR->getOperand(1))) {
1451 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1452 if (isLoser())
1453 return;
1454 }
1455 }
1456 }
1457 ++C.NumRegs;
1458
1459 // Rough heuristic; favor registers which don't require extra setup
1460 // instructions in the preheader.
1461 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1462 // Ensure we don't, even with the recusion limit, produce invalid costs.
1463 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1464
1465 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1467}
1468
1469/// Record this register in the set. If we haven't seen it before, rate
1470/// it. Optional LoserRegs provides a way to declare any formula that refers to
1471/// one of those regs an instant loser.
1472void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1473 SmallPtrSetImpl<const SCEV *> &Regs,
1474 const LSRUse &LU, bool HardwareLoopProfitable,
1475 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1476 if (LoserRegs && LoserRegs->count(Reg)) {
1477 Lose();
1478 return;
1479 }
1480 if (Regs.insert(Reg).second) {
1481 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1482 if (LoserRegs && isLoser())
1483 LoserRegs->insert(Reg);
1484 }
1485}
1486
/// Rate the given (canonical) formula for this use, accumulating into the
/// current cost metrics. Sets the cost to a loser value on any disqualifying
/// condition (revisited registers, loser registers, etc.).
void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                       const DenseSet<const SCEV *> &VisitedRegs,
                       const LSRUse &LU, bool HardwareLoopProfitable,
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  if (isLoser())
    return;
  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
  // Tally up the registers.
  unsigned PrevAddRecCost = C.AddRecCost;
  unsigned PrevNumRegs = C.NumRegs;
  unsigned PrevNumBaseAdds = C.NumBaseAdds;
  if (const SCEV *ScaledReg = F.ScaledReg) {
    // A register already used by a previously-selected formula is a loser.
    if (VisitedRegs.count(ScaledReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }
  for (const SCEV *BaseReg : F.BaseRegs) {
    if (VisitedRegs.count(BaseReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }

  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.getNumRegs();
  if (NumBaseParts > 1)
    // Do not count the base and a possible second register if the target
    // allows to fold 2 registers.
    C.NumBaseAdds +=
        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
  C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());

  // Accumulate non-free scaling amounts.
  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();

  // Tally up the non-zero immediates.
  for (const LSRFixup &Fixup : LU.Fixups) {
    if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
      Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
      if (F.BaseGV)
        C.ImmCost += 64; // Handle symbolic values conservatively.
                         // TODO: This should probably be the pointer size.
      else if (Offset.isNonZero())
        // Charge by the number of significant bits in the immediate.
        C.ImmCost +=
            APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();

      // Check with target if this offset with this instruction is
      // specifically not supported.
      if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
          !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                                Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
        C.NumBaseAdds++;
    } else {
      // Incompatible immediate type, increase cost to avoid using
      C.ImmCost += 2048;
    }
  }

  // If we don't count instruction cost exit here.
  if (!InsnsCost) {
    assert(isValid() && "invalid cost");
    return;
  }

  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
  // additional instruction (at least fill).
  // TODO: Need distinguish register class?
  unsigned TTIRegNum = TTI->getNumberOfRegisters(
      TTI->getRegisterClassForType(false, F.getType())) - 1;
  if (C.NumRegs > TTIRegNum) {
    // Cost already exceeded TTIRegNum, then only newly added register can add
    // new instructions.
    if (PrevNumRegs > TTIRegNum)
      C.Insns += (C.NumRegs - PrevNumRegs);
    else
      C.Insns += (C.NumRegs - TTIRegNum);
  }

  // If ICmpZero formula ends with not 0, it could not be replaced by
  // just add or sub. We'll need to compare final result of AddRec.
  // That means we'll need an additional instruction. But if the target can
  // macro-fuse a compare with a branch, don't count this extra instruction.
  // For -10 + {0, +, 1}:
  // i = i + 1;
  // cmp i, 10
  //
  // For {-10, +, 1}:
  // i = i + 1;
  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
      !TTI->canMacroFuseCmp())
    C.Insns++;
  // Each new AddRec adds 1 instruction to calculation.
  C.Insns += (C.AddRecCost - PrevAddRecCost);

  // BaseAdds adds instructions for unfolded registers.
  if (LU.Kind != LSRUse::ICmpZero)
    C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
  assert(isValid() && "invalid cost");
}
1595
1596/// Set this cost to a losing value.
1597void Cost::Lose() {
1598 C.Insns = std::numeric_limits<unsigned>::max();
1599 C.NumRegs = std::numeric_limits<unsigned>::max();
1600 C.AddRecCost = std::numeric_limits<unsigned>::max();
1601 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1602 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1603 C.ImmCost = std::numeric_limits<unsigned>::max();
1604 C.SetupCost = std::numeric_limits<unsigned>::max();
1605 C.ScaleCost = std::numeric_limits<unsigned>::max();
1606}
1607
1608/// Choose the lower cost.
1609bool Cost::isLess(const Cost &Other) const {
1610 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1611 C.Insns != Other.C.Insns)
1612 return C.Insns < Other.C.Insns;
1613 return TTI->isLSRCostLess(C, Other.C);
1614}
1615
1616#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the accumulated cost metrics, omitting any that are zero.
void Cost::print(raw_ostream &OS) const {
  if (InsnsCost)
    OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
  if (C.AddRecCost != 0)
    OS << ", with addrec cost " << C.AddRecCost;
  if (C.NumIVMuls != 0)
    OS << ", plus " << C.NumIVMuls << " IV mul"
       << (C.NumIVMuls == 1 ? "" : "s");
  if (C.NumBaseAdds != 0)
    OS << ", plus " << C.NumBaseAdds << " base add"
       << (C.NumBaseAdds == 1 ? "" : "s");
  if (C.ScaleCost != 0)
    OS << ", plus " << C.ScaleCost << " scale cost";
  if (C.ImmCost != 0)
    OS << ", plus " << C.ImmCost << " imm cost";
  if (C.SetupCost != 0)
    OS << ", plus " << C.SetupCost << " setup cost";
}
1636
// Debug helper: print this cost to stderr with a trailing newline.
LLVM_DUMP_METHOD void Cost::dump() const {
  print(errs()); errs() << '\n';
}
1640#endif
1641
1642/// Test whether this fixup always uses its value outside of the given loop.
1643bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1644 // PHI nodes use their value in their incoming blocks.
1645 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1646 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1647 if (PN->getIncomingValue(i) == OperandValToReplace &&
1648 L->contains(PN->getIncomingBlock(i)))
1649 return false;
1650 return true;
1651 }
1652
1653 return !L->contains(UserInst);
1654}
1655
1656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this fixup's user instruction, replaced operand, post-inc loops,
/// and offset for debugging.
void LSRFixup::print(raw_ostream &OS) const {
  OS << "UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    UserInst->printAsOperand(OS, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);

  for (const Loop *PIL : PostIncLoops) {
    OS << ", PostIncLoop=";
    PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
  }

  if (Offset.isNonZero())
    OS << ", Offset=" << Offset;
}
1679
// Debug helper: print this fixup to stderr with a trailing newline.
LLVM_DUMP_METHOD void LSRFixup::dump() const {
  print(errs()); errs() << '\n';
}
1683#endif
1684
1685/// Test whether this use as a formula which has the same registers as the given
1686/// formula.
1687bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1689 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1690 // Unstable sort by host order ok, because this is only used for uniquifying.
1691 llvm::sort(Key);
1692 return Uniquifier.count(Key);
1693}
1694
1695/// The function returns a probability of selecting formula without Reg.
1696float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1697 unsigned FNum = 0;
1698 for (const Formula &F : Formulae)
1699 if (F.referencesReg(Reg))
1700 FNum++;
1701 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1702}
1703
1704/// If the given formula has not yet been inserted, add it to the list, and
1705/// return true. Return false otherwise. The formula must be in canonical form.
1706bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1707 assert(F.isCanonical(L) && "Invalid canonical representation");
1708
1709 if (!Formulae.empty() && RigidFormula)
1710 return false;
1711
1713 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1714 // Unstable sort by host order ok, because this is only used for uniquifying.
1715 llvm::sort(Key);
1716
1717 if (!Uniquifier.insert(Key).second)
1718 return false;
1719
1720 // Using a register to hold the value of 0 is not profitable.
1721 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1722 "Zero allocated in a scaled register!");
1723#ifndef NDEBUG
1724 for (const SCEV *BaseReg : F.BaseRegs)
1725 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1726#endif
1727
1728 // Add the formula to the list.
1729 Formulae.push_back(F);
1730
1731 // Record registers now being used by this use.
1732 Regs.insert_range(F.BaseRegs);
1733 if (F.ScaledReg)
1734 Regs.insert(F.ScaledReg);
1735
1736 return true;
1737}
1738
1739/// Remove the given formula from this use's list.
1740void LSRUse::DeleteFormula(Formula &F) {
1741 if (&F != &Formulae.back())
1742 std::swap(F, Formulae.back());
1743 Formulae.pop_back();
1744}
1745
1746/// Recompute the Regs field, and update RegUses.
1747void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1748 // Now that we've filtered out some formulae, recompute the Regs set.
1749 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1750 Regs.clear();
1751 for (const Formula &F : Formulae) {
1752 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1753 Regs.insert_range(F.BaseRegs);
1754 }
1755
1756 // Update the RegTracker.
1757 for (const SCEV *S : OldRegs)
1758 if (!Regs.count(S))
1759 RegUses.dropRegister(S, LUIdx);
1760}
1761
1762#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this use's kind, access type, fixup offsets, and flags for debugging.
void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy.MemTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else {
      OS << *AccessTy.MemTy;
    }

    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
  }

  OS << ", Offsets={";
  bool NeedComma = false;
  for (const LSRFixup &Fixup : Fixups) {
    if (NeedComma) OS << ',';
    OS << Fixup.Offset;
    NeedComma = true;
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (AllFixupsUnconditional)
    OS << ", all-fixups-unconditional";

  if (WidestFixupType)
    OS << ", widest fixup type: " << *WidestFixupType;
}
1798
// Debug helper: print this use to stderr with a trailing newline.
LLVM_DUMP_METHOD void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}
1802#endif
1803
1805 LSRUse::KindType Kind, MemAccessTy AccessTy,
1806 GlobalValue *BaseGV, Immediate BaseOffset,
1807 bool HasBaseReg, int64_t Scale,
1808 Instruction *Fixup /* = nullptr */) {
1809 switch (Kind) {
1810 case LSRUse::Address: {
1811 int64_t FixedOffset =
1812 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1813 int64_t ScalableOffset =
1814 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1815 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1816 HasBaseReg, Scale, AccessTy.AddrSpace,
1817 Fixup, ScalableOffset);
1818 }
1819 case LSRUse::ICmpZero:
1820 // There's not even a target hook for querying whether it would be legal to
1821 // fold a GV into an ICmp.
1822 if (BaseGV)
1823 return false;
1824
1825 // ICmp only has two operands; don't allow more than two non-trivial parts.
1826 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1827 return false;
1828
1829 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1830 // putting the scaled register in the other operand of the icmp.
1831 if (Scale != 0 && Scale != -1)
1832 return false;
1833
1834 // If we have low-level target information, ask the target if it can fold an
1835 // integer immediate on an icmp.
1836 if (BaseOffset.isNonZero()) {
1837 // We don't have an interface to query whether the target supports
1838 // icmpzero against scalable quantities yet.
1839 if (BaseOffset.isScalable())
1840 return false;
1841
1842 // We have one of:
1843 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1844 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1845 // Offs is the ICmp immediate.
1846 if (Scale == 0)
1847 // The cast does the right thing with
1848 // std::numeric_limits<int64_t>::min().
1849 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1850 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1851 }
1852
1853 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1854 return true;
1855
1856 case LSRUse::Basic:
1857 // Only handle single-register values.
1858 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1859
1860 case LSRUse::Special:
1861 // Special case Basic to handle -1 scales.
1862 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1863 }
1864
1865 llvm_unreachable("Invalid LSRUse Kind!");
1866}
1867
1869 Immediate MinOffset, Immediate MaxOffset,
1870 LSRUse::KindType Kind, MemAccessTy AccessTy,
1871 GlobalValue *BaseGV, Immediate BaseOffset,
1872 bool HasBaseReg, int64_t Scale) {
1873 if (BaseOffset.isNonZero() &&
1874 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1875 BaseOffset.isScalable() != MaxOffset.isScalable()))
1876 return false;
1877 // Check for overflow.
1878 int64_t Base = BaseOffset.getKnownMinValue();
1879 int64_t Min = MinOffset.getKnownMinValue();
1880 int64_t Max = MaxOffset.getKnownMinValue();
1881 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1882 return false;
1883 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1884 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1885 return false;
1886 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1887
1888 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1889 HasBaseReg, Scale) &&
1890 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1891 HasBaseReg, Scale);
1892}
1893
1895 Immediate MinOffset, Immediate MaxOffset,
1896 LSRUse::KindType Kind, MemAccessTy AccessTy,
1897 const Formula &F, const Loop &L) {
1898 // For the purpose of isAMCompletelyFolded either having a canonical formula
1899 // or a scale not equal to zero is correct.
1900 // Problems may arise from non canonical formulae having a scale == 0.
1901 // Strictly speaking it would best to just rely on canonical formulae.
1902 // However, when we generate the scaled formulae, we first check that the
1903 // scaling factor is profitable before computing the actual ScaledReg for
1904 // compile time sake.
1905 assert((F.isCanonical(L) || F.Scale != 0));
1906 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1907 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1908}
1909
1910/// Test whether we know how to expand the current formula.
1911static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1912 Immediate MaxOffset, LSRUse::KindType Kind,
1913 MemAccessTy AccessTy, GlobalValue *BaseGV,
1914 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1915 // We know how to expand completely foldable formulae.
1916 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1917 BaseOffset, HasBaseReg, Scale) ||
1918 // Or formulae that use a base register produced by a sum of base
1919 // registers.
1920 (Scale == 1 &&
1921 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1922 BaseGV, BaseOffset, true, 0));
1923}
1924
1925static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1926 Immediate MaxOffset, LSRUse::KindType Kind,
1927 MemAccessTy AccessTy, const Formula &F) {
1928 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1929 F.BaseOffset, F.HasBaseReg, F.Scale);
1930}
1931
1933 Immediate Offset) {
1934 if (Offset.isScalable())
1935 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1936
1937 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1938}
1939
1941 const LSRUse &LU, const Formula &F) {
1942 // Target may want to look at the user instructions.
1943 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1944 for (const LSRFixup &Fixup : LU.Fixups)
1945 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1946 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1947 F.Scale, Fixup.UserInst))
1948 return false;
1949 return true;
1950 }
1951
1952 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1953 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1954 F.Scale);
1955}
1956
1958 const LSRUse &LU, const Formula &F,
1959 const Loop &L) {
1960 if (!F.Scale)
1961 return 0;
1962
1963 // If the use is not completely folded in that instruction, we will have to
1964 // pay an extra cost only for scale != 1.
1965 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1966 LU.AccessTy, F, L))
1967 return F.Scale != 1;
1968
1969 switch (LU.Kind) {
1970 case LSRUse::Address: {
1971 // Check the scaling factor cost with both the min and max offsets.
1972 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1973 if (F.BaseOffset.isScalable()) {
1974 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1975 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1976 } else {
1977 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1978 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1979 }
1980 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1981 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1982 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1983 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1984 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1985 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1986
1987 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1988 "Legal addressing mode has an illegal cost!");
1989 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1990 }
1991 case LSRUse::ICmpZero:
1992 case LSRUse::Basic:
1993 case LSRUse::Special:
1994 // The use is completely folded, i.e., everything is folded into the
1995 // instruction.
1996 return 0;
1997 }
1998
1999 llvm_unreachable("Invalid LSRUse Kind!");
2000}
2001
2003 LSRUse::KindType Kind, MemAccessTy AccessTy,
2004 GlobalValue *BaseGV, Immediate BaseOffset,
2005 bool HasBaseReg) {
2006 // Fast-path: zero is always foldable.
2007 if (BaseOffset.isZero() && !BaseGV)
2008 return true;
2009
2010 // Conservatively, create an address with an immediate and a
2011 // base and a scale.
2012 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2013
2014 // Canonicalize a scale of 1 to a base register if the formula doesn't
2015 // already have a base register.
2016 if (!HasBaseReg && Scale == 1) {
2017 Scale = 0;
2018 HasBaseReg = true;
2019 }
2020
2021 // FIXME: Try with + without a scale? Maybe based on TTI?
2022 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2023 // default for many architectures, not just AArch64 SVE. More investigation
2024 // needed later to determine if this should be used more widely than just
2025 // on scalable types.
2026 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2027 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2028 Scale = 0;
2029
2030 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2031 HasBaseReg, Scale);
2032}
2033
2035 ScalarEvolution &SE, Immediate MinOffset,
2036 Immediate MaxOffset, LSRUse::KindType Kind,
2037 MemAccessTy AccessTy, const SCEV *S,
2038 bool HasBaseReg) {
2039 // Fast-path: zero is always foldable.
2040 if (S->isZero()) return true;
2041
2042 // Conservatively, create an address with an immediate and a
2043 // base and a scale.
2044 SCEVUse SCopy = S;
2045 Immediate BaseOffset = ExtractImmediate(SCopy, SE);
2046 GlobalValue *BaseGV = ExtractSymbol(SCopy, SE);
2047
2048 // If there's anything else involved, it's not foldable.
2049 if (!SCopy->isZero())
2050 return false;
2051
2052 // Fast-path: zero is always foldable.
2053 if (BaseOffset.isZero() && !BaseGV)
2054 return true;
2055
2056 if (BaseOffset.isScalable())
2057 return false;
2058
2059 // Conservatively, create an address with an immediate and a
2060 // base and a scale.
2061 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2062
2063 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2064 BaseOffset, HasBaseReg, Scale);
2065}
2066
2067namespace {
2068
2069/// An individual increment in a Chain of IV increments. Relate an IV user to
2070/// an expression that computes the IV it uses from the IV used by the previous
2071/// link in the Chain.
2072///
2073/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2074/// original IVOperand. The head of the chain's IVOperand is only valid during
2075/// chain collection, before LSR replaces IV users. During chain generation,
2076/// IncExpr can be used to find the new IVOperand that computes the same
2077/// expression.
2078struct IVInc {
2079 Instruction *UserInst;
2080 Value* IVOperand;
2081 const SCEV *IncExpr;
2082
2083 IVInc(Instruction *U, Value *O, const SCEV *E)
2084 : UserInst(U), IVOperand(O), IncExpr(E) {}
2085};
2086
2087// The list of IV increments in program order. We typically add the head of a
2088// chain without finding subsequent links.
2089struct IVChain {
2091 const SCEV *ExprBase = nullptr;
2092
2093 IVChain() = default;
2094 IVChain(const IVInc &Head, const SCEV *Base)
2095 : Incs(1, Head), ExprBase(Base) {}
2096
2097 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2098
2099 // Return the first increment in the chain.
2100 const_iterator begin() const {
2101 assert(!Incs.empty());
2102 return std::next(Incs.begin());
2103 }
2104 const_iterator end() const {
2105 return Incs.end();
2106 }
2107
2108 // Returns true if this chain contains any increments.
2109 bool hasIncs() const { return Incs.size() >= 2; }
2110
2111 // Add an IVInc to the end of this chain.
2112 void add(const IVInc &X) { Incs.push_back(X); }
2113
2114 // Returns the last UserInst in the chain.
2115 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2116
2117 // Returns true if IncExpr can be profitably added to this chain.
2118 bool isProfitableIncrement(const SCEV *OperExpr,
2119 const SCEV *IncExpr,
2120 ScalarEvolution&);
2121};
2122
2123/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2124/// between FarUsers that definitely cross IV increments and NearUsers that may
2125/// be used between IV increments.
2126struct ChainUsers {
2127 SmallPtrSet<Instruction*, 4> FarUsers;
2128 SmallPtrSet<Instruction*, 4> NearUsers;
2129};
2130
2131/// This class holds state for the main loop strength reduction logic.
/// This class holds state for the main loop strength reduction logic.
class LSRInstance {
  // Analyses and context for the loop being transformed.
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  AssumptionCache &AC;
  TargetLibraryInfo &TLI;
  const TargetTransformInfo &TTI;
  Loop *const L;
  MemorySSAUpdater *MSSAU;
  // NOTE(review): a member declaration between MSSAU and Rewriter appears to
  // have been dropped in this rendering of the file -- confirm against the
  // upstream source.
  mutable SCEVExpander Rewriter;
  // Whether any IR change has been made (returned via getChanged()).
  bool Changed = false;
  bool HardwareLoopProfitable = false;

  /// This is the insert position that the current loop's induction variable
  /// increment should be placed. In simple loops, this is the latch block's
  /// terminator. But in more complicated cases, this is a position which will
  /// dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos = nullptr;

  /// Interesting factors between use strides.
  ///
  /// We explicitly use a SetVector which contains a SmallSet, instead of the
  /// default, a SmallDenseSet, because we need to use the full range of
  /// int64_ts, and there's currently no good way of doing that with
  /// SmallDenseSet.
  SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;

  /// The cost of the current SCEV, the best solution by LSR will be dropped if
  /// the solution is not profitable.
  Cost BaselineCost;

  /// Interesting use types, to facilitate truncation reuse.
  SmallSetVector<Type *, 4> Types;

  /// The list of interesting uses.
  // NOTE(review): the container declaration for the uses (referenced by
  // getUse/DeleteUse below) appears to have been dropped in this rendering --
  // confirm against the upstream source.

  /// Track which uses use which register candidates.
  RegUseTracker RegUses;

  // Limit the number of chains to avoid quadratic behavior. We don't expect to
  // have more than a few IV increment chains in a loop. Missing a Chain falls
  // back to normal LSR behavior for those uses.
  static const unsigned MaxChains = 8;

  /// IV users can form a chain of IV increments.
  // NOTE(review): the chain container declaration appears to have been dropped
  // in this rendering -- confirm against the upstream source.

  /// IV users that belong to profitable IVChains.
  SmallPtrSet<Use*, MaxChains> IVIncSet;

  /// Induction variables that were generated and inserted by the SCEV Expander.
  SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;

  // Inserting instructions in the loop and using them as PHI's input could
  // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
  // corresponding incoming block is not loop exiting). So collect all such
  // instructions to form LCSSA for them later.
  SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;

  void OptimizeShadowIV();
  bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
  Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
  void OptimizeLoopTermCond();

  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
  void FinalizeChain(IVChain &Chain);
  void CollectChains();
  void GenerateIVChain(const IVChain &Chain,
                       SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  // Support for sharing of LSRUses between LSRFixups.
  using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, MemAccessTy AccessTy);

  std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
                                      MemAccessTy AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
  bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;

  void CollectLoopInvariantFixupsAndFormulae();

  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);

  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
                                  const Formula &Base, unsigned Depth,
                                  size_t Idx, bool IsScaledReg = false);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                   const Formula &Base, size_t Idx,
                                   bool IsScaledReg = false);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                   const Formula &Base,
                                   const SmallVectorImpl<Immediate> &Worklist,
                                   size_t Idx, bool IsScaledReg = false);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  void NarrowSearchSpaceByFilterPostInc();
  void NarrowSearchSpaceByDeletingCostlyFormulas();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  // NOTE(review): the return type of HoistInsertPosition (on its own line in
  // the upstream source) appears to have been dropped in this rendering --
  // confirm against the upstream source.
  HoistInsertPosition(BasicBlock::iterator IP,
                      const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
                // NOTE(review): a parameter line appears to have been dropped
                // here in this rendering -- confirm against upstream.
                SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
                     const Formula &F,
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts);
  void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
               SmallVectorImpl<WeakTrackingVH> &DeadInsts);
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);

public:
  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
              TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);

  bool getChanged() const { return Changed; }
  const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
    return ScalarEvolutionIVs;
  }

  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};
2306
2307} // end anonymous namespace
2308
2309/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2310/// the cast operation.
2311void LSRInstance::OptimizeShadowIV() {
2312 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2313 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2314 return;
2315
2316 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2317 UI != E; /* empty */) {
2318 IVUsers::const_iterator CandidateUI = UI;
2319 ++UI;
2320 Instruction *ShadowUse = CandidateUI->getUser();
2321 Type *DestTy = nullptr;
2322 bool IsSigned = false;
2323
2324 /* If shadow use is a int->float cast then insert a second IV
2325 to eliminate this cast.
2326
2327 for (unsigned i = 0; i < n; ++i)
2328 foo((double)i);
2329
2330 is transformed into
2331
2332 double d = 0.0;
2333 for (unsigned i = 0; i < n; ++i, ++d)
2334 foo(d);
2335 */
2336 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2337 IsSigned = false;
2338 DestTy = UCast->getDestTy();
2339 }
2340 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2341 IsSigned = true;
2342 DestTy = SCast->getDestTy();
2343 }
2344 if (!DestTy) continue;
2345
2346 // If target does not support DestTy natively then do not apply
2347 // this transformation.
2348 if (!TTI.isTypeLegal(DestTy)) continue;
2349
2350 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2351 if (!PH) continue;
2352 if (PH->getNumIncomingValues() != 2) continue;
2353
2354 // If the calculation in integers overflows, the result in FP type will
2355 // differ. So we only can do this transformation if we are guaranteed to not
2356 // deal with overflowing values
2357 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2358 if (!AR) continue;
2359 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2360 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2361
2362 Type *SrcTy = PH->getType();
2363 int Mantissa = DestTy->getFPMantissaWidth();
2364 if (Mantissa == -1) continue;
2365 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2366 continue;
2367
2368 unsigned Entry, Latch;
2369 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2370 Entry = 0;
2371 Latch = 1;
2372 } else {
2373 Entry = 1;
2374 Latch = 0;
2375 }
2376
2377 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2378 if (!Init) continue;
2379 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2380 (double)Init->getSExtValue() :
2381 (double)Init->getZExtValue());
2382
2383 BinaryOperator *Incr =
2385 if (!Incr) continue;
2386 if (Incr->getOpcode() != Instruction::Add
2387 && Incr->getOpcode() != Instruction::Sub)
2388 continue;
2389
2390 /* Initialize new IV, double d = 0.0 in above example. */
2391 ConstantInt *C = nullptr;
2392 if (Incr->getOperand(0) == PH)
2394 else if (Incr->getOperand(1) == PH)
2396 else
2397 continue;
2398
2399 if (!C) continue;
2400
2401 // Ignore negative constants, as the code below doesn't handle them
2402 // correctly. TODO: Remove this restriction.
2403 if (!C->getValue().isStrictlyPositive())
2404 continue;
2405
2406 /* Add new PHINode. */
2407 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2408 NewPH->setDebugLoc(PH->getDebugLoc());
2409
2410 /* create new increment. '++d' in above example. */
2411 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2412 BinaryOperator *NewIncr = BinaryOperator::Create(
2413 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2414 : Instruction::FSub,
2415 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2416 NewIncr->setDebugLoc(Incr->getDebugLoc());
2417
2418 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2419 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2420
2421 /* Remove cast operation */
2422 ShadowUse->replaceAllUsesWith(NewPH);
2423 ShadowUse->eraseFromParent();
2424 Changed = true;
2425 break;
2426 }
2427}
2428
2429/// If Cond has an operand that is an expression of an IV, set the IV user and
2430/// stride information and return true, otherwise return false.
2431bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2432 for (IVStrideUse &U : IU)
2433 if (U.getUser() == Cond) {
2434 // NOTE: we could handle setcc instructions with multiple uses here, but
2435 // InstCombine does it as well for simple uses, it's not clear that it
2436 // occurs enough in real life to handle.
2437 CondUse = &U;
2438 return true;
2439 }
2440 return false;
2441}
2442
/// Rewrite the loop's terminating condition if it uses a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive. And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
///
///   if (n > 0) {
///     i = 0;
///     do {
///       p[i] = 0.0;
///     } while (++i < n);
///   }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
///   i = 0;
///   max = n < 1 ? 1 : n;
///   do {
///     p[i] = 0.0;
///   } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
  if (IterationCount != SE.getSCEV(Sel)) return Cond;

  // Check for a max calculation that matches the pattern. There's no check
  // for ICMP_ULE here because the comparison would be with zero, which
  // isn't interesting.
  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
  const SCEVNAryExpr *Max = nullptr;
  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
    Pred = ICmpInst::ICMP_SLE;
    Max = S;
  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_SLT;
    Max = S;
  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_ULT;
    Max = U;
  } else {
    // No match; bail.
    return Cond;
  }

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);

  // ScalarEvolution canonicalizes constants to the left. For < and >, look
  // for a comparison with 1. For <= and >=, a comparison with zero.
  if (!MaxLHS ||
      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
    return Cond;

  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  // NOTE(review): the match() pattern argument appears to have been dropped in
  // this rendering of the file -- presumably it checks that IV is an affine
  // addrec of this loop (consistent with the assert just below); confirm
  // against the upstream source.
  if (!match(IV,
    return Cond;

  assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = nullptr;
  if (ICmpInst::isTrueWhenEqual(Pred)) {
    // Look for n+1, and grab n.
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
          NewRHS = BO->getOperand(0);
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
          NewRHS = BO->getOperand(0);
    if (!NewRHS)
      return Cond;
  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
    NewRHS = SU->getValue();
  else
    // Max doesn't match expected pattern.
    return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
                                   Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  NewCond->setDebugLoc(Cond->getDebugLoc());
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  // NOTE(review): the line defining 'Cmp' (used just below to clean up the
  // select's condition once it is dead) appears to have been dropped in this
  // rendering -- confirm against the upstream source.
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty()) {
    salvageDebugInfo(*Cmp);
    Cmp->eraseFromParent();
  }
  return NewCond;
}
2600
2601/// Change loop terminating condition to use the postinc iv when possible.
2602void
2603LSRInstance::OptimizeLoopTermCond() {
2604 SmallPtrSet<Instruction *, 4> PostIncs;
2605
2606 // We need a different set of heuristics for rotated and non-rotated loops.
2607 // If a loop is rotated then the latch is also the backedge, so inserting
2608 // post-inc expressions just before the latch is ideal. To reduce live ranges
2609 // it also makes sense to rewrite terminating conditions to use post-inc
2610 // expressions.
2611 //
2612 // If the loop is not rotated then the latch is not a backedge; the latch
2613 // check is done in the loop head. Adding post-inc expressions before the
2614 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2615 // in the loop body. In this case we do *not* want to use post-inc expressions
2616 // in the latch check, and we want to insert post-inc expressions before
2617 // the backedge.
2618 BasicBlock *LatchBlock = L->getLoopLatch();
2619 SmallVector<BasicBlock*, 8> ExitingBlocks;
2620 L->getExitingBlocks(ExitingBlocks);
2621 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2622 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2623 IVIncInsertPos = LatchBlock->getTerminator();
2624 return;
2625 }
2626
2627 // Otherwise treat this as a rotated loop.
2628 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2629 // Get the terminating condition for the loop if possible. If we
2630 // can, we want to change it to use a post-incremented version of its
2631 // induction variable, to allow coalescing the live ranges for the IV into
2632 // one register value.
2633
2634 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2635 if (!TermBr || TermBr->isUnconditional())
2636 continue;
2637
2639 // If the argument to TermBr is an extractelement, then the source of that
2640 // instruction is what's generated the condition.
2642 if (Extract)
2643 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2644 // FIXME: We could do more here, like handling logical operations where one
2645 // side is a cmp that uses an induction variable.
2646 if (!Cond)
2647 continue;
2648
2649 // Search IVUsesByStride to find Cond's IVUse if there is one.
2650 IVStrideUse *CondUse = nullptr;
2651 if (!FindIVUserForCond(Cond, CondUse))
2652 continue;
2653
2654 // If the trip count is computed in terms of a max (due to ScalarEvolution
2655 // being unable to find a sufficient guard, for example), change the loop
2656 // comparison to use SLT or ULT instead of NE.
2657 // One consequence of doing this now is that it disrupts the count-down
2658 // optimization. That's not always a bad thing though, because in such
2659 // cases it may still be worthwhile to avoid a max.
2660 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2661 Cond = OptimizeMax(Cmp, CondUse);
2662
2663 // If this exiting block dominates the latch block, it may also use
2664 // the post-inc value if it won't be shared with other uses.
2665 // Check for dominance.
2666 if (!DT.dominates(ExitingBlock, LatchBlock))
2667 continue;
2668
2669 // Conservatively avoid trying to use the post-inc value in non-latch
2670 // exits if there may be pre-inc users in intervening blocks.
2671 if (LatchBlock != ExitingBlock)
2672 for (const IVStrideUse &UI : IU)
2673 // Test if the use is reachable from the exiting block. This dominator
2674 // query is a conservative approximation of reachability.
2675 if (&UI != CondUse &&
2676 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2677 // Conservatively assume there may be reuse if the quotient of their
2678 // strides could be a legal scale.
2679 const SCEV *A = IU.getStride(*CondUse, L);
2680 const SCEV *B = IU.getStride(UI, L);
2681 if (!A || !B) continue;
2682 if (SE.getTypeSizeInBits(A->getType()) !=
2683 SE.getTypeSizeInBits(B->getType())) {
2684 if (SE.getTypeSizeInBits(A->getType()) >
2685 SE.getTypeSizeInBits(B->getType()))
2686 B = SE.getSignExtendExpr(B, A->getType());
2687 else
2688 A = SE.getSignExtendExpr(A, B->getType());
2689 }
2690 if (const SCEVConstant *D =
2692 const ConstantInt *C = D->getValue();
2693 // Stride of one or negative one can have reuse with non-addresses.
2694 if (C->isOne() || C->isMinusOne())
2695 goto decline_post_inc;
2696 // Avoid weird situations.
2697 if (C->getValue().getSignificantBits() >= 64 ||
2698 C->getValue().isMinSignedValue())
2699 goto decline_post_inc;
2700 // Check for possible scaled-address reuse.
2701 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2702 MemAccessTy AccessTy =
2703 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2704 int64_t Scale = C->getSExtValue();
2705 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2706 /*BaseOffset=*/0,
2707 /*HasBaseReg=*/true, Scale,
2708 AccessTy.AddrSpace))
2709 goto decline_post_inc;
2710 Scale = -Scale;
2711 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2712 /*BaseOffset=*/0,
2713 /*HasBaseReg=*/true, Scale,
2714 AccessTy.AddrSpace))
2715 goto decline_post_inc;
2716 }
2717 }
2718 }
2719
2720 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2721 << *Cond << '\n');
2722
2723 // It's possible for the setcc instruction to be anywhere in the loop, and
2724 // possible for it to have multiple users. If it is not immediately before
2725 // the exiting block branch, move it.
2726 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2727 !Extract) {
2728 if (Cond->hasOneUse()) {
2729 Cond->moveBefore(TermBr->getIterator());
2730 } else {
2731 // Clone the terminating condition and insert into the loopend.
2732 Instruction *OldCond = Cond;
2733 Cond = Cond->clone();
2734 Cond->setName(L->getHeader()->getName() + ".termcond");
2735 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2736
2737 // Clone the IVUse, as the old use still exists!
2738 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2739 TermBr->replaceUsesOfWith(OldCond, Cond);
2740 }
2741 }
2742
2743 // If we get to here, we know that we can transform the setcc instruction to
2744 // use the post-incremented version of the IV, allowing us to coalesce the
2745 // live ranges for the IV correctly.
2746 CondUse->transformToPostInc(L);
2747 Changed = true;
2748
2749 PostIncs.insert(Cond);
2750 decline_post_inc:;
2751 }
2752
2753 // Determine an insertion point for the loop induction variable increment. It
2754 // must dominate all the post-inc comparisons we just set up, and it must
2755 // dominate the loop latch edge.
2756 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2757 for (Instruction *Inst : PostIncs)
2758 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2759}
2760
2761/// Determine if the given use can accommodate a fixup at the given offset and
2762/// other details. If so, update the use and return true.
2763bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2764 bool HasBaseReg, LSRUse::KindType Kind,
2765 MemAccessTy AccessTy) {
2766 Immediate NewMinOffset = LU.MinOffset;
2767 Immediate NewMaxOffset = LU.MaxOffset;
2768 MemAccessTy NewAccessTy = AccessTy;
2769
2770 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2771 // something conservative, however this can pessimize in the case that one of
2772 // the uses will have all its uses outside the loop, for example.
2773 if (LU.Kind != Kind)
2774 return false;
2775
2776 // Check for a mismatched access type, and fall back conservatively as needed.
2777 // TODO: Be less conservative when the type is similar and can use the same
2778 // addressing modes.
2779 if (Kind == LSRUse::Address) {
2780 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2781 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2782 AccessTy.AddrSpace);
2783 }
2784 }
2785
2786 // Conservatively assume HasBaseReg is true for now.
2787 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2788 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2789 LU.MaxOffset - NewOffset, HasBaseReg))
2790 return false;
2791 NewMinOffset = NewOffset;
2792 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2793 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2794 NewOffset - LU.MinOffset, HasBaseReg))
2795 return false;
2796 NewMaxOffset = NewOffset;
2797 }
2798
2799 // FIXME: We should be able to handle some level of scalable offset support
2800 // for 'void', but in order to get basic support up and running this is
2801 // being left out.
2802 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2803 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2804 return false;
2805
2806 // Update the use.
2807 LU.MinOffset = NewMinOffset;
2808 LU.MaxOffset = NewMaxOffset;
2809 LU.AccessTy = NewAccessTy;
2810 return true;
2811}
2812
2813/// Return an LSRUse index and an offset value for a fixup which needs the given
2814/// expression, with the given kind and optional access type. Either reuse an
2815/// existing use or create a new one, as needed.
2816std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2817 LSRUse::KindType Kind,
2818 MemAccessTy AccessTy) {
2819 const SCEV *Copy = Expr;
2820 SCEVUse ExprUse = Expr;
2821 Immediate Offset = ExtractImmediate(ExprUse, SE);
2822 Expr = ExprUse;
2823
2824 // Basic uses can't accept any offset, for example.
2825 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2826 Offset, /*HasBaseReg=*/ true)) {
2827 Expr = Copy;
2828 Offset = Immediate::getFixed(0);
2829 }
2830
2831 std::pair<UseMapTy::iterator, bool> P =
2832 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2833 if (!P.second) {
2834 // A use already existed with this base.
2835 size_t LUIdx = P.first->second;
2836 LSRUse &LU = Uses[LUIdx];
2837 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2838 // Reuse this use.
2839 return std::make_pair(LUIdx, Offset);
2840 }
2841
2842 // Create a new use.
2843 size_t LUIdx = Uses.size();
2844 P.first->second = LUIdx;
2845 Uses.push_back(LSRUse(Kind, AccessTy));
2846 LSRUse &LU = Uses[LUIdx];
2847
2848 LU.MinOffset = Offset;
2849 LU.MaxOffset = Offset;
2850 return std::make_pair(LUIdx, Offset);
2851}
2852
2853/// Delete the given use from the Uses list.
2854void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2855 if (&LU != &Uses.back())
2856 std::swap(LU, Uses.back());
2857 Uses.pop_back();
2858
2859 // Update RegUses.
2860 RegUses.swapAndDropUse(LUIdx, Uses.size());
2861}
2862
2863/// Look for a use distinct from OrigLU which is has a formula that has the same
2864/// registers as the given formula.
2865LSRUse *
2866LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2867 const LSRUse &OrigLU) {
2868 // Search all uses for the formula. This could be more clever.
2869 for (LSRUse &LU : Uses) {
2870 // Check whether this use is close enough to OrigLU, to see whether it's
2871 // worthwhile looking through its formulae.
2872 // Ignore ICmpZero uses because they may contain formulae generated by
2873 // GenerateICmpZeroScales, in which case adding fixup offsets may
2874 // be invalid.
2875 if (&LU != &OrigLU &&
2876 LU.Kind != LSRUse::ICmpZero &&
2877 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2878 LU.WidestFixupType == OrigLU.WidestFixupType &&
2879 LU.HasFormulaWithSameRegs(OrigF)) {
2880 // Scan through this use's formulae.
2881 for (const Formula &F : LU.Formulae) {
2882 // Check to see if this formula has the same registers and symbols
2883 // as OrigF.
2884 if (F.BaseRegs == OrigF.BaseRegs &&
2885 F.ScaledReg == OrigF.ScaledReg &&
2886 F.BaseGV == OrigF.BaseGV &&
2887 F.Scale == OrigF.Scale &&
2888 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2889 if (F.BaseOffset.isZero())
2890 return &LU;
2891 // This is the formula where all the registers and symbols matched;
2892 // there aren't going to be any others. Since we declined it, we
2893 // can skip the rest of the formulae and proceed to the next LSRUse.
2894 break;
2895 }
2896 }
2897 }
2898 }
2899
2900 // Nothing looked good.
2901 return nullptr;
2902}
2903
2904void LSRInstance::CollectInterestingTypesAndFactors() {
2905 SmallSetVector<const SCEV *, 4> Strides;
2906
2907 // Collect interesting types and strides.
2909 for (const IVStrideUse &U : IU) {
2910 const SCEV *Expr = IU.getExpr(U);
2911 if (!Expr)
2912 continue;
2913
2914 // Collect interesting types.
2915 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2916
2917 // Add strides for mentioned loops.
2918 Worklist.push_back(Expr);
2919 do {
2920 const SCEV *S = Worklist.pop_back_val();
2921 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2922 if (AR->getLoop() == L)
2923 Strides.insert(AR->getStepRecurrence(SE));
2924 Worklist.push_back(AR->getStart());
2925 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2926 append_range(Worklist, Add->operands());
2927 }
2928 } while (!Worklist.empty());
2929 }
2930
2931 // Compute interesting factors from the set of interesting strides.
2932 for (SmallSetVector<const SCEV *, 4>::const_iterator
2933 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2934 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2935 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2936 const SCEV *OldStride = *I;
2937 const SCEV *NewStride = *NewStrideIter;
2938
2939 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2940 SE.getTypeSizeInBits(NewStride->getType())) {
2941 if (SE.getTypeSizeInBits(OldStride->getType()) >
2942 SE.getTypeSizeInBits(NewStride->getType()))
2943 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2944 else
2945 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2946 }
2947 if (const SCEVConstant *Factor =
2948 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2949 SE, true))) {
2950 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2951 Factors.insert(Factor->getAPInt().getSExtValue());
2952 } else if (const SCEVConstant *Factor =
2954 NewStride,
2955 SE, true))) {
2956 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2957 Factors.insert(Factor->getAPInt().getSExtValue());
2958 }
2959 }
2960
2961 // If all uses use the same type, don't bother looking for truncation-based
2962 // reuse.
2963 if (Types.size() == 1)
2964 Types.clear();
2965
2966 LLVM_DEBUG(print_factors_and_types(dbgs()));
2967}
2968
2969/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2970/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2971/// IVStrideUses, we could partially skip this.
2972static User::op_iterator
2974 Loop *L, ScalarEvolution &SE) {
2975 for(; OI != OE; ++OI) {
2976 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2977 if (!SE.isSCEVable(Oper->getType()))
2978 continue;
2979
2980 if (const SCEVAddRecExpr *AR =
2982 if (AR->getLoop() == L)
2983 break;
2984 }
2985 }
2986 }
2987 return OI;
2988}
2989
2990/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2991/// a convenient helper.
2993 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2994 return Trunc->getOperand(0);
2995 return Oper;
2996}
2997
2998/// Return an approximation of this SCEV expression's "base", or NULL for any
2999/// constant. Returning the expression itself is conservative. Returning a
3000/// deeper subexpression is more precise and valid as long as it isn't less
3001/// complex than another subexpression. For expressions involving multiple
3002/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3003/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3004/// IVInc==b-a.
3005///
3006/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3007/// SCEVUnknown, we simply return the rightmost SCEV operand.
3008static const SCEV *getExprBase(const SCEV *S) {
3009 switch (S->getSCEVType()) {
3010 default: // including scUnknown.
3011 return S;
3012 case scConstant:
3013 case scVScale:
3014 return nullptr;
3015 case scTruncate:
3016 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3017 case scZeroExtend:
3018 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3019 case scSignExtend:
3020 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3021 case scAddExpr: {
3022 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3023 // there's nothing more complex.
3024 // FIXME: not sure if we want to recognize negation.
3025 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3026 for (const SCEV *SubExpr : reverse(Add->operands())) {
3027 if (SubExpr->getSCEVType() == scAddExpr)
3028 return getExprBase(SubExpr);
3029
3030 if (SubExpr->getSCEVType() != scMulExpr)
3031 return SubExpr;
3032 }
3033 return S; // all operands are scaled, be conservative.
3034 }
3035 case scAddRecExpr:
3036 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3037 }
3038 llvm_unreachable("Unknown SCEV kind!");
3039}
3040
3041/// Return true if the chain increment is profitable to expand into a loop
3042/// invariant value, which may require its own register. A profitable chain
3043/// increment will be an offset relative to the same base. We allow such offsets
3044/// to potentially be used as chain increment as long as it's not obviously
3045/// expensive to expand using real instructions.
3046bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3047 const SCEV *IncExpr,
3048 ScalarEvolution &SE) {
3049 // Aggressively form chains when -stress-ivchain.
3050 if (StressIVChain)
3051 return true;
3052
3053 // Do not replace a constant offset from IV head with a nonconstant IV
3054 // increment.
3055 if (!isa<SCEVConstant>(IncExpr)) {
3056 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3057 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3058 return false;
3059 }
3060
3061 SmallPtrSet<const SCEV*, 8> Processed;
3062 return !isHighCostExpansion(IncExpr, Processed, SE);
3063}
3064
3065/// Return true if the number of registers needed for the chain is estimated to
3066/// be less than the number required for the individual IV users. First prohibit
3067/// any IV users that keep the IV live across increments (the Users set should
3068/// be empty). Next count the number and type of increments in the chain.
3069///
3070/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3071/// effectively use postinc addressing modes. Only consider it profitable it the
3072/// increments can be computed in fewer registers when chained.
3073///
3074/// TODO: Consider IVInc free if it's already used in another chains.
3075static bool isProfitableChain(IVChain &Chain,
3077 ScalarEvolution &SE,
3078 const TargetTransformInfo &TTI) {
3079 if (StressIVChain)
3080 return true;
3081
3082 if (!Chain.hasIncs())
3083 return false;
3084
3085 if (!Users.empty()) {
3086 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3087 for (Instruction *Inst
3088 : Users) { dbgs() << " " << *Inst << "\n"; });
3089 return false;
3090 }
3091 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3092
3093 // The chain itself may require a register, so initialize cost to 1.
3094 int cost = 1;
3095
3096 // A complete chain likely eliminates the need for keeping the original IV in
3097 // a register. LSR does not currently know how to form a complete chain unless
3098 // the header phi already exists.
3099 if (isa<PHINode>(Chain.tailUserInst())
3100 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3101 --cost;
3102 }
3103 const SCEV *LastIncExpr = nullptr;
3104 unsigned NumConstIncrements = 0;
3105 unsigned NumVarIncrements = 0;
3106 unsigned NumReusedIncrements = 0;
3107
3108 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3109 return true;
3110
3111 for (const IVInc &Inc : Chain) {
3112 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3113 return true;
3114 if (Inc.IncExpr->isZero())
3115 continue;
3116
3117 // Incrementing by zero or some constant is neutral. We assume constants can
3118 // be folded into an addressing mode or an add's immediate operand.
3119 if (isa<SCEVConstant>(Inc.IncExpr)) {
3120 ++NumConstIncrements;
3121 continue;
3122 }
3123
3124 if (Inc.IncExpr == LastIncExpr)
3125 ++NumReusedIncrements;
3126 else
3127 ++NumVarIncrements;
3128
3129 LastIncExpr = Inc.IncExpr;
3130 }
3131 // An IV chain with a single increment is handled by LSR's postinc
3132 // uses. However, a chain with multiple increments requires keeping the IV's
3133 // value live longer than it needs to be if chained.
3134 if (NumConstIncrements > 1)
3135 --cost;
3136
3137 // Materializing increment expressions in the preheader that didn't exist in
3138 // the original code may cost a register. For example, sign-extended array
3139 // indices can produce ridiculous increments like this:
3140 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3141 cost += NumVarIncrements;
3142
3143 // Reusing variable increments likely saves a register to hold the multiple of
3144 // the stride.
3145 cost -= NumReusedIncrements;
3146
3147 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3148 << "\n");
3149
3150 return cost < 0;
3151}
3152
/// Add this IV user to an existing chain or make it the head of a new chain.
void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
  // When IVs are used as types of varying widths, they are generally converted
  // to a wider type with some uses remaining narrow under a (free) trunc.
  Value *const NextIV = getWideOperand(IVOper);
  const SCEV *const OperExpr = SE.getSCEV(NextIV);
  const SCEV *const OperExprBase = getExprBase(OperExpr);

  // Visit all existing chains. Check if its IVOper can be computed as a
  // profitable loop invariant increment from the last link in the Chain.
  unsigned ChainIdx = 0, NChains = IVChainVec.size();
  const SCEV *LastIncExpr = nullptr;
  for (; ChainIdx < NChains; ++ChainIdx) {
    IVChain &Chain = IVChainVec[ChainIdx];

    // Prune the solution space aggressively by checking that both IV operands
    // are expressions that operate on the same unscaled SCEVUnknown. This
    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
    // first avoids creating extra SCEV expressions.
    if (!StressIVChain && Chain.ExprBase != OperExprBase)
      continue;

    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
    if (PrevIV->getType() != NextIV->getType())
      continue;

    // A phi node terminates a chain.
    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
      continue;

    // The increment must be loop-invariant so it can be kept in a register.
    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
    if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
      continue;

    // Take the first chain whose increment looks cheap enough to expand.
    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
      LastIncExpr = IncExpr;
      break;
    }
  }
  // If we haven't found a chain, create a new one, unless we hit the max. Don't
  // bother for phi nodes, because they must be last in the chain.
  if (ChainIdx == NChains) {
    if (isa<PHINode>(UserInst))
      return;
    if (NChains >= MaxChains && !StressIVChain) {
      LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
      return;
    }
    LastIncExpr = OperExpr;
    // IVUsers may have skipped over sign/zero extensions. We don't currently
    // attempt to form chains involving extensions unless they can be hoisted
    // into this loop's AddRec.
    if (!isa<SCEVAddRecExpr>(LastIncExpr))
      return;
    ++NChains;
    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
                                 OperExprBase));
    ChainUsersVec.resize(NChains);
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
                      << ") IV=" << *LastIncExpr << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
                      << ") IV+" << *LastIncExpr << "\n");
    // Add this IV user to the end of the chain.
    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
  }
  // Whether freshly created or extended, ChainIdx now names the chain that
  // received this user.
  IVChain &Chain = IVChainVec[ChainIdx];

  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
  // This chain's NearUsers become FarUsers.
  if (!LastIncExpr->isZero()) {
    ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
    NearUsers.clear();
  }

  // All other uses of IVOperand become near uses of the chain.
  // We currently ignore intermediate values within SCEV expressions, assuming
  // they will eventually be used be the current chain, or can be computed
  // from one of the chain increments. To be more precise we could
  // transitively follow its user and only add leaf IV users to the set.
  for (User *U : IVOper->users()) {
    Instruction *OtherUse = dyn_cast<Instruction>(U);
    if (!OtherUse)
      continue;
    // Uses in the chain will no longer be uses if the chain is formed.
    // Include the head of the chain in this iteration (not Chain.begin()).
    IVChain::const_iterator IncIter = Chain.Incs.begin();
    IVChain::const_iterator IncEnd = Chain.Incs.end();
    for( ; IncIter != IncEnd; ++IncIter) {
      if (IncIter->UserInst == OtherUse)
        break;
    }
    if (IncIter != IncEnd)
      continue;

    // Skip non-leaf IV users; they feed other SCEV expressions rather than
    // consuming the IV value directly.
    if (SE.isSCEVable(OtherUse->getType())
        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
        && IU.isIVUserOrOperand(OtherUse)) {
      continue;
    }
    NearUsers.insert(OtherUse);
  }

  // Since this user is part of the chain, it's no longer considered a use
  // of the chain.
  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
}
3263
/// Populate the vector of Chains.
///
/// This decreases ILP at the architecture level. Targets with ample registers,
/// multiple memory ports, and no register renaming probably don't want
/// this. However, such targets should probably disable LSR altogether.
///
/// The job of LSR is to make a reasonable choice of induction variables across
/// the loop. Subsequent passes can easily "unchain" computation exposing more
/// ILP *within the loop* if the target wants it.
///
/// Finding the best IV chain is potentially a scheduling problem. Since LSR
/// will not reorder memory operations, it will recognize this as a chain, but
/// will generate redundant IV increments. Ideally this would be corrected later
/// by a smart scheduler:
///        = A[i]
///        = A[i+x]
/// A[i]   =
/// A[i+x] =
///
/// TODO: Walk the entire domtree within this loop, not just the path to the
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
  LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
  SmallVector<ChainUsers, 8> ChainUsersVec;

  // Build the dominator path from the loop latch up to the loop header; it is
  // walked in reverse below so instructions are visited in program order.
  SmallVector<BasicBlock *,8> LatchPath;
  BasicBlock *LoopHeader = L->getHeader();
  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
    LatchPath.push_back(Rung->getBlock());
  }
  LatchPath.push_back(LoopHeader);

  // Walk the instruction stream from the loop header to the loop latch.
  for (BasicBlock *BB : reverse(LatchPath)) {
    for (Instruction &I : *BB) {
      // Skip instructions that weren't seen by IVUsers analysis.
      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
        continue;

      // Ignore users that are part of a SCEV expression. This way we only
      // consider leaf IV Users. This effectively rediscovers a portion of
      // IVUsers analysis but in program order this time.
      if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
        continue;

      // Remove this instruction from any NearUsers set it may be in.
      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
           ChainIdx < NChains; ++ChainIdx) {
        ChainUsersVec[ChainIdx].NearUsers.erase(&I);
      }
      // Search for operands that can be chained.
      SmallPtrSet<Instruction*, 4> UniqueOperands;
      User::op_iterator IVOpEnd = I.op_end();
      User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
      while (IVOpIter != IVOpEnd) {
        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
        // Offer each distinct IV operand to the chains only once.
        if (UniqueOperands.insert(IVOpInst).second)
          ChainInstruction(&I, IVOpInst, ChainUsersVec);
        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
      }
    } // Continue walking down the instructions.
  } // Continue walking down the domtree.
  // Visit phi backedges to determine if the chain can generate the IV postinc.
  for (PHINode &PN : L->getHeader()->phis()) {
    if (!SE.isSCEVable(PN.getType()))
      continue;

    Instruction *IncV =
        dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
    if (IncV)
      ChainInstruction(&PN, IncV, ChainUsersVec);
  }
  // Remove any unprofitable chains, compacting the survivors to the front of
  // IVChainVec.
  unsigned ChainIdx = 0;
  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
       UsersIdx < NChains; ++UsersIdx) {
    if (!isProfitableChain(IVChainVec[UsersIdx],
                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
      continue;
    // Preserve the chain at UsesIdx.
    if (ChainIdx != UsersIdx)
      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
    FinalizeChain(IVChainVec[ChainIdx]);
    ++ChainIdx;
  }
  IVChainVec.resize(ChainIdx);
}
3353
3354void LSRInstance::FinalizeChain(IVChain &Chain) {
3355 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3356 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3357
3358 for (const IVInc &Inc : Chain) {
3359 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3360 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3361 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3362 IVIncSet.insert(UseI);
3363 }
3364}
3365
3366/// Return true if the IVInc can be folded into an addressing mode.
3367static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3368 Value *Operand, const TargetTransformInfo &TTI) {
3369 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3370 Immediate IncOffset = Immediate::getZero();
3371 if (IncConst) {
3372 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3373 return false;
3374 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3375 } else {
3376 // Look for mul(vscale, constant), to detect a scalable offset.
3377 const APInt *C;
3378 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3379 C->getSignificantBits() > 64)
3380 return false;
3381 IncOffset = Immediate::getScalable(C->getSExtValue());
3382 }
3383
3384 if (!isAddressUse(TTI, UserInst, Operand))
3385 return false;
3386
3387 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3388 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3389 IncOffset, /*HasBaseReg=*/false))
3390 return false;
3391
3392 return true;
3393}
3394
3395/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3396/// user's operand from the previous IV user's operand.
3397void LSRInstance::GenerateIVChain(const IVChain &Chain,
3398 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3399 // Find the new IVOperand for the head of the chain. It may have been replaced
3400 // by LSR.
3401 const IVInc &Head = Chain.Incs[0];
3402 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3403 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3404 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3405 IVOpEnd, L, SE);
3406 Value *IVSrc = nullptr;
3407 while (IVOpIter != IVOpEnd) {
3408 IVSrc = getWideOperand(*IVOpIter);
3409
3410 // If this operand computes the expression that the chain needs, we may use
3411 // it. (Check this after setting IVSrc which is used below.)
3412 //
3413 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3414 // narrow for the chain, so we can no longer use it. We do allow using a
3415 // wider phi, assuming the LSR checked for free truncation. In that case we
3416 // should already have a truncate on this operand such that
3417 // getSCEV(IVSrc) == IncExpr.
3418 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3419 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3420 break;
3421 }
3422 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3423 }
3424 if (IVOpIter == IVOpEnd) {
3425 // Gracefully give up on this chain.
3426 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3427 return;
3428 }
3429 assert(IVSrc && "Failed to find IV chain source");
3430
3431 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3432 Type *IVTy = IVSrc->getType();
3433 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3434 const SCEV *LeftOverExpr = nullptr;
3435 const SCEV *Accum = SE.getZero(IntTy);
3437 Bases.emplace_back(Accum, IVSrc);
3438
3439 for (const IVInc &Inc : Chain) {
3440 Instruction *InsertPt = Inc.UserInst;
3441 if (isa<PHINode>(InsertPt))
3442 InsertPt = L->getLoopLatch()->getTerminator();
3443
3444 // IVOper will replace the current IV User's operand. IVSrc is the IV
3445 // value currently held in a register.
3446 Value *IVOper = IVSrc;
3447 if (!Inc.IncExpr->isZero()) {
3448 // IncExpr was the result of subtraction of two narrow values, so must
3449 // be signed.
3450 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3451 Accum = SE.getAddExpr(Accum, IncExpr);
3452 LeftOverExpr = LeftOverExpr ?
3453 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3454 }
3455
3456 // Look through each base to see if any can produce a nice addressing mode.
3457 bool FoundBase = false;
3458 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3459 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3460 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3461 if (!Remainder->isZero()) {
3462 Rewriter.clearPostInc();
3463 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3464 const SCEV *IVOperExpr =
3465 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3466 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3467 } else {
3468 IVOper = MapIVOper;
3469 }
3470
3471 FoundBase = true;
3472 break;
3473 }
3474 }
3475 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3476 // Expand the IV increment.
3477 Rewriter.clearPostInc();
3478 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3479 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3480 SE.getUnknown(IncV));
3481 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3482
3483 // If an IV increment can't be folded, use it as the next IV value.
3484 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3485 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3486 Bases.emplace_back(Accum, IVOper);
3487 IVSrc = IVOper;
3488 LeftOverExpr = nullptr;
3489 }
3490 }
3491 Type *OperTy = Inc.IVOperand->getType();
3492 if (IVTy != OperTy) {
3493 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3494 "cannot extend a chained IV");
3495 IRBuilder<> Builder(InsertPt);
3496 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3497 }
3498 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3499 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3500 DeadInsts.emplace_back(OperandIsInstr);
3501 }
3502 // If LSR created a new, wider phi, we may also replace its postinc. We only
3503 // do this if we also found a wide value for the head of the chain.
3504 if (isa<PHINode>(Chain.tailUserInst())) {
3505 for (PHINode &Phi : L->getHeader()->phis()) {
3506 if (Phi.getType() != IVSrc->getType())
3507 continue;
3509 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3510 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3511 continue;
3512 Value *IVOper = IVSrc;
3513 Type *PostIncTy = PostIncV->getType();
3514 if (IVTy != PostIncTy) {
3515 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3516 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3517 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3518 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3519 }
3520 Phi.replaceUsesOfWith(PostIncV, IVOper);
3521 DeadInsts.emplace_back(PostIncV);
3522 }
3523 }
3524}
3525
3526void LSRInstance::CollectFixupsAndInitialFormulae() {
3527 CondBrInst *ExitBranch = nullptr;
3528 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3529
3530 // For calculating baseline cost
3531 SmallPtrSet<const SCEV *, 16> Regs;
3532 DenseSet<const SCEV *> VisitedRegs;
3533 DenseSet<size_t> VisitedLSRUse;
3534
3535 for (const IVStrideUse &U : IU) {
3536 Instruction *UserInst = U.getUser();
3537 // Skip IV users that are part of profitable IV Chains.
3538 User::op_iterator UseI =
3539 find(UserInst->operands(), U.getOperandValToReplace());
3540 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3541 if (IVIncSet.count(UseI)) {
3542 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3543 continue;
3544 }
3545
3546 LSRUse::KindType Kind = LSRUse::Basic;
3547 MemAccessTy AccessTy;
3548 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3549 Kind = LSRUse::Address;
3550 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3551 }
3552
3553 const SCEV *S = IU.getExpr(U);
3554 if (!S)
3555 continue;
3556 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3557
3558 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3559 // (N - i == 0), and this allows (N - i) to be the expression that we work
3560 // with rather than just N or i, so we can consider the register
3561 // requirements for both N and i at the same time. Limiting this code to
3562 // equality icmps is not a problem because all interesting loops use
3563 // equality icmps, thanks to IndVarSimplify.
3564 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3565 // If CI can be saved in some target, like replaced inside hardware loop
3566 // in PowerPC, no need to generate initial formulae for it.
3567 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3568 continue;
3569 if (CI->isEquality()) {
3570 // Swap the operands if needed to put the OperandValToReplace on the
3571 // left, for consistency.
3572 Value *NV = CI->getOperand(1);
3573 if (NV == U.getOperandValToReplace()) {
3574 CI->setOperand(1, CI->getOperand(0));
3575 CI->setOperand(0, NV);
3576 NV = CI->getOperand(1);
3577 Changed = true;
3578 }
3579
3580 // x == y --> x - y == 0
3581 const SCEV *N = SE.getSCEV(NV);
3582 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3583 (!NV->getType()->isPointerTy() ||
3584 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3585 // S is normalized, so normalize N before folding it into S
3586 // to keep the result normalized.
3587 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3588 if (!N)
3589 continue;
3590 Kind = LSRUse::ICmpZero;
3591 S = SE.getMinusSCEV(N, S);
3592 } else if (L->isLoopInvariant(NV) &&
3593 (!isa<Instruction>(NV) ||
3594 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3595 !NV->getType()->isPointerTy()) {
3596 // If we can't generally expand the expression (e.g. it contains
3597 // a divide), but it is already at a loop invariant point before the
3598 // loop, wrap it in an unknown (to prevent the expander from trying
3599 // to re-expand in a potentially unsafe way.) The restriction to
3600 // integer types is required because the unknown hides the base, and
3601 // SCEV can't compute the difference of two unknown pointers.
3602 N = SE.getUnknown(NV);
3603 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3604 if (!N)
3605 continue;
3606 Kind = LSRUse::ICmpZero;
3607 S = SE.getMinusSCEV(N, S);
3609 }
3610
3611 // -1 and the negations of all interesting strides (except the negation
3612 // of -1) are now also interesting.
3613 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3614 if (Factors[i] != -1)
3615 Factors.insert(-(uint64_t)Factors[i]);
3616 Factors.insert(-1);
3617 }
3618 }
3619
3620 // Get or create an LSRUse.
3621 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3622 size_t LUIdx = P.first;
3623 Immediate Offset = P.second;
3624 LSRUse &LU = Uses[LUIdx];
3625
3626 // Record the fixup.
3627 LSRFixup &LF = LU.getNewFixup();
3628 LF.UserInst = UserInst;
3629 LF.OperandValToReplace = U.getOperandValToReplace();
3630 LF.PostIncLoops = TmpPostIncLoops;
3631 LF.Offset = Offset;
3632 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3633 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3634
3635 // Create SCEV as Formula for calculating baseline cost
3636 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3637 Formula F;
3638 F.initialMatch(S, L, SE);
3639 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3640 HardwareLoopProfitable);
3641 VisitedLSRUse.insert(LUIdx);
3642 }
3643
3644 if (!LU.WidestFixupType ||
3645 SE.getTypeSizeInBits(LU.WidestFixupType) <
3646 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3647 LU.WidestFixupType = LF.OperandValToReplace->getType();
3648
3649 // If this is the first use of this LSRUse, give it a formula.
3650 if (LU.Formulae.empty()) {
3651 InsertInitialFormula(S, LU, LUIdx);
3652 CountRegisters(LU.Formulae.back(), LUIdx);
3653 }
3654 }
3655
3656 LLVM_DEBUG(print_fixups(dbgs()));
3657}
3658
3659/// Insert a formula for the given expression into the given use, separating out
3660/// loop-variant portions from loop-invariant and loop-computable portions.
3661void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3662 size_t LUIdx) {
3663 // Mark uses whose expressions cannot be expanded.
3664 if (!Rewriter.isSafeToExpand(S))
3665 LU.RigidFormula = true;
3666
3667 Formula F;
3668 F.initialMatch(S, L, SE);
3669 bool Inserted = InsertFormula(LU, LUIdx, F);
3670 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3671}
3672
3673/// Insert a simple single-register formula for the given expression into the
3674/// given use.
3675void
3676LSRInstance::InsertSupplementalFormula(const SCEV *S,
3677 LSRUse &LU, size_t LUIdx) {
3678 Formula F;
3679 F.BaseRegs.push_back(S);
3680 F.HasBaseReg = true;
3681 bool Inserted = InsertFormula(LU, LUIdx, F);
3682 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3683}
3684
3685/// Note which registers are used by the given formula, updating RegUses.
3686void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3687 if (F.ScaledReg)
3688 RegUses.countRegister(F.ScaledReg, LUIdx);
3689 for (const SCEV *BaseReg : F.BaseRegs)
3690 RegUses.countRegister(BaseReg, LUIdx);
3691}
3692
3693/// If the given formula has not yet been inserted, add it to the list, and
3694/// return true. Return false otherwise.
3695bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3696 // Do not insert formula that we will not be able to expand.
3697 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3698 "Formula is illegal");
3699
3700 if (!LU.InsertFormula(F, *L))
3701 return false;
3702
3703 CountRegisters(F, LUIdx);
3704 return true;
3705}
3706
3707/// Test whether this fixup will be executed each time the corresponding IV
3708/// increment instruction is executed.
3709bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3710 // If the fixup block dominates the IV increment block then there is no path
3711 // through the loop to the increment that doesn't pass through the fixup.
3712 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3713}
3714
3715/// Check for other uses of loop-invariant values which we're tracking. These
3716/// other uses will pin these values in registers, making them less profitable
3717/// for elimination.
3718/// TODO: This currently misses non-constant addrec step registers.
3719/// TODO: Should this give more weight to users inside the loop?
3720void
3721LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3722 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3723 SmallPtrSet<const SCEV *, 32> Visited;
3724
3725 // Don't collect outside uses if we are favoring postinc - the instructions in
3726 // the loop are more important than the ones outside of it.
3727 if (AMK == TTI::AMK_PostIndexed)
3728 return;
3729
3730 while (!Worklist.empty()) {
3731 const SCEV *S = Worklist.pop_back_val();
3732
3733 // Don't process the same SCEV twice
3734 if (!Visited.insert(S).second)
3735 continue;
3736
3737 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3738 append_range(Worklist, N->operands());
3739 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3740 Worklist.push_back(C->getOperand());
3741 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3742 Worklist.push_back(D->getLHS());
3743 Worklist.push_back(D->getRHS());
3744 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3745 const Value *V = US->getValue();
3746 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3747 // Look for instructions defined outside the loop.
3748 if (L->contains(Inst)) continue;
3749 } else if (isa<Constant>(V))
3750 // Constants can be re-materialized.
3751 continue;
3752 for (const Use &U : V->uses()) {
3753 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3754 // Ignore non-instructions.
3755 if (!UserInst)
3756 continue;
3757 // Don't bother if the instruction is an EHPad.
3758 if (UserInst->isEHPad())
3759 continue;
3760 // Ignore instructions in other functions (as can happen with
3761 // Constants).
3762 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3763 continue;
3764 // Ignore instructions not dominated by the loop.
3765 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3766 UserInst->getParent() :
3767 cast<PHINode>(UserInst)->getIncomingBlock(
3769 if (!DT.dominates(L->getHeader(), UseBB))
3770 continue;
3771 // Don't bother if the instruction is in a BB which ends in an EHPad.
3772 if (UseBB->getTerminator()->isEHPad())
3773 continue;
3774
3775 // Ignore cases in which the currently-examined value could come from
3776 // a basic block terminated with an EHPad. This checks all incoming
3777 // blocks of the phi node since it is possible that the same incoming
3778 // value comes from multiple basic blocks, only some of which may end
3779 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3780 // pass would try to insert instructions into an EHPad, hitting an
3781 // assertion.
3782 if (isa<PHINode>(UserInst)) {
3783 const auto *PhiNode = cast<PHINode>(UserInst);
3784 bool HasIncompatibleEHPTerminatedBlock = false;
3785 llvm::Value *ExpectedValue = U;
3786 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3787 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3788 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3789 HasIncompatibleEHPTerminatedBlock = true;
3790 break;
3791 }
3792 }
3793 }
3794 if (HasIncompatibleEHPTerminatedBlock) {
3795 continue;
3796 }
3797 }
3798
3799 // Don't bother rewriting PHIs in catchswitch blocks.
3800 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3801 continue;
3802 // Ignore uses which are part of other SCEV expressions, to avoid
3803 // analyzing them multiple times.
3804 if (SE.isSCEVable(UserInst->getType())) {
3805 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3806 // If the user is a no-op, look through to its uses.
3807 if (!isa<SCEVUnknown>(UserS))
3808 continue;
3809 if (UserS == US) {
3810 Worklist.push_back(
3811 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3812 continue;
3813 }
3814 }
3815 // Ignore icmp instructions which are already being analyzed.
3816 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3817 unsigned OtherIdx = !U.getOperandNo();
3818 Value *OtherOp = ICI->getOperand(OtherIdx);
3819 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3820 continue;
3821 }
3822
3823 // Do not consider uses inside lifetime intrinsics. These are not
3824 // actually materialized.
3825 if (UserInst->isLifetimeStartOrEnd())
3826 continue;
3827
3828 std::pair<size_t, Immediate> P =
3829 getUse(S, LSRUse::Basic, MemAccessTy());
3830 size_t LUIdx = P.first;
3831 Immediate Offset = P.second;
3832 LSRUse &LU = Uses[LUIdx];
3833 LSRFixup &LF = LU.getNewFixup();
3834 LF.UserInst = const_cast<Instruction *>(UserInst);
3835 LF.OperandValToReplace = U;
3836 LF.Offset = Offset;
3837 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3838 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3839 if (!LU.WidestFixupType ||
3840 SE.getTypeSizeInBits(LU.WidestFixupType) <
3841 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3842 LU.WidestFixupType = LF.OperandValToReplace->getType();
3843 InsertSupplementalFormula(US, LU, LUIdx);
3844 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3845 break;
3846 }
3847 }
3848 }
3849}
3850
/// Split S into subexpressions which can be pulled out into separate
/// registers. If C is non-null, multiply each subexpression by C.
///
/// Return remainder expression after factoring the subexpressions captured by
/// Ops. If Ops is complete, return NULL.
static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
                                   // NOTE(review): the output-vector "Ops"
                                   // parameter line appears to have been
                                   // dropped here by extraction (the body
                                   // pushes into Ops) — restore from upstream.
                                   const Loop *L,
                                   ScalarEvolution &SE,
                                   unsigned Depth = 0) {
  // Arbitrarily cap recursion to protect compile time.
  if (Depth >= 3)
    return S;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // Break out add operands: each operand is split recursively; whatever
    // cannot be split further is appended to Ops (scaled by C if present).
    for (const SCEV *S : Add->operands()) {
      const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
      if (Remainder)
        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
    }
    // An add is fully absorbed into Ops; nothing remains.
    return nullptr;
  }
  const SCEV *Start, *Step;
  const SCEVConstant *Op0;
  const SCEV *Op1;
  if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
    // Split a non-zero base out of an addrec.
    if (Start->isZero())
      return S;

    const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
    // Split the non-zero AddRec unless it is part of a nested recurrence that
    // does not pertain to this loop.
    if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
                      !isa<SCEVAddRecExpr>(Remainder))) {
      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
      Remainder = nullptr;
    }
    if (Remainder != Start) {
      // Rebuild the addrec over whatever base could not be split out.
      if (!Remainder)
        Remainder = SE.getConstant(S->getType(), 0);
      return SE.getAddRecExpr(Remainder, Step,
                              cast<SCEVAddRecExpr>(S)->getLoop(),
                              // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                              // NOTE(review): the closing argument (likely
                              // "SCEV::FlagAnyWrap);") appears to have been
                              // dropped here by extraction.
    }
  } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
    // Break (C * (a + b + c)) into C*a + C*b + C*c.
    C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
    const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
    if (Remainder)
      Ops.push_back(SE.getMulExpr(C, Remainder));
    return nullptr;
  }
  // Nothing to split; S itself is the remainder.
  return S;
}
3908
3909/// Return true if the SCEV represents a value that may end up as a
3910/// post-increment operation.
3912 LSRUse &LU, const SCEV *S, const Loop *L,
3913 ScalarEvolution &SE) {
3914 if (LU.Kind != LSRUse::Address ||
3915 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3916 return false;
3917 const SCEV *Start;
3918 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3919 return false;
3920 // Check if a post-indexed load/store can be used.
3921 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3922 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3923 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3924 return true;
3925 }
3926 return false;
3927}
3928
/// Helper function for LSRInstance::GenerateReassociations.
///
/// Splits the register at \p Idx of \p Base (the scaled register when
/// \p IsScaledReg) into a sum of subexpressions and, for each addend, tries a
/// formula where that addend becomes its own register (or an unfolded
/// immediate) while the rest stay summed.
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
                                             const Formula &Base,
                                             unsigned Depth, size_t Idx,
                                             bool IsScaledReg) {
  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
  // Don't generate reassociations for the base register of a value that
  // may generate a post-increment operator. The reason is that the
  // reassociations cause extra base+register formula to be created,
  // and possibly chosen, but the post-increment is more efficient.
  if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
    return;
  // NOTE(review): the declaration of AddOps (the SmallVector receiving the
  // split subexpressions) appears to have been dropped here by extraction —
  // restore from upstream.
  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
  if (Remainder)
    AddOps.push_back(Remainder);

  // A single addend cannot be reassociated.
  if (AddOps.size() == 1)
    return;

  // NOTE(review): the opening of this loop ("for (... J = AddOps.begin(),")
  // appears to have been dropped here by extraction.
                                                JE = AddOps.end();
       J != JE; ++J) {
    // Loop-variant "unknown" values are uninteresting; we won't be able to
    // do anything meaningful with them.
    if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
      continue;

    // Don't pull a constant into a register if the constant could be folded
    // into an immediate field.
    if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
                         LU.AccessTy, *J, Base.getNumRegs() > 1))
      continue;

    // Collect all operands except *J.
    SmallVector<SCEVUse, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
    InnerAddOps.append(std::next(J), std::as_const(AddOps).end());

    // Don't leave just a constant behind in a register if the constant could
    // be folded into an immediate field.
    if (InnerAddOps.size() == 1 &&
        isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
                         LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
      continue;

    const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
    if (InnerSum->isZero())
      continue;
    Formula F = Base;

    // Scalable unfolded offsets cannot be combined with the arithmetic below.
    if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
      continue;

    // Add the remaining pieces of the add back into the new formula.
    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                InnerSumSC->getValue()->getZExtValue())) {
      // The inner sum is itself a legal immediate: fold it into the
      // unfolded offset and drop the register entirely.
      F.UnfoldedOffset =
          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                              InnerSumSC->getValue()->getZExtValue());
      if (IsScaledReg) {
        F.ScaledReg = nullptr;
        F.Scale = 0;
      } else
        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
    } else if (IsScaledReg)
      F.ScaledReg = InnerSum;
    else
      F.BaseRegs[Idx] = InnerSum;

    // Add J as its own register, or an unfolded immediate.
    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                SC->getValue()->getZExtValue()))
      F.UnfoldedOffset =
          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                              SC->getValue()->getZExtValue());
    else
      F.BaseRegs.push_back(*J);
    // We may have changed the number of register in base regs, adjust the
    // formula accordingly.
    F.canonicalize(*L);

    if (InsertFormula(LU, LUIdx, F))
      // If that formula hadn't been seen before, recurse to find more like
      // it.
      // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
      // Because just Depth is not enough to bound compile time.
      // This means that every time AddOps.size() is greater 16^x we will add
      // x to Depth.
      GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
                             Depth + 1 + (Log2_32(AddOps.size()) >> 2));
  }
}
4025
4026/// Split out subexpressions from adds and the bases of addrecs.
4027void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4028 Formula Base, unsigned Depth) {
4029 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4030 // Arbitrarily cap recursion to protect compile time.
4031 if (Depth >= 3)
4032 return;
4033
4034 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4035 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4036
4037 if (Base.Scale == 1)
4038 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4039 /* Idx */ -1, /* IsScaledReg */ true);
4040}
4041
4042/// Generate a formula consisting of all of the loop-dominating registers added
4043/// into a single register.
4044void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4045 Formula Base) {
4046 // This method is only interesting on a plurality of registers.
4047 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4048 (Base.UnfoldedOffset.isNonZero()) <=
4049 1)
4050 return;
4051
4052 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4053 // processing the formula.
4054 Base.unscale();
4056 Formula NewBase = Base;
4057 NewBase.BaseRegs.clear();
4058 Type *CombinedIntegerType = nullptr;
4059 for (const SCEV *BaseReg : Base.BaseRegs) {
4060 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4061 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4062 if (!CombinedIntegerType)
4063 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4064 Ops.push_back(BaseReg);
4065 }
4066 else
4067 NewBase.BaseRegs.push_back(BaseReg);
4068 }
4069
4070 // If no register is relevant, we're done.
4071 if (Ops.size() == 0)
4072 return;
4073
4074 // Utility function for generating the required variants of the combined
4075 // registers.
4076 auto GenerateFormula = [&](const SCEV *Sum) {
4077 Formula F = NewBase;
4078
4079 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4080 // opportunity to fold something. For now, just ignore such cases
4081 // rather than proceed with zero in a register.
4082 if (Sum->isZero())
4083 return;
4084
4085 F.BaseRegs.push_back(Sum);
4086 F.canonicalize(*L);
4087 (void)InsertFormula(LU, LUIdx, F);
4088 };
4089
4090 // If we collected at least two registers, generate a formula combining them.
4091 if (Ops.size() > 1) {
4092 SmallVector<SCEVUse, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4093 GenerateFormula(SE.getAddExpr(OpsCopy));
4094 }
4095
4096 // If we have an unfolded offset, generate a formula combining it with the
4097 // registers collected.
4098 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4099 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4100 Ops.push_back(SE.getConstant(CombinedIntegerType,
4101 NewBase.UnfoldedOffset.getFixedValue(), true));
4102 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4103 GenerateFormula(SE.getAddExpr(Ops));
4104 }
4105}
4106
4107/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4108void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4109 const Formula &Base, size_t Idx,
4110 bool IsScaledReg) {
4111 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4112 GlobalValue *GV = ExtractSymbol(G, SE);
4113 if (G->isZero() || !GV)
4114 return;
4115 Formula F = Base;
4116 F.BaseGV = GV;
4117 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4118 return;
4119 if (IsScaledReg)
4120 F.ScaledReg = G;
4121 else
4122 F.BaseRegs[Idx] = G;
4123 (void)InsertFormula(LU, LUIdx, F);
4124}
4125
4126/// Generate reuse formulae using symbolic offsets.
4127void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4128 Formula Base) {
4129 // We can't add a symbolic offset if the address already contains one.
4130 if (Base.BaseGV) return;
4131
4132 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4133 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4134 if (Base.Scale == 1)
4135 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4136 /* IsScaledReg */ true);
4137}
4138
/// Helper function for LSRInstance::GenerateConstantOffsets.
///
/// For the register at \p Idx of \p Base (the scaled register when
/// \p IsScaledReg), try folding each candidate immediate in \p Worklist into
/// the formula's BaseOffset, and also try hoisting an immediate out of the
/// register itself.
void LSRInstance::GenerateConstantOffsetsImpl(
    LSRUse &LU, unsigned LUIdx, const Formula &Base,
    const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {

  // Move Offset out of the register G and into the formula's BaseOffset,
  // inserting the resulting formula if it is legal for this use.
  auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
    Formula F = Base;
    if (!Base.BaseOffset.isCompatibleImmediate(Offset))
      return;
    F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);

    if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
      // Add the offset to the base register.
      const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
      const SCEV *NewG = SE.getAddExpr(NewOffset, G);
      // If it cancelled out, drop the base register, otherwise update it.
      if (NewG->isZero()) {
        if (IsScaledReg) {
          F.Scale = 0;
          F.ScaledReg = nullptr;
        } else
          F.deleteBaseReg(F.BaseRegs[Idx]);
        F.canonicalize(*L);
      } else if (IsScaledReg)
        F.ScaledReg = NewG;
      else
        F.BaseRegs[Idx] = NewG;

      (void)InsertFormula(LU, LUIdx, F);
    }
  };

  SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];

  // With constant offsets and constant steps, we can generate pre-inc
  // accesses by having the offset equal the step. So, for access #0 with a
  // step of 8, we generate a G - 8 base which would require the first access
  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
  // for itself and hopefully becomes the base for other accesses. This
  // means that a single pre-indexed access can be generated to become the new
  // base pointer for each iteration of the loop, resulting in no extra add/sub
  // instructions for pointer updating.
  if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
    const APInt *StepInt;
    if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
      int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
                                           : StepInt->getZExtValue();

      // Shift each fixed candidate offset by the step so the first access
      // lands back on the original address.
      for (Immediate Offset : Worklist) {
        if (Offset.isFixed()) {
          Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
          GenerateOffset(G, Offset);
        }
      }
    }
  }
  // Try each candidate offset as-is.
  for (Immediate Offset : Worklist)
    GenerateOffset(G, Offset);

  // Additionally, try hoisting an immediate out of G itself into BaseOffset.
  // ExtractImmediate updates G in place.
  Immediate Imm = ExtractImmediate(G, SE);
  if (G->isZero() || Imm.isZero() ||
      !Base.BaseOffset.isCompatibleImmediate(Imm))
    return;
  Formula F = Base;
  F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
    return;
  if (IsScaledReg) {
    F.ScaledReg = G;
  } else {
    F.BaseRegs[Idx] = G;
    // We may generate non canonical Formula if G is a recurrent expr reg
    // related with current loop while F.ScaledReg is not.
    F.canonicalize(*L);
  }
  (void)InsertFormula(LU, LUIdx, F);
}
4216
4217/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4218void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4219 Formula Base) {
4220 // TODO: For now, just add the min and max offset, because it usually isn't
4221 // worthwhile looking at everything inbetween.
4223 Worklist.push_back(LU.MinOffset);
4224 if (LU.MaxOffset != LU.MinOffset)
4225 Worklist.push_back(LU.MaxOffset);
4226
4227 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4228 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4229 if (Base.Scale == 1)
4230 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4231 /* IsScaledReg */ true);
4232}
4233
4234/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4235/// == y -> x*c == y*c.
4236void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4237 Formula Base) {
4238 if (LU.Kind != LSRUse::ICmpZero) return;
4239
4240 // Determine the integer type for the base formula.
4241 Type *IntTy = Base.getType();
4242 if (!IntTy) return;
4243 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4244
4245 // Don't do this if there is more than one offset.
4246 if (LU.MinOffset != LU.MaxOffset) return;
4247
4248 // Check if transformation is valid. It is illegal to multiply pointer.
4249 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4250 return;
4251 for (const SCEV *BaseReg : Base.BaseRegs)
4252 if (BaseReg->getType()->isPointerTy())
4253 return;
4254 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4255
4256 // Check each interesting stride.
4257 for (int64_t Factor : Factors) {
4258 // Check that Factor can be represented by IntTy
4259 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4260 continue;
4261 // Check that the multiplication doesn't overflow.
4262 if (Base.BaseOffset.isMin() && Factor == -1)
4263 continue;
4264 // Not supporting scalable immediates.
4265 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4266 continue;
4267 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4268 assert(Factor != 0 && "Zero factor not expected!");
4269 if (NewBaseOffset.getFixedValue() / Factor !=
4270 Base.BaseOffset.getFixedValue())
4271 continue;
4272 // If the offset will be truncated at this use, check that it is in bounds.
4273 if (!IntTy->isPointerTy() &&
4274 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4275 continue;
4276
4277 // Check that multiplying with the use offset doesn't overflow.
4278 Immediate Offset = LU.MinOffset;
4279 if (Offset.isMin() && Factor == -1)
4280 continue;
4281 Offset = Offset.mulUnsigned(Factor);
4282 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4283 continue;
4284 // If the offset will be truncated at this use, check that it is in bounds.
4285 if (!IntTy->isPointerTy() &&
4286 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4287 continue;
4288
4289 Formula F = Base;
4290 F.BaseOffset = NewBaseOffset;
4291
4292 // Check that this scale is legal.
4293 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4294 continue;
4295
4296 // Compensate for the use having MinOffset built into it.
4297 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4298
4299 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4300
4301 // Check that multiplying with each base register doesn't overflow.
4302 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4303 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4304 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4305 goto next;
4306 }
4307
4308 // Check that multiplying with the scaled register doesn't overflow.
4309 if (F.ScaledReg) {
4310 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4311 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4312 continue;
4313 }
4314
4315 // Check that multiplying with the unfolded offset doesn't overflow.
4316 if (F.UnfoldedOffset.isNonZero()) {
4317 if (F.UnfoldedOffset.isMin() && Factor == -1)
4318 continue;
4319 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4320 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4321 Base.UnfoldedOffset.getFixedValue())
4322 continue;
4323 // If the offset will be truncated, check that it is in bounds.
4325 IntTy, F.UnfoldedOffset.getFixedValue()))
4326 continue;
4327 }
4328
4329 // If we make it here and it's legal, add it.
4330 (void)InsertFormula(LU, LUIdx, F);
4331 next:;
4332 }
4333}
4334
/// Generate stride factor reuse formulae by making use of scaled-offset address
/// modes, for example.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  // Try to unscale the formula to generate a better scale.
  if (Base.Scale != 0 && !Base.unscale())
    return;

  assert(Base.Scale == 0 && "unscale did not did its job!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    Base.Scale = Factor;
    // One base register will be promoted into the scaled position below, so a
    // plain base register remains only if there is more than one of them.
    Base.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                    Base)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
                     LU.AccessTy, Base) &&
          LU.AllFixupsOutsideLoop)
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
        Base.BaseOffset.isZero() && !Base.BaseGV)
      continue;
    // For each addrec base reg, if its loop is current loop, apply the scale.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
          if (!Quotient->isZero()) {
            // TODO: This could be optimized to avoid all the copying.
            Formula F = Base;
            // Move this base reg into the scaled slot, divided by the factor.
            F.ScaledReg = Quotient;
            F.deleteBaseReg(F.BaseRegs[i]);
            // The canonical representation of 1*reg is reg, which is already in
            // Base. In that case, do not try to insert the formula, it will be
            // rejected anyway.
            if (F.Scale == 1 && (F.BaseRegs.empty() ||
                                 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
              continue;
            // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
            // non canonical Formula with ScaledReg's loop not being L.
            if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
              F.canonicalize(*L);
            (void)InsertFormula(LU, LUIdx, F);
          }
      }
    }
  }
}
4402
4403/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4404/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4405/// perform the extension/truncate and normalize again, as the normalized form
4406/// can result in folds that are not valid in the post-inc use contexts. The
4407/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4408static const SCEV *
4410 const SCEV *Expr, Type *ToTy,
4411 ScalarEvolution &SE) {
4412 const SCEV *Result = nullptr;
4413 for (auto &L : Loops) {
4414 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4415 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4416 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4417 if (!New || (Result && New != Result))
4418 return nullptr;
4419 Result = New;
4420 }
4421
4422 assert(Result && "failed to create expression");
4423 return Result;
4424}
4425
4426/// Generate reuse formulae from different IV types.
4427void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4428 // Don't bother truncating symbolic values.
4429 if (Base.BaseGV) return;
4430
4431 // Determine the integer type for the base formula.
4432 Type *DstTy = Base.getType();
4433 if (!DstTy) return;
4434 if (DstTy->isPointerTy())
4435 return;
4436
4437 // It is invalid to extend a pointer type so exit early if ScaledReg or
4438 // any of the BaseRegs are pointers.
4439 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4440 return;
4441 if (any_of(Base.BaseRegs,
4442 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4443 return;
4444
4446 for (auto &LF : LU.Fixups)
4447 Loops.push_back(LF.PostIncLoops);
4448
4449 for (Type *SrcTy : Types) {
4450 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4451 Formula F = Base;
4452
4453 // Sometimes SCEV is able to prove zero during ext transform. It may
4454 // happen if SCEV did not do all possible transforms while creating the
4455 // initial node (maybe due to depth limitations), but it can do them while
4456 // taking ext.
4457 if (F.ScaledReg) {
4458 const SCEV *NewScaledReg =
4459 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4460 if (!NewScaledReg || NewScaledReg->isZero())
4461 continue;
4462 F.ScaledReg = NewScaledReg;
4463 }
4464 bool HasZeroBaseReg = false;
4465 for (const SCEV *&BaseReg : F.BaseRegs) {
4466 const SCEV *NewBaseReg =
4467 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4468 if (!NewBaseReg || NewBaseReg->isZero()) {
4469 HasZeroBaseReg = true;
4470 break;
4471 }
4472 BaseReg = NewBaseReg;
4473 }
4474 if (HasZeroBaseReg)
4475 continue;
4476
4477 // TODO: This assumes we've done basic processing on all uses and
4478 // have an idea what the register usage is.
4479 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4480 continue;
4481
4482 F.canonicalize(*L);
4483 (void)InsertFormula(LU, LUIdx, F);
4484 }
4485 }
4486}
4487
namespace {

/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
/// modifications so that the search phase doesn't have to worry about the data
/// structures moving underneath it.
struct WorkItem {
  size_t LUIdx;        // Index of the use whose formulae will be rewritten.
  Immediate Imm;       // Constant offset to add in formulae referencing OrigReg.
  const SCEV *OrigReg; // The register the offset applies to.

  WorkItem(size_t LI, Immediate I, const SCEV *R)
      : LUIdx(LI), Imm(I), OrigReg(R) {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace
4506
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print a human-readable description of this work item for debug output.
void WorkItem::print(raw_ostream &OS) const {
  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
     << " , add offset " << Imm;
}

/// Dump this work item to stderr, followed by a newline.
LLVM_DUMP_METHOD void WorkItem::dump() const {
  print(errs()); errs() << '\n';
}
#endif
4517
4518/// Look for registers which are a constant distance apart and try to form reuse
4519/// opportunities between them.
4520void LSRInstance::GenerateCrossUseConstantOffsets() {
4521 // Group the registers by their value without any added constant offset.
4522 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4523
4524 DenseMap<const SCEV *, ImmMapTy> Map;
4525 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4527 for (const SCEV *Use : RegUses) {
4528 SCEVUse Reg = Use; // Make a copy for ExtractImmediate to modify.
4529 Immediate Imm = ExtractImmediate(Reg, SE);
4530 auto Pair = Map.try_emplace(Reg);
4531 if (Pair.second)
4532 Sequence.push_back(Reg);
4533 Pair.first->second.insert(std::make_pair(Imm, Use));
4534 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4535 }
4536
4537 // Now examine each set of registers with the same base value. Build up
4538 // a list of work to do and do the work in a separate step so that we're
4539 // not adding formulae and register counts while we're searching.
4540 SmallVector<WorkItem, 32> WorkItems;
4541 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4542 UniqueItems;
4543 for (const SCEV *Reg : Sequence) {
4544 const ImmMapTy &Imms = Map.find(Reg)->second;
4545
4546 // It's not worthwhile looking for reuse if there's only one offset.
4547 if (Imms.size() == 1)
4548 continue;
4549
4550 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4551 for (const auto &Entry
4552 : Imms) dbgs()
4553 << ' ' << Entry.first;
4554 dbgs() << '\n');
4555
4556 // Examine each offset.
4557 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4558 J != JE; ++J) {
4559 const SCEV *OrigReg = J->second;
4560
4561 Immediate JImm = J->first;
4562 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4563
4564 if (!isa<SCEVConstant>(OrigReg) &&
4565 UsedByIndicesMap[Reg].count() == 1) {
4566 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4567 << '\n');
4568 continue;
4569 }
4570
4571 // Conservatively examine offsets between this orig reg a few selected
4572 // other orig regs.
4573 Immediate First = Imms.begin()->first;
4574 Immediate Last = std::prev(Imms.end())->first;
4575 if (!First.isCompatibleImmediate(Last)) {
4576 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4577 << "\n");
4578 continue;
4579 }
4580 // Only scalable if both terms are scalable, or if one is scalable and
4581 // the other is 0.
4582 bool Scalable = First.isScalable() || Last.isScalable();
4583 int64_t FI = First.getKnownMinValue();
4584 int64_t LI = Last.getKnownMinValue();
4585 // Compute (First + Last) / 2 without overflow using the fact that
4586 // First + Last = 2 * (First + Last) + (First ^ Last).
4587 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4588 // If the result is negative and FI is odd and LI even (or vice versa),
4589 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4590 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4591 ImmMapTy::const_iterator OtherImms[] = {
4592 Imms.begin(), std::prev(Imms.end()),
4593 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4594 for (const auto &M : OtherImms) {
4595 if (M == J || M == JE) continue;
4596 if (!JImm.isCompatibleImmediate(M->first))
4597 continue;
4598
4599 // Compute the difference between the two.
4600 Immediate Imm = JImm.subUnsigned(M->first);
4601 for (unsigned LUIdx : UsedByIndices.set_bits())
4602 // Make a memo of this use, offset, and register tuple.
4603 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4604 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4605 }
4606 }
4607 }
4608
4609 Map.clear();
4610 Sequence.clear();
4611 UsedByIndicesMap.clear();
4612 UniqueItems.clear();
4613
4614 // Now iterate through the worklist and add new formulae.
4615 for (const WorkItem &WI : WorkItems) {
4616 size_t LUIdx = WI.LUIdx;
4617 LSRUse &LU = Uses[LUIdx];
4618 Immediate Imm = WI.Imm;
4619 const SCEV *OrigReg = WI.OrigReg;
4620
4621 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4622 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4623 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4624
4625 // TODO: Use a more targeted data structure.
4626 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4627 Formula F = LU.Formulae[L];
4628 // FIXME: The code for the scaled and unscaled registers looks
4629 // very similar but slightly different. Investigate if they
4630 // could be merged. That way, we would not have to unscale the
4631 // Formula.
4632 F.unscale();
4633 // Use the immediate in the scaled register.
4634 if (F.ScaledReg == OrigReg) {
4635 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4636 continue;
4637 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4638 // Don't create 50 + reg(-50).
4639 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4640 if (F.referencesReg(S))
4641 continue;
4642 Formula NewF = F;
4643 NewF.BaseOffset = Offset;
4644 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4645 NewF))
4646 continue;
4647 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4648
4649 // If the new scale is a constant in a register, and adding the constant
4650 // value to the immediate would produce a value closer to zero than the
4651 // immediate itself, then the formula isn't worthwhile.
4652 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4653 // FIXME: Do we need to do something for scalable immediates here?
4654 // A scalable SCEV won't be constant, but we might still have
4655 // something in the offset? Bail out for now to be safe.
4656 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4657 continue;
4658 if (C->getValue()->isNegative() !=
4659 (NewF.BaseOffset.isLessThanZero()) &&
4660 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4661 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4662 continue;
4663 }
4664
4665 // OK, looks good.
4666 NewF.canonicalize(*this->L);
4667 (void)InsertFormula(LU, LUIdx, NewF);
4668 } else {
4669 // Use the immediate in a base register.
4670 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4671 const SCEV *BaseReg = F.BaseRegs[N];
4672 if (BaseReg != OrigReg)
4673 continue;
4674 Formula NewF = F;
4675 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4676 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4677 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4678 continue;
4679 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4680 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4681 LU.Kind, LU.AccessTy, NewF)) {
4682 if (AMK == TTI::AMK_PostIndexed &&
4683 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4684 continue;
4685 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4686 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4687 continue;
4688 NewF = F;
4689 NewF.UnfoldedOffset = NewUnfoldedOffset;
4690 }
4691 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4692
4693 // If the new formula has a constant in a register, and adding the
4694 // constant value to the immediate would produce a value closer to
4695 // zero than the immediate itself, then the formula isn't worthwhile.
4696 for (const SCEV *NewReg : NewF.BaseRegs)
4697 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4698 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4699 goto skip_formula;
4700 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4701 .abs()
4702 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4703 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4704 .countr_zero() >=
4706 NewF.BaseOffset.getFixedValue()))
4707 goto skip_formula;
4708 }
4709
4710 // Ok, looks good.
4711 NewF.canonicalize(*this->L);
4712 (void)InsertFormula(LU, LUIdx, NewF);
4713 break;
4714 skip_formula:;
4715 }
4716 }
4717 }
4718 }
4719}
4720
4721/// Generate formulae for each use.
4722void
4723LSRInstance::GenerateAllReuseFormulae() {
4724 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4725 // queries are more precise.
4726 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4727 LSRUse &LU = Uses[LUIdx];
4728 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4730 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4731 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4732 }
4733 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4734 LSRUse &LU = Uses[LUIdx];
4735 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4736 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4737 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4738 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4739 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4740 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4741 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4742 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4743 }
4744 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4745 LSRUse &LU = Uses[LUIdx];
4746 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4747 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4748 }
4749
4750 GenerateCrossUseConstantOffsets();
4751
4752 LLVM_DEBUG(dbgs() << "\n"
4753 "After generating reuse formulae:\n";
4754 print_uses(dbgs()));
4755}
4756
4757/// If there are multiple formulae with the same set of registers used
4758/// by other uses, pick the best one and delete the others.
4759void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4760 DenseSet<const SCEV *> VisitedRegs;
4761 SmallPtrSet<const SCEV *, 16> Regs;
4762 SmallPtrSet<const SCEV *, 16> LoserRegs;
4763#ifndef NDEBUG
4764 bool ChangedFormulae = false;
4765#endif
4766
4767 // Collect the best formula for each unique set of shared registers. This
4768 // is reset for each use.
4769 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4770
4771 BestFormulaeTy BestFormulae;
4772
4773 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4774 LSRUse &LU = Uses[LUIdx];
4775 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4776 dbgs() << '\n');
4777
4778 bool Any = false;
4779 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4780 FIdx != NumForms; ++FIdx) {
4781 Formula &F = LU.Formulae[FIdx];
4782
4783 // Some formulas are instant losers. For example, they may depend on
4784 // nonexistent AddRecs from other loops. These need to be filtered
4785 // immediately, otherwise heuristics could choose them over others leading
4786 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4787 // avoids the need to recompute this information across formulae using the
4788 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4789 // the corresponding bad register from the Regs set.
4790 Cost CostF(L, SE, TTI, AMK);
4791 Regs.clear();
4792 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4793 &LoserRegs);
4794 if (CostF.isLoser()) {
4795 // During initial formula generation, undesirable formulae are generated
4796 // by uses within other loops that have some non-trivial address mode or
4797 // use the postinc form of the IV. LSR needs to provide these formulae
4798 // as the basis of rediscovering the desired formula that uses an AddRec
4799 // corresponding to the existing phi. Once all formulae have been
4800 // generated, these initial losers may be pruned.
4801 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4802 dbgs() << "\n");
4803 }
4804 else {
4806 for (const SCEV *Reg : F.BaseRegs) {
4807 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4808 Key.push_back(Reg);
4809 }
4810 if (F.ScaledReg &&
4811 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4812 Key.push_back(F.ScaledReg);
4813 // Unstable sort by host order ok, because this is only used for
4814 // uniquifying.
4815 llvm::sort(Key);
4816
4817 std::pair<BestFormulaeTy::const_iterator, bool> P =
4818 BestFormulae.insert(std::make_pair(Key, FIdx));
4819 if (P.second)
4820 continue;
4821
4822 Formula &Best = LU.Formulae[P.first->second];
4823
4824 Cost CostBest(L, SE, TTI, AMK);
4825 Regs.clear();
4826 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4827 HardwareLoopProfitable);
4828 if (CostF.isLess(CostBest))
4829 std::swap(F, Best);
4830 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4831 dbgs() << "\n"
4832 " in favor of formula ";
4833 Best.print(dbgs()); dbgs() << '\n');
4834 }
4835#ifndef NDEBUG
4836 ChangedFormulae = true;
4837#endif
4838 LU.DeleteFormula(F);
4839 --FIdx;
4840 --NumForms;
4841 Any = true;
4842 }
4843
4844 // Now that we've filtered out some formulae, recompute the Regs set.
4845 if (Any)
4846 LU.RecomputeRegs(LUIdx, RegUses);
4847
4848 // Reset this to prepare for the next use.
4849 BestFormulae.clear();
4850 }
4851
4852 LLVM_DEBUG(if (ChangedFormulae) {
4853 dbgs() << "\n"
4854 "After filtering out undesirable candidates:\n";
4855 print_uses(dbgs());
4856 });
4857}
4858
4859/// Estimate the worst-case number of solutions the solver might have to
4860/// consider. It almost never considers this many solutions because it prune the
4861/// search space, but the pruning isn't always sufficient.
4862size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4863 size_t Power = 1;
4864 for (const LSRUse &LU : Uses) {
4865 size_t FSize = LU.Formulae.size();
4866 if (FSize >= ComplexityLimit) {
4867 Power = ComplexityLimit;
4868 break;
4869 }
4870 Power *= FSize;
4871 if (Power >= ComplexityLimit)
4872 break;
4873 }
4874 return Power;
4875}
4876
4877/// When one formula uses a superset of the registers of another formula, it
4878/// won't help reduce register pressure (though it may not necessarily hurt
4879/// register pressure); remove it to simplify the system.
4880void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4881 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4882 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4883
4884 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4885 "which use a superset of registers used by other "
4886 "formulae.\n");
4887
4888 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4889 LSRUse &LU = Uses[LUIdx];
4890 bool Any = false;
4891 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4892 Formula &F = LU.Formulae[i];
4893 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4894 continue;
4895 // Look for a formula with a constant or GV in a register. If the use
4896 // also has a formula with that same value in an immediate field,
4897 // delete the one that uses a register.
4899 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4900 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4901 Formula NewF = F;
4902 //FIXME: Formulas should store bitwidth to do wrapping properly.
4903 // See PR41034.
4904 NewF.BaseOffset =
4905 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4906 (uint64_t)C->getValue()->getSExtValue());
4907 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4908 (I - F.BaseRegs.begin()));
4909 if (LU.HasFormulaWithSameRegs(NewF)) {
4910 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4911 dbgs() << '\n');
4912 LU.DeleteFormula(F);
4913 --i;
4914 --e;
4915 Any = true;
4916 break;
4917 }
4918 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4919 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4920 if (!F.BaseGV) {
4921 Formula NewF = F;
4922 NewF.BaseGV = GV;
4923 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4924 (I - F.BaseRegs.begin()));
4925 if (LU.HasFormulaWithSameRegs(NewF)) {
4926 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4927 dbgs() << '\n');
4928 LU.DeleteFormula(F);
4929 --i;
4930 --e;
4931 Any = true;
4932 break;
4933 }
4934 }
4935 }
4936 }
4937 }
4938 if (Any)
4939 LU.RecomputeRegs(LUIdx, RegUses);
4940 }
4941
4942 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4943 }
4944}
4945
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by assuming that uses separated "
                "by a constant offset will use the same registers.\n");

  // This is especially useful for unrolled loops.

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (const Formula &F : LU.Formulae) {
      // Only consider formulae with a nonzero constant offset and at most a
      // unit scale; others can't be folded into a similar use's offset.
      if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
        continue;

      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
      if (!LUThatHas)
        continue;

      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
                              LU.Kind, LU.AccessTy))
        continue;

      LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');

      // The merged use is only as permissive as the more restrictive of the
      // two original uses.
      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;

      // Transfer the fixups of LU to LUThatHas.
      for (LSRFixup &Fixup : LU.Fixups) {
        Fixup.Offset += F.BaseOffset;
        LUThatHas->pushFixup(Fixup);
        LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
      }

      // Delete formulae from the new use which are no longer legal.
      bool Any = false;
      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
        // NOTE: this F shadows the outer loop's F.
        Formula &F = LUThatHas->Formulae[i];
        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
          LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
          LUThatHas->DeleteFormula(F);
          --i;
          --e;
          Any = true;
        }
      }

      if (Any)
        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);

      // Delete the old use.
      DeleteUse(LU, LUIdx);
      // Uses was modified; step back so the use now at LUIdx is visited.
      --LUIdx;
      --NumUses;
      break;
    }
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5012
5013/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5014/// we've done more filtering, as it may be able to find more formulae to
5015/// eliminate.
5016void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5017 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5018 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5019
5020 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5021 "undesirable dedicated registers.\n");
5022
5023 FilterOutUndesirableDedicatedRegisters();
5024
5025 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5026 }
5027}
5028
/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
/// Pick the best one and delete the others.
/// This narrowing heuristic is to keep as many formulae with different
/// Scale and ScaledReg pair as possible while narrowing the search space.
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by choosing the best Formula "
                "from the Formulae with the same Scale and ScaledReg.\n");

  // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;

  BestFormulaeTy BestFormulae;
#ifndef NDEBUG
  bool ChangedFormulae = false;
#endif
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
               dbgs() << '\n');

    // Return true if Formula FA is better than Formula FB.
    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
      // First we will try to choose the Formula with fewer new registers.
      // For a register used by current Formula, the more the register is
      // shared among LSRUses, the less we increase the register number
      // counter of the formula.
      size_t FARegNum = 0;
      for (const SCEV *Reg : FA.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        // Registers shared by more uses contribute less to the count.
        FARegNum += (NumUses - UsedByIndices.count() + 1);
      }
      size_t FBRegNum = 0;
      for (const SCEV *Reg : FB.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FBRegNum += (NumUses - UsedByIndices.count() + 1);
      }
      if (FARegNum != FBRegNum)
        return FARegNum < FBRegNum;

      // If the new register numbers are the same, choose the Formula with
      // less Cost.
      Cost CostFA(L, SE, TTI, AMK);
      Cost CostFB(L, SE, TTI, AMK);
      Regs.clear();
      CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      Regs.clear();
      CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      return CostFA.isLess(CostFB);
    };

    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      if (!F.ScaledReg)
        continue;
      // First formula seen with this (ScaledReg, Scale) pair: keep it for now.
      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
      if (P.second)
        continue;

      // Duplicate pair: keep the better of the two and delete the other.
      Formula &Best = LU.Formulae[P.first->second];
      if (IsBetterThan(F, Best))
        std::swap(F, Best);
      LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
                 dbgs() << "\n"
                           " in favor of formula ";
                 Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
      ChangedFormulae = true;
#endif
      LU.DeleteFormula(F);
      --FIdx;
      --NumForms;
      Any = true;
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Reset this to prepare for the next use.
    BestFormulae.clear();
  }

  LLVM_DEBUG(if (ChangedFormulae) {
    dbgs() << "\n"
              "After filtering out undesirable candidates:\n";
    print_uses(dbgs());
  });
}
5130
/// If we are over the complexity limit, filter out any post-inc preferring
/// variables to only post-inc values.
void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
  // Only applies when the target prefers post-indexed addressing.
  if (AMK != TTI::AMK_PostIndexed)
    return;
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
                       "Narrowing the search space by choosing the lowest "
                       "register Formula for PostInc Uses.\n");

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];

    // Only address uses are candidates for post-inc addressing.
    if (LU.Kind != LSRUse::Address)
      continue;
    // Skip uses whose access type can use neither a post-indexed load nor a
    // post-indexed store on this target.
    if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
        !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
      continue;

    // Find the minimum register count over this use's formulae ...
    size_t MinRegs = std::numeric_limits<size_t>::max();
    for (const Formula &F : LU.Formulae)
      MinRegs = std::min(F.getNumRegs(), MinRegs);

    // ... and delete every formula that needs more registers than that.
    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      if (F.getNumRegs() > MinRegs) {
        LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
                   dbgs() << "\n");
        LU.DeleteFormula(F);
        --FIdx;
        --NumForms;
        Any = true;
      }
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Stop as soon as the search space is small enough again.
    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
      break;
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5178
5179/// The function delete formulas with high registers number expectation.
5180/// Assuming we don't know the value of each formula (already delete
5181/// all inefficient), generate probability of not selecting for each
5182/// register.
5183/// For example,
5184/// Use1:
5185/// reg(a) + reg({0,+,1})
5186/// reg(a) + reg({-1,+,1}) + 1
5187/// reg({a,+,1})
5188/// Use2:
5189/// reg(b) + reg({0,+,1})
5190/// reg(b) + reg({-1,+,1}) + 1
5191/// reg({b,+,1})
5192/// Use3:
5193/// reg(c) + reg(b) + reg({0,+,1})
5194/// reg(c) + reg({b,+,1})
5195///
5196/// Probability of not selecting
5197/// Use1 Use2 Use3
5198/// reg(a) (1/3) * 1 * 1
5199/// reg(b) 1 * (1/3) * (1/2)
5200/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5201/// reg({-1,+,1}) (2/3) * (2/3) * 1
5202/// reg({a,+,1}) (2/3) * 1 * 1
5203/// reg({b,+,1}) 1 * (2/3) * (2/3)
5204/// reg(c) 1 * 1 * 0
5205///
5206/// Now count registers number mathematical expectation for each formula:
5207/// Note that for each use we exclude probability if not selecting for the use.
/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
/// probability 1/3 of not selecting for Use1).
5210/// Use1:
5211/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5212/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5213/// reg({a,+,1}) 1
5214/// Use2:
5215/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5216/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5217/// reg({b,+,1}) 2/3
5218/// Use3:
5219/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5220/// reg(c) + reg({b,+,1}) 1 + 2/3
void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;
  // Ok, we have too many formulae on our hands to conveniently handle.
  // Use a rough heuristic to thin out the list.

  // Set of Regs which will be 100% used in the final solution, i.e. used in
  // each formula of a solution (in the example above this is reg(c)).
  // We can skip them in the probability calculations below.
  SmallPtrSet<const SCEV *, 4> UniqRegs;
  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

  // Map each register to the probability of it not being selected across all
  // uses. A per-use probability of 0 means the register appears in every
  // formula of that use, so it is guaranteed to be selected and is recorded
  // in UniqRegs instead of contributing to the product.
  DenseMap <const SCEV *, float> RegNumMap;
  for (const SCEV *Reg : RegUses) {
    if (UniqRegs.count(Reg))
      continue;
    float PNotSel = 1;
    for (const LSRUse &LU : Uses) {
      if (!LU.Regs.count(Reg))
        continue;
      float P = LU.getNotSelectedProbability(Reg);
      if (P != 0.0)
        PNotSel *= P;
      else
        UniqRegs.insert(Reg);
    }
    RegNumMap.insert(std::make_pair(Reg, PNotSel));
  }

  LLVM_DEBUG(
      dbgs() << "Narrowing the search space by deleting costly formulas\n");

  // Delete formulas where the expected register count is high; keep only the
  // formula with the minimum expectation for each use.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    // If nothing to delete - continue.
    if (LU.Formulae.size() < 2)
      continue;
    // This is temporary solution to test performance. Float should be
    // replaced with round independent type (based on integers) to avoid
    // different results for different target builds.
    float FMinRegNum = LU.Formulae[0].getNumRegs();
    float FMinARegNum = LU.Formulae[0].getNumRegs();
    size_t MinIdx = 0;
    for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
      Formula &F = LU.Formulae[i];
      // FRegNum is the expected register count for this formula; FARegNum is
      // the same expectation restricted to addrec registers, used below as a
      // tie-breaker.
      float FRegNum = 0;
      float FARegNum = 0;
      for (const SCEV *BaseReg : F.BaseRegs) {
        if (UniqRegs.count(BaseReg))
          continue;
        FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
        if (isa<SCEVAddRecExpr>(BaseReg))
          FARegNum +=
              RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
      }
      if (const SCEV *ScaledReg = F.ScaledReg) {
        if (!UniqRegs.count(ScaledReg)) {
          FRegNum +=
              RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
          if (isa<SCEVAddRecExpr>(ScaledReg))
            FARegNum +=
                RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
        }
      }
      if (FMinRegNum > FRegNum ||
          (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
        FMinRegNum = FRegNum;
        FMinARegNum = FARegNum;
        MinIdx = i;
      }
    }
    LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
               dbgs() << " with min reg num " << FMinRegNum << '\n');
    // Move the winning formula to the front and drop all the others.
    if (MinIdx != 0)
      std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
    while (LU.Formulae.size() != 1) {
      LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
                 dbgs() << '\n');
      LU.Formulae.pop_back();
    }
    LU.RecomputeRegs(LUIdx, RegUses);
    assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
    Formula &F = LU.Formulae[0];
    LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
    // When we choose the formula, the regs become unique.
    UniqRegs.insert_range(F.BaseRegs);
    if (F.ScaledReg)
      UniqRegs.insert(F.ScaledReg);
  }
  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5314
// Check if Best and Reg are SCEVs separated by a constant amount C, and if so,
// whether the addressing offset +C would be legal where the negative offset -C
// is not.
                                     ScalarEvolution &SE, const SCEV *Best,
                                     const SCEV *Reg,
                                     MemAccessTy AccessType) {
  if (Best->getType() != Reg->getType() ||
      cast<SCEVAddRecExpr>(Best)->getLoop() !=
          cast<SCEVAddRecExpr>(Reg)->getLoop()))
    return false;
  std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
  if (!Diff)
    return false;

  // Return true exactly when +Diff is a legal addressing-mode offset for this
  // access but -Diff is not, i.e. Best is strictly simpler for the target.
  return TTI.isLegalAddressingMode(
             AccessType.MemTy, /*BaseGV=*/nullptr,
             /*BaseOffset=*/Diff->getSExtValue(),
             /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
         !TTI.isLegalAddressingMode(
             AccessType.MemTy, /*BaseGV=*/nullptr,
             /*BaseOffset=*/-Diff->getSExtValue(),
             /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
}
5340
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
  // With all other options exhausted, loop until the system is simple
  // enough to handle.
  SmallPtrSet<const SCEV *, 4> Taken;
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many formulae on our hands to conveniently handle.
    // Use a rough heuristic to thin out the list.
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

    // Pick the register which is used by the most LSRUses, which is likely
    // to be a good reuse register candidate.
    const SCEV *Best = nullptr;
    unsigned BestNum = 0;
    for (const SCEV *Reg : RegUses) {
      // Never pick the same register twice across iterations.
      if (Taken.count(Reg))
        continue;
      if (!Best) {
        Best = Reg;
        BestNum = RegUses.getUsedByIndices(Reg).count();
      } else {
        unsigned Count = RegUses.getUsedByIndices(Reg).count();
        if (Count > BestNum) {
          Best = Reg;
          BestNum = Count;
        }

        // If the scores are the same, but the Reg is simpler for the target
        // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
        // handle +C but not -C), opt for the simpler formula.
        if (Count == BestNum) {
          int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
          if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
              Uses[LUIdx].AccessTy)) {
            Best = Reg;
            BestNum = Count;
          }
        }
      }
    }
    assert(Best && "Failed to find best LSRUse candidate");

    LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                      << " will yield profitable reuse.\n");
    Taken.insert(Best);

    // In any use with formulae which reference this register, delete formulae
    // which don't reference it.
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      if (!LU.Regs.count(Best)) continue;

      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (!F.referencesReg(Best)) {
          LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
          LU.DeleteFormula(F);
          --e;
          --i;
          Any = true;
          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
          continue;
        }
      }

      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }

    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
  }
}
5417
/// If there are an extraordinary number of formulae to choose from, use some
/// rough heuristics to prune down the number of formulae. This keeps the main
/// solver from taking an extraordinary amount of time in some worst-case
/// scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  // The pruning passes run from cheapest/least-destructive to most aggressive.
  NarrowSearchSpaceByDetectingSupersets();
  NarrowSearchSpaceByCollapsingUnrolledCode();
  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
    NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  NarrowSearchSpaceByFilterPostInc();
  // Final, most aggressive step: either probabilistic deletion of costly
  // formulae (under LSRExpNarrow) or the classic winner-register heuristic.
  if (LSRExpNarrow)
    NarrowSearchSpaceByDeletingCostlyFormulas();
  else
    NarrowSearchSpaceByPickingWinnerRegs();
}
5434
/// This is the recursive solver. One recursion level handles one LSRUse; a
/// complete solution has picked one formula for every use.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost, and then compare. compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  // The use handled at this depth is indexed by how many formulae have
  // already been chosen.
  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (const SCEV *S : CurRegs)
    if (LU.Regs.count(S))
      ReqRegs.insert(S);

  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost(L, SE, TTI, AMK);
  for (const Formula &F : LU.Formulae) {
    // Ignore formulae which may not be ideal in terms of register reuse of
    // ReqRegs. The formula should use all required registers before
    // introducing new ones.
    // This can sometimes (notably when trying to favour postinc) lead to
    // sub-optimal decisions. There it is best left to the cost modelling to
    // get correct.
    if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
      for (const SCEV *Reg : ReqRegs) {
        if ((F.ScaledReg && F.ScaledReg == Reg) ||
            is_contained(F.BaseRegs, Reg)) {
          --NumReqRegsToFind;
          if (NumReqRegsToFind == 0)
            break;
        }
      }
      if (NumReqRegsToFind != 0) {
        // If none of the formulae satisfied the required registers, then we could
        // clear ReqRegs and try again. Currently, we simply give up in this case.
        continue;
      }
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
    if (NewCost.isLess(SolutionCost)) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        // All uses are covered; this is a new best complete solution.
        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
                   dbgs() << ".\nRegs:\n";
                   for (const SCEV *S : NewRegs) dbgs()
                      << "- " << *S << "\n";
                   dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  }
}
5515
/// Choose one formula from each use. Return the results in the given Solution
/// vector. An empty Solution indicates that no satisfactory (or profitable)
/// solution was found.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  Cost SolutionCost(L, SE, TTI, AMK);
  SolutionCost.Lose();
  Cost CurCost(L, SE, TTI, AMK);
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  // SolveRecurse does all the work.
  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);
  if (Solution.empty()) {
    LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
    return;
  }

  // Ok, we've now made all our decisions.
  LLVM_DEBUG(dbgs() << "\n"
                       "The chosen solution requires ";
             SolutionCost.print(dbgs()); dbgs() << ":\n";
             for (size_t i = 0, e = Uses.size(); i != e; ++i) {
               dbgs() << "  ";
               Uses[i].print(dbgs());
               dbgs() << "\n"
                         "    ";
               Solution[i]->print(dbgs());
               dbgs() << '\n';
             });

  assert(Solution.size() == Uses.size() && "Malformed solution!");

  // Decide whether an unprofitable solution should be dropped entirely,
  // honoring the command-line override when it is set.
  const bool EnableDropUnprofitableSolution = [&] {
    case cl::BOU_TRUE:
      return true;
    case cl::BOU_FALSE:
      return false;
    case cl::BOU_UNSET:
    }
    llvm_unreachable("Unhandled cl::boolOrDefault enum");
  }();

  if (BaselineCost.isLess(SolutionCost)) {
    if (!EnableDropUnprofitableSolution)
      LLVM_DEBUG(
          dbgs() << "Baseline is more profitable than chosen solution, "
                    "add option 'lsr-drop-solution' to drop LSR solution.\n");
    else {
      LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
                           "solution, dropping LSR solution.\n";);
      Solution.clear();
    }
  }
}
5574
/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
/// as we can go while still being dominated by the input positions. This helps
/// canonicalize the insert position, which encourages sharing.
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
                                 const SmallVectorImpl<Instruction *> &Inputs)
    const {
  Instruction *Tentative = &*IP;
  while (true) {
    bool AllDominate = true;
    Instruction *BetterPos = nullptr;
    // Don't bother attempting to insert before a catchswitch, their basic block
    // cannot have other non-PHI instructions.
    if (isa<CatchSwitchInst>(Tentative))
      return IP;

    for (Instruction *Inst : Inputs) {
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      // Attempt to find an insert position in the middle of the block,
      // instead of at the end, so that it can be used for other expansions.
      if (Tentative->getParent() == Inst->getParent() &&
          (!BetterPos || !DT.dominates(Inst, BetterPos)))
        BetterPos = &*std::next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos->getIterator();
    else
      IP = Tentative->getIterator();

    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;

    // Walk up the immediate dominators to find the next candidate block.
    BasicBlock *IDom;
    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
      if (!Rung) return IP;
      Rung = Rung->getIDom();
      if (!Rung) return IP;
      IDom = Rung->getBlock();

      // Don't climb into a loop though.
      const Loop *IDomLoop = LI.getLoopFor(IDom);
      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
      if (IDomDepth <= IPLoopDepth &&
          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
        break;
    }

    Tentative = IDom->getTerminator();
  }

  return IP;
}
5632
5633/// Determine an input position which will be dominated by the operands and
5634/// which will dominate the result.
5635BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5636 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5637 // Collect some instructions which must be dominated by the
5638 // expanding replacement. These must be dominated by any operands that
5639 // will be required in the expansion.
5640 SmallVector<Instruction *, 4> Inputs;
5641 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5642 Inputs.push_back(I);
5643 if (LU.Kind == LSRUse::ICmpZero)
5644 if (Instruction *I =
5645 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5646 Inputs.push_back(I);
5647 if (LF.PostIncLoops.count(L)) {
5648 if (LF.isUseFullyOutsideLoop(L))
5649 Inputs.push_back(L->getLoopLatch()->getTerminator());
5650 else
5651 Inputs.push_back(IVIncInsertPos);
5652 }
5653 // The expansion must also be dominated by the increment positions of any
5654 // loops it for which it is using post-inc mode.
5655 for (const Loop *PIL : LF.PostIncLoops) {
5656 if (PIL == L) continue;
5657
5658 // Be dominated by the loop exit.
5659 SmallVector<BasicBlock *, 4> ExitingBlocks;
5660 PIL->getExitingBlocks(ExitingBlocks);
5661 if (!ExitingBlocks.empty()) {
5662 BasicBlock *BB = ExitingBlocks[0];
5663 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5664 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5665 Inputs.push_back(BB->getTerminator());
5666 }
5667 }
5668
5669 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5670 "Insertion point must be a normal instruction");
5671
5672 // Then, climb up the immediate dominator tree as far as we can go while
5673 // still being dominated by the input positions.
5674 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5675
5676 // Don't insert instructions before PHI nodes.
5677 while (isa<PHINode>(IP)) ++IP;
5678
5679 // Ignore landingpad instructions.
5680 while (IP->isEHPad()) ++IP;
5681
5682 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5683 // IP consistent across expansions and allows the previously inserted
5684 // instructions to be reused by subsequent expansion.
5685 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5686 ++IP;
5687
5688 return IP;
5689}
5690
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding").
///
/// The expansion is inserted at (an adjusted form of) \p IP and the value of
/// the expanded expression is returned. Instructions made potentially dead by
/// the expansion are appended to \p DeadInsts.
Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
                           const Formula &F, BasicBlock::iterator IP,
                           SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
  // Rigid formulae are not rewritten; keep the original value.
  if (LU.RigidFormula)
    return LF.OperandValToReplace;

  // Determine an input position which will be dominated by the operands and
  // which will dominate the result.
  IP = AdjustInsertPositionForExpand(IP, LF, LU);
  Rewriter.setInsertPoint(&*IP);

  // Inform the Rewriter if we have a post-increment use, so that it can
  // perform an advantageous expansion.
  Rewriter.setPostInc(LF.PostIncLoops);

  // This is the type that the user actually needs.
  Type *OpTy = LF.OperandValToReplace->getType();
  // This will be the type that we'll initially expand to.
  Type *Ty = F.getType();
  if (!Ty)
    // No type known; just expand directly to the ultimate type.
    Ty = OpTy;
  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
    // Expand directly to the ultimate type if it's the right size.
    Ty = OpTy;
  // This is the type to do integer arithmetic in.
  Type *IntTy = SE.getEffectiveSCEVType(Ty);

  // Build up a list of operands to add together to form the full base.

  // Expand the BaseRegs portion.
  for (const SCEV *Reg : F.BaseRegs) {
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = nullptr;
  if (F.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);

    if (LU.Kind == LSRUse::ICmpZero) {
      // Expand ScaleReg as if it was part of the base regs.
      if (F.Scale == 1)
        Ops.push_back(
            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
      else {
        // An interesting way of "folding" with an icmp is to use a negated
        // scale, which we'll implement by inserting it into the other operand
        // of the icmp.
        assert(F.Scale == -1 &&
               "The only scale supported by ICmpZero uses is -1!");
        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
      }
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.

      // Flush the operand list to suppress SCEVExpander hoisting address modes.
      // Unless the addressing mode will not be folded.
      if (!Ops.empty() && LU.Kind == LSRUse::Address &&
          isAMCompletelyFolded(TTI, LU, F)) {
        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
        Ops.clear();
        Ops.push_back(SE.getUnknown(FullV));
      }
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
      if (F.Scale != 1)
        ScaledS =
            SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
      Ops.push_back(ScaledS);
    }
  }

  // Expand the GV portion.
  if (F.BaseGV) {
    // Flush the operand list to suppress SCEVExpander hoisting.
    if (!Ops.empty()) {
      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
      Ops.clear();
      Ops.push_back(SE.getUnknown(FullV));
    }
    Ops.push_back(SE.getUnknown(F.BaseGV));
  }

  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
  // unfolded offsets. LSR assumes they both live next to their uses.
  if (!Ops.empty()) {
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
  // out at this point, or should we generate a SCEV adding together mixed
  // offsets?
  assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
         "Expanding mismatched offsets\n");
  // Expand the immediate portion.
  Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
  if (Offset.isNonZero()) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
      if (!ICmpScaledV) {
        // TODO: Avoid implicit trunc?
        // See https://github.com/llvm/llvm-project/issues/112510.
        ICmpScaledV = ConstantInt::getSigned(
            IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
      } else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
                                             /*ImplicitTrunc=*/true);
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
    }
  }

  // Expand the unfolded offset portion.
  Immediate UnfoldedOffset = F.UnfoldedOffset;
  if (UnfoldedOffset.isNonZero()) {
    // Just add the immediate values.
    Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getConstant(IntTy, 0) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty);

  // We're done expanding now, so reset the rewriter.
  Rewriter.clearPostInc();

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
      DeadInsts.emplace_back(OperandIsInstr);
    assert(!F.BaseGV && "ICmp does not support folding a global value and "
                        "a scale at the same time!");
    if (F.Scale == -1) {
      // Cast the negated-scale value to the comparison's operand type if
      // needed before installing it as the icmp's second operand.
      if (ICmpScaledV->getType() != OpTy) {
            CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
            ICmpScaledV, OpTy, "tmp", CI->getIterator());
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      // A scale of 1 means that the scale has been expanded as part of the
      // base regs.
      assert((F.Scale == 0 || F.Scale == 1) &&
             "ICmp does not support folding a global value and "
             "a scale at the same time!");
      // TODO: Avoid implicit trunc?
      // See https://github.com/llvm/llvm-project/issues/112510.
          -(uint64_t)Offset.getFixedValue(),
          /*ImplicitTrunc=*/true);
      if (C->getType() != OpTy) {
            CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
            CI->getDataLayout());
        assert(C && "Cast of ConstantInt should have folded");
      }

      CI->setOperand(1, C);
    }
  }

  return FullV;
}
5879
/// Helper for Rewrite. PHI nodes are special because the use of their operands
/// effectively happens in their predecessor blocks, so the expression may need
/// to be expanded in multiple places.
void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
                                const LSRFixup &LF, const Formula &F,
                                SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  // Cache one expansion per predecessor block so identical incoming edges
  // share a single expanded value.
  DenseMap<BasicBlock *, Value *> Inserted;

  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
      bool needUpdateFixups = false;
      BasicBlock *BB = PN->getIncomingBlock(i);

      // If this is a critical edge, split the edge so that we do not insert
      // the code on all predecessor/successor paths. We do this unless this
      // is the canonical backedge for this loop, which complicates post-inc
      // users.
      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
        BasicBlock *Parent = PN->getParent();
        Loop *PNLoop = LI.getLoopFor(Parent);
        if (!PNLoop || Parent != PNLoop->getHeader()) {
          // Split the critical edge.
          BasicBlock *NewBB = nullptr;
          if (!Parent->isLandingPad()) {
            NewBB =
                SplitCriticalEdge(BB, Parent,
                                  CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
                                      .setMergeIdenticalEdges()
                                      .setKeepOneInputPHIs());
          } else {
            DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
            NewBB = NewBBs[0];
          }
          // If NewBB==NULL, then SplitCriticalEdge refused to split because all
          // phi predecessors are identical. The simple thing to do is skip
          // splitting in this case rather than complicate the API.
          if (NewBB) {
            // If PN is outside of the loop and BB is in the loop, we want to
            // move the block to be immediately before the PHI block, not
            // immediately after BB.
            if (L->contains(BB) && !L->contains(PN))
              NewBB->moveBefore(PN->getParent());

            // Splitting the edge can reduce the number of PHI entries we have.
            e = PN->getNumIncomingValues();
            BB = NewBB;
            i = PN->getBasicBlockIndex(BB);

            needUpdateFixups = true;
          }
        }
      }

      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
          Inserted.try_emplace(BB);
      if (!Pair.second)
        // Reuse the expansion already emitted for this predecessor.
        PN->setIncomingValue(i, Pair.first->second);
      else {
        Value *FullV =
            Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);

        // If this is reuse-by-noop-cast, insert the noop cast.
        Type *OpTy = LF.OperandValToReplace->getType();
        if (FullV->getType() != OpTy)
          FullV = CastInst::Create(
              CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
              LF.OperandValToReplace->getType(), "tmp",
              BB->getTerminator()->getIterator());

        // If the incoming block for this value is not in the loop, it means the
        // current PHI is not in a loop exit, so we must create a LCSSA PHI for
        // the inserted value.
        if (auto *I = dyn_cast<Instruction>(FullV))
          if (L->contains(I) && !L->contains(BB))
            InsertedNonLCSSAInsts.insert(I);

        PN->setIncomingValue(i, FullV);
        Pair.first->second = FullV;
      }

      // If LSR splits critical edge and phi node has other pending
      // fixup operands, we need to update those pending fixups. Otherwise
      // formulae will not be implemented completely and some instructions
      // will not be eliminated.
      if (needUpdateFixups) {
        for (LSRUse &LU : Uses)
          for (LSRFixup &Fixup : LU.Fixups)
            // If fixup is supposed to rewrite some operand in the phi
            // that was just updated, it may have already been moved to
            // another phi node. Such a fixup requires an update.
            if (Fixup.UserInst == PN) {
              // Check if the operand we try to replace still exists in the
              // original phi.
              bool foundInOriginalPHI = false;
              for (const auto &val : PN->incoming_values())
                if (val == Fixup.OperandValToReplace) {
                  foundInOriginalPHI = true;
                  break;
                }

              // If fixup operand found in original PHI - nothing to do.
              if (foundInOriginalPHI)
                continue;

              // Otherwise it might be moved to another PHI and requires update.
              // If fixup operand not found in any of the incoming blocks that
              // means we have already rewritten it - nothing to do.
              for (const auto &Block : PN->blocks())
                for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
                     ++I) {
                  PHINode *NewPN = cast<PHINode>(I);
                  for (const auto &val : NewPN->incoming_values())
                    if (val == Fixup.OperandValToReplace)
                      Fixup.UserInst = NewPN;
                }
            }
      }
    }
}
6003
6004/// Emit instructions for the leading candidate expression for this LSRUse (this
6005/// is called "expanding"), and update the UserInst to reference the newly
6006/// expanded value.
6007void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6008 const Formula &F,
6009 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6010 // First, find an insertion point that dominates UserInst. For PHI nodes,
6011 // find the nearest block which dominates all the relevant uses.
6012 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6013 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6014 } else {
6015 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6016
6017 // If this is reuse-by-noop-cast, insert the noop cast.
6018 Type *OpTy = LF.OperandValToReplace->getType();
6019 if (FullV->getType() != OpTy) {
6020 Instruction *Cast =
6021 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6022 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6023 FullV = Cast;
6024 }
6025
6026 // Update the user. ICmpZero is handled specially here (for now) because
6027 // Expand may have updated one of the operands of the icmp already, and
6028 // its new value may happen to be equal to LF.OperandValToReplace, in
6029 // which case doing replaceUsesOfWith leads to replacing both operands
6030 // with the same value. TODO: Reorganize this.
6031 if (LU.Kind == LSRUse::ICmpZero)
6032 LF.UserInst->setOperand(0, FullV);
6033 else
6034 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6035 }
6036
6037 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6038 DeadInsts.emplace_back(OperandIsInstr);
6039}
6040
// Determine where to insert the transformed IV increment instruction for this
// fixup. By default this is the default insert position, but if this is a
// postincrement opportunity then we try to insert it in the same block as the
// fixup user instruction, as this is needed for a postincrement instruction to
// be generated.
                                      const LSRFixup &Fixup, const LSRUse &LU,
                                      Instruction *IVIncInsertPos,
                                      DominatorTree &DT) {
  // Only address uses can be postincremented
  if (LU.Kind != LSRUse::Address)
    return IVIncInsertPos;

  // Don't try to postincrement if it's not legal: the user must be a load or
  // store for which the target supports post-indexed addressing.
  Instruction *I = Fixup.UserInst;
  Type *Ty = I->getType();
  if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
      !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
    return IVIncInsertPos;

  // It's only legal to hoist to the user block if it dominates the default
  // insert position.
  BasicBlock *HoistBlock = I->getParent();
  BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
  if (!DT.dominates(I, IVIncBlock))
    return IVIncInsertPos;

  // Place the increment at the end of the user's block.
  return HoistBlock->getTerminator();
}
6070
/// Rewrite all the fixup locations with new values, following the chosen
/// solution.
void LSRInstance::ImplementSolution(
    const SmallVectorImpl<const Formula *> &Solution) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  // NOTE(review): the DeadInsts declaration line is elided from this view.

  // Mark phi nodes that terminate chains so the expander tries to reuse them.
  for (const IVChain &Chain : IVChainVec) {
    if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
      Rewriter.setChainedPhi(PN);
  }

  // Expand the new value definitions and update the users.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
    for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
      // Each fixup may get its own IV-increment insertion point so that
      // post-increment addressing can be formed (see getFixupInsertPos).
      Instruction *InsertPos =
          getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
      Rewriter.setIVIncInsertPos(L, InsertPos);
      Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
      Changed = true;
    }

  // Restore LCSSA form for any values the expander created outside the loop.
  auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
  formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);

  for (const IVChain &Chain : IVChainVec) {
    GenerateIVChain(Chain, DeadInsts);
    Changed = true;
  }

  // Record the IVs the expander inserted (only those still attached to a
  // block) so later passes can inspect them.
  for (const WeakVH &IV : Rewriter.getInsertedIVs())
    if (IV && dyn_cast<Instruction>(&*IV)->getParent())
      ScalarEvolutionIVs.push_back(IV);

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

  // NOTE(review): the first line of the dead-instruction-deletion call is
  // elided from this view; the line below is its continuation.
                                  &TLI, MSSAU);

  // In our cost analysis above, we assume that each addrec consumes exactly
  // one register, and arrange to have increments inserted just before the
  // latch to maximize the chance this is true. However, if we reused
  // existing IVs, we now need to move the increments to match our
  // expectations. Otherwise, our cost modeling results in us having
  // chosen a non-optimal result for the actual schedule. (And yes, this
  // scheduling decision does impact later codegen.)
  for (PHINode &PN : L->getHeader()->phis()) {
    BinaryOperator *BO = nullptr;
    Value *Start = nullptr, *Step = nullptr;
    if (!matchSimpleRecurrence(&PN, BO, Start, Step))
      continue;

    switch (BO->getOpcode()) {
    case Instruction::Sub:
      if (BO->getOperand(0) != &PN)
        // sub is non-commutative - match handling elsewhere in LSR
        continue;
      break;
    case Instruction::Add:
      break;
    default:
      continue;
    };

    if (!isa<Constant>(Step))
      // If not a constant step, might increase register pressure
      // (We assume constants have been canonicalized to RHS)
      continue;

    if (BO->getParent() == IVIncInsertPos->getParent())
      // Only bother moving across blocks. Isel can handle block local case.
      continue;

    // Can we legally schedule inc at the desired point?
    if (!llvm::all_of(BO->uses(),
                      [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
      continue;
    BO->moveBefore(IVIncInsertPos->getIterator());
    Changed = true;
  }


}
6158
// Main entry point: run the full LSR pipeline on loop L. Bails out early if
// the loop is not in LoopSimplify form, has no interesting IV users, or is
// too large to analyze.
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                         DominatorTree &DT, LoopInfo &LI,
                         const TargetTransformInfo &TTI, AssumptionCache &AC,
                         TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
      MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
                            : TTI.getPreferredAddressingMode(L, &SE)),
      Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm())
    return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  // If there's too much analysis to be done, bail early. We won't be able to
  // model the problem anyway.
  unsigned NumUsers = 0;
  for (const IVStrideUse &U : IU) {
    if (++NumUsers > MaxIVUsers) {
      (void)U;
      LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
                        << "\n");
      return;
    }
    // Bail out if we have a PHI on an EHPad that gets a value from a
    // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
    // no good place to stick any instructions.
    if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
      auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
      if (isa<FuncletPadInst>(FirstNonPHI) ||
          isa<CatchSwitchInst>(FirstNonPHI))
        for (BasicBlock *PredBB : PN->blocks())
          if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
            return;
    }
  }

  LLVM_DEBUG(dbgs() << "\nLSR on loop ";
             L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
             dbgs() << ":\n");

  // Check if we expect this loop to use a hardware loop instruction, which will
  // be used when calculating the costs of formulas.
  HardwareLoopInfo HWLoopInfo(L);
  HardwareLoopProfitable =
      TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);

  // Configure SCEVExpander already now, so the correct mode is used for
  // isSafeToExpand() checks.
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
  Rewriter.setDebugType(DEBUG_TYPE);
#endif
  Rewriter.disableCanonicalMode();
  Rewriter.enableLSRMode();

  // First, perform some low-level loop optimizations.
  OptimizeShadowIV();
  OptimizeLoopTermCond();

  // If loop preparation eliminates all interesting IV users, bail.
  if (IU.empty()) return;

  // Skip nested loops until we can model them better with formulae.
  if (!L->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
    return;
  }

  // Start collecting data and preparing for the solver.
  // If number of registers is not the major cost, we cannot benefit from the
  // current profitable chain optimization which is based on number of
  // registers.
  // FIXME: add profitable chain optimization for other kinds major cost, for
  // example number of instructions.
  if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
    CollectChains();
  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  if (Uses.empty())
    return;

  LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
             print_uses(dbgs()));
  LLVM_DEBUG(dbgs() << "The baseline solution requires ";
             BaselineCost.print(dbgs()); dbgs() << "\n");

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  // NOTE(review): the declaration of Solution is elided from this view.
  Solve(Solution);

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

  if (Solution.empty())
    return;

#ifndef NDEBUG
  // Formulae should be legal.
  for (const LSRUse &LU : Uses) {
    for (const Formula &F : LU.Formulae)
      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                        F) && "Illegal formula generated!");
  };
#endif

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution);
}
6279
6280#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6281void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6282 if (Factors.empty() && Types.empty()) return;
6283
6284 OS << "LSR has identified the following interesting factors and types: ";
6285 ListSeparator LS;
6286
6287 for (int64_t Factor : Factors)
6288 OS << LS << '*' << Factor;
6289
6290 for (Type *Ty : Types)
6291 OS << LS << '(' << *Ty << ')';
6292 OS << '\n';
6293}
6294
6295void LSRInstance::print_fixups(raw_ostream &OS) const {
6296 OS << "LSR is examining the following fixup sites:\n";
6297 for (const LSRUse &LU : Uses)
6298 for (const LSRFixup &LF : LU.Fixups) {
6299 dbgs() << " ";
6300 LF.print(OS);
6301 OS << '\n';
6302 }
6303}
6304
6305void LSRInstance::print_uses(raw_ostream &OS) const {
6306 OS << "LSR is examining the following uses:\n";
6307 for (const LSRUse &LU : Uses) {
6308 dbgs() << " ";
6309 LU.print(OS);
6310 OS << '\n';
6311 for (const Formula &F : LU.Formulae) {
6312 OS << " ";
6313 F.print(OS);
6314 OS << '\n';
6315 }
6316 }
6317}
6318
/// Print the complete LSR state (factors/types, fixup sites, and uses with
/// their formulae) to OS.
void LSRInstance::print(raw_ostream &OS) const {
  print_factors_and_types(OS);
  print_fixups(OS);
  print_uses(OS);
}
6324
6325LLVM_DUMP_METHOD void LSRInstance::dump() const {
6326 print(errs()); errs() << '\n';
6327}
6328#endif
6329
namespace {

/// Legacy pass manager wrapper that drives the LSR transformation on each
/// loop via runOnLoop.
class LoopStrengthReduce : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid

  LoopStrengthReduce();

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;
};

} // end anonymous namespace
6344
// Construct the legacy pass. NOTE(review): the pass-registry initialization
// line inside the body is elided from this view.
LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
}
6348
// Declare the analyses this pass requires and those it preserves. Several
// lines of this list are elided from this view.
void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG. However, we do update
  // many analyses if they are around.

  AU.addRequired<LoopInfoWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addPreserved<ScalarEvolutionWrapperPass>();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetLibraryInfoWrapperPass>();
  // Requiring LoopSimplify a second time here prevents IVUsers from running
  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
  AU.addRequired<IVUsersWrapperPass>();
  AU.addPreserved<IVUsersWrapperPass>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  AU.addPreserved<MemorySSAWrapperPass>();
}
6371
6372namespace {
6373
/// Enables more convenient iteration over a DWARF expression vector.
/// Wraps the raw uint64_t buffer in DIExpression operation iterators.
/// NOTE(review): the return-type line of this function is elided from this
/// view.
ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
  llvm::DIExpression::expr_op_iterator Begin =
      llvm::DIExpression::expr_op_iterator(Expr.begin());
  llvm::DIExpression::expr_op_iterator End =
      llvm::DIExpression::expr_op_iterator(Expr.end());
  return {Begin, End};
}
6383
/// Incrementally translates SCEV expressions into a DWARF expression (a
/// vector of DWARF operations) plus the list of location operands the
/// expression references, for salvaging dbg.values after LSR.
struct SCEVDbgValueBuilder {
  SCEVDbgValueBuilder() = default;
  SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }

  /// Copy another builder's expression and location operands into this one.
  void clone(const SCEVDbgValueBuilder &Base) {
    LocationOps = Base.LocationOps;
    Expr = Base.Expr;
  }

  /// Reset to an empty expression with no location operands.
  void clear() {
    LocationOps.clear();
    Expr.clear();
  }

  /// The DIExpression as we translate the SCEV.
  /// The location ops of the DIExpression.
  SmallVector<Value *, 2> LocationOps;

  // Append a raw DWARF operator / operand to the expression being built.
  void pushOperator(uint64_t Op) { Expr.push_back(Op); }
  void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }

  /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
  /// in the set of values referenced by the expression.
  void pushLocation(llvm::Value *V) {
    auto *It = llvm::find(LocationOps, V);
    unsigned ArgIndex = 0;
    if (It != LocationOps.end()) {
      // V is already a location operand; reuse its existing index.
      ArgIndex = std::distance(LocationOps.begin(), It);
    } else {
      // First reference to V: append it and use the new slot's index.
      ArgIndex = LocationOps.size();
      LocationOps.push_back(V);
    }
    Expr.push_back(ArgIndex);
  }

  void pushValue(const SCEVUnknown *U) {
    llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
    pushLocation(V);
  }

  /// Emit the constant as a signed DW_OP_consts. Returns false (emitting
  /// nothing) if the constant does not fit in a signed 64-bit value.
  bool pushConst(const SCEVConstant *C) {
    if (C->getAPInt().getSignificantBits() > 64)
      return false;
    Expr.push_back(llvm::dwarf::DW_OP_consts);
    Expr.push_back(C->getAPInt().getSExtValue());
    return true;
  }

  // Iterating the expression as DWARF ops is convenient when updating
  // DWARF_OP_LLVM_args.
  // NOTE(review): the signature line of this accessor is elided from this
  // view.
    return ToDwarfOpIter(Expr);
  }

  /// Several SCEV types are sequences of the same arithmetic operator applied
  /// to constants and values that may be extended or truncated.
  bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
                          uint64_t DwarfOp) {
    assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
           "Expected arithmetic SCEV type");
    bool Success = true;
    unsigned EmitOperator = 0;
    for (const auto &Op : CommExpr->operands()) {
      Success &= pushSCEV(Op);

      // Emit the operator after every operand except the first, yielding
      // left-associated postfix form: a b op c op ...
      if (EmitOperator >= 1)
        pushOperator(DwarfOp);
      ++EmitOperator;
    }
    return Success;
  }

  // TODO: Identify and omit noop casts.
  bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
    const llvm::SCEV *Inner = C->getOperand(0);
    const llvm::Type *Type = C->getType();
    uint64_t ToWidth = Type->getIntegerBitWidth();
    bool Success = pushSCEV(Inner);
    // DW_OP_LLVM_convert takes the destination bit width and signedness.
    uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
                          IsSigned ? llvm::dwarf::DW_ATE_signed
                                   : llvm::dwarf::DW_ATE_unsigned};
    for (const auto &Op : CastOps)
      pushOperator(Op);
    return Success;
  }

  // TODO: MinMax - although these haven't been encountered in the test suite.
  bool pushSCEV(const llvm::SCEV *S) {
    bool Success = true;
    if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
      Success &= pushConst(StartInt);

    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
      // An unknown SCEV with no underlying value cannot be expressed.
      if (!U->getValue())
        return false;
      pushLocation(U->getValue());

    } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
      Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);

    } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
      Success &= pushSCEV(UDiv->getLHS());
      Success &= pushSCEV(UDiv->getRHS());
      pushOperator(llvm::dwarf::DW_OP_div);

    } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
      // Assert if a new and unknown SCEVCastEXpr type is encountered.
      // NOTE(review): the first line of this assert is elided from this view.
             isa<SCEVSignExtendExpr>(Cast)) &&
             "Unexpected cast type in SCEV.");
      Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));

    } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
      Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);

    } else if (isa<SCEVAddRecExpr>(S)) {
      // Nested SCEVAddRecExpr are generated by nested loops and are currently
      // unsupported.
      return false;

    } else {
      return false;
    }
    return Success;
  }

  /// Return true if the combination of arithmetic operator and underlying
  /// SCEV constant value is an identity function.
  bool isIdentityFunction(uint64_t Op, const SCEV *S) {
    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
      if (C->getAPInt().getSignificantBits() > 64)
        return false;
      int64_t I = C->getAPInt().getSExtValue();
      switch (Op) {
      case llvm::dwarf::DW_OP_plus:
      case llvm::dwarf::DW_OP_minus:
        // x + 0 and x - 0 are identities.
        return I == 0;
      case llvm::dwarf::DW_OP_mul:
      case llvm::dwarf::DW_OP_div:
        // x * 1 and x / 1 are identities.
        return I == 1;
      }
    }
    return false;
  }

  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
  /// builder's expression stack. The stack should already contain an
  /// expression for the iteration count, so that it can be multiplied by
  /// the stride and added to the start.
  /// Components of the expression are omitted if they are an identity function.
  /// Chain (non-affine) SCEVs are not supported.
  bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
    assert(SAR.isAffine() && "Expected affine SCEV");
    const SCEV *Start = SAR.getStart();
    const SCEV *Stride = SAR.getStepRecurrence(SE);

    // Skip pushing arithmetic noops.
    if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
      if (!pushSCEV(Stride))
        return false;
      pushOperator(llvm::dwarf::DW_OP_mul);
    }
    if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
      if (!pushSCEV(Start))
        return false;
      pushOperator(llvm::dwarf::DW_OP_plus);
    }
    return true;
  }

  /// Create an expression that is an offset from a value (usually the IV).
  /// NOTE(review): the lines emitting the offset operation are elided from
  /// this view.
  void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
    pushLocation(OffsetValue);
    LLVM_DEBUG(
        dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
               << std::to_string(Offset) << "\n");
  }

  /// Combine a translation of the SCEV and the IV to create an expression that
  /// recovers a location's value.
  /// returns true if an expression was created.
  bool createIterCountExpr(const SCEV *S,
                           const SCEVDbgValueBuilder &IterationCount,
                           ScalarEvolution &SE) {
    // SCEVs for SSA values are most frequently of the form
    // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
    // This is because %a is a PHI node that is not the IV. However, these
    // SCEVs have not been observed to result in debuginfo-lossy optimisations,
    // so its not expected this point will be reached.
    if (!isa<SCEVAddRecExpr>(S))
      return false;

    LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
                      << '\n');

    const auto *Rec = cast<SCEVAddRecExpr>(S);
    if (!Rec->isAffine())
      return false;

    // NOTE(review): a size-limit check preceding this early return is elided
    // from this view.
      return false;

    // Initialise a new builder with the iteration count expression. In
    // combination with the value's SCEV this enables recovery.
    clone(IterationCount);
    if (!SCEVToValueExpr(*Rec, SE))
      return false;

    return true;
  }

  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
  /// builder's expression stack. The stack should already contain an
  /// expression for the iteration count, so that it can be multiplied by
  /// the stride and added to the start.
  /// Components of the expression are omitted if they are an identity function.
  bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
                           ScalarEvolution &SE) {
    assert(SAR.isAffine() && "Expected affine SCEV");
    const SCEV *Start = SAR.getStart();
    const SCEV *Stride = SAR.getStepRecurrence(SE);

    // Skip pushing arithmetic noops.
    if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
      if (!pushSCEV(Start))
        return false;
      pushOperator(llvm::dwarf::DW_OP_minus);
    }
    if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
      if (!pushSCEV(Stride))
        return false;
      pushOperator(llvm::dwarf::DW_OP_div);
    }
    return true;
  }

  // Append the current expression and locations to a location list and an
  // expression list. Modify the DW_OP_LLVM_arg indexes to account for
  // the locations already present in the destination list.
  void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
                       SmallVectorImpl<Value *> &DestLocations) {
    assert(!DestLocations.empty() &&
           "Expected the locations vector to contain the IV");
    // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
    // modified to account for the locations already in the destination vector.
    // All builders contain the IV as the first location op.
    assert(!LocationOps.empty() &&
           "Expected the location ops to contain the IV.");
    // DestIndexMap[n] contains the index in DestLocations for the nth
    // location in this SCEVDbgValueBuilder.
    SmallVector<uint64_t, 2> DestIndexMap;
    for (const auto &Op : LocationOps) {
      auto It = find(DestLocations, Op);
      if (It != DestLocations.end()) {
        // Location already exists in DestLocations, reuse existing ArgIndex.
        DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
        continue;
      }
      // Location is not in DestLocations, add it.
      DestIndexMap.push_back(DestLocations.size());
      DestLocations.push_back(Op);
    }

    for (const auto &Op : expr_ops()) {
      if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
        Op.appendToVector(DestExpr);
        continue;
      }

      // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
      // DestIndexMap[n] contains its new index in DestLocations.
      uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
      DestExpr.push_back(NewIndex);
    }
  }
};
6665
/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
/// and DIExpression.
struct DVIRecoveryRec {
  DVIRecoveryRec(DbgVariableRecord *DVR)
      : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}

  // The debug record being salvaged.
  DbgVariableRecord *DbgRef;
  // The pre-LSR DIExpression, cached so it can be restored.
  DIExpression *Expr;
  // Whether the record originally used a DIArgList for its locations.
  bool HadLocationArgList;
  // Weak handles to the pre-LSR location operands, so erased values are
  // observed as null rather than dangling.
  SmallVector<WeakVH, 2> LocationOps;

  /// Release the per-location recovery builders.
  void clear() {
    for (auto &RE : RecoveryExprs)
      RE.reset();
    RecoveryExprs.clear();
  }

  ~DVIRecoveryRec() { clear(); }
};
6687} // namespace
6688
/// Returns the total number of DW_OP_llvm_arg operands in the expression.
/// This helps in determining if a DIArglist is necessary or can be omitted from
/// the dbg.value.
/// NOTE(review): the signature line of this function is elided from this view.
  auto expr_ops = ToDwarfOpIter(Expr);
  unsigned Count = 0;
  // Count one per DW_OP_LLVM_arg operation in the expression.
  for (auto Op : expr_ops)
    if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
      Count++;
  return Count;
}
6700
6701/// Overwrites DVI with the location and Ops as the DIExpression. This will
6702/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6703/// because a DIArglist is not created for the first argument of the dbg.value.
6704template <typename T>
6705static void updateDVIWithLocation(T &DbgVal, Value *Location,
6707 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6708 "contain any DW_OP_llvm_arg operands.");
6709 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6710 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6711 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6712}
6713
/// Overwrite DVI with locations placed into a DIArglist.
/// NOTE(review): the final parameter line (the Ops expression vector) and the
/// MetadataLocs declaration are elided from this view.
template <typename T>
static void updateDVIWithLocations(T &DbgVal,
                                   SmallVectorImpl<Value *> &Locations,
  assert(numLLVMArgOps(Ops) != 0 &&
         "Expected expression that references DIArglist locations using "
         "DW_OP_llvm_arg operands.");
  // Wrap every location in metadata and bundle them into a DIArgList.
  for (Value *V : Locations)
    MetadataLocs.push_back(ValueAsMetadata::get(V));
  auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
  DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
}
6729
/// Write the new expression and new location ops for the dbg.value. If possible
/// reduce the size of the dbg.value by omitting DIArglist. This
/// can be omitted if:
/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
/// NOTE(review): the final parameter line (NewExpr) and the ShortenedOps
/// declaration are elided from this view.
static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
                           SmallVectorImpl<Value *> &NewLocationOps,
  DbgVariableRecord *DbgVal = DVIRec.DbgRef;
  unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
  if (NumLLVMArgs == 0) {
    // Location assumed to be on the stack.
    updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
  } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
    // There is only a single DW_OP_llvm_arg at the start of the expression,
    // so it can be omitted along with DIArglist.
    assert(NewExpr[1] == 0 &&
           "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
    updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
  } else {
    // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
    updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
  }

  // If the DIExpression was previously empty then add the stack terminator.
  // Non-empty expressions have only had elements inserted into them and so
  // the terminator should already be present e.g. stack_value or fragment.
  DIExpression *SalvageExpr = DbgVal->getExpression();
  if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
    SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
    DbgVal->setExpression(SalvageExpr);
  }
}
6764
/// Cached location ops may be erased during LSR, in which case a poison is
/// required when restoring from the cache. The type of that location is no
/// longer available, so just use int8. The poison will be replaced by one or
/// more locations later when a SCEVDbgValueBuilder selects alternative
/// locations to use for the salvage.
/// NOTE(review): the signature line of this function is elided from this view.
  return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
}
6773
/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
  DbgVariableRecord *DbgVal = DVIRec.DbgRef;
  LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
                    << "scev-salvage: post-LSR: " << *DbgVal << '\n');
  assert(DVIRec.Expr && "Expected an expression");
  DbgVal->setExpression(DVIRec.Expr);

  // Even a single location-op may be inside a DIArgList and referenced with
  // DW_OP_LLVM_arg, which is valid only with a DIArgList.
  if (!DVIRec.HadLocationArgList) {
    assert(DVIRec.LocationOps.size() == 1 &&
           "Unexpected number of location ops.");
    // LSR's unsuccessful salvage attempt may have added DIArgList, which in
    // this case was not present before, so force the location back to a
    // single uncontained Value.
    Value *CachedValue =
        getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
    DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
  } else {
    // Rebuild the original DIArgList, substituting poison for any location
    // values that were erased during the transform.
    // NOTE(review): the MetadataLocs declaration is elided from this view.
    for (WeakVH VH : DVIRec.LocationOps) {
      Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
      MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
    }
    auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
    DbgVal->setRawLocation(
        llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
  }
  LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
}
6805
// Attempt to salvage one dbg.value record using its cached pre-LSR SCEVs and
// the post-LSR induction variable. Returns true on success.
// NOTE(review): the first line of the signature, and the call that restores
// the record's pre-transform state, are elided from this view.
    llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
    const SCEV *SCEVInductionVar,
    SCEVDbgValueBuilder IterCountExpr) {

  // Only records whose locations were killed by the transform need salvage.
  if (!DVIRec.DbgRef->isKillLocation())
    return false;

  // LSR may have caused several changes to the dbg.value in the failed salvage
  // attempt. So restore the DIExpression, the location ops and also the
  // location ops format, which is always DIArglist for multiple ops, but only
  // sometimes for a single op.

  // LocationOpIndexMap[i] will store the post-LSR location index of
  // the non-optimised out location at pre-LSR index i.
  SmallVector<int64_t, 2> LocationOpIndexMap;
  LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
  SmallVector<Value *, 2> NewLocationOps;
  NewLocationOps.push_back(LSRInductionVar);

  for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
    WeakVH VH = DVIRec.LocationOps[i];
    // Place the locations not optimised out in the list first, avoiding
    // inserts later. The map is used to update the DIExpression's
    // DW_OP_LLVM_arg arguments as the expression is updated.
    if (VH && !isa<UndefValue>(VH)) {
      NewLocationOps.push_back(VH);
      LocationOpIndexMap[i] = NewLocationOps.size() - 1;
      LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
                        << " now at index " << LocationOpIndexMap[i] << "\n");
      continue;
    }

    // It's possible that a value referred to in the SCEV may have been
    // optimised out by LSR.
    if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
        SE.containsUndefs(DVIRec.SCEVs[i])) {
      LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
                        << " refers to a location that is now undef or erased. "
                           "Salvage abandoned.\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
                      << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");

    DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
    SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();

    // Create an offset-based salvage expression if possible, as it requires
    // less DWARF ops than an iteration count-based expression.
    if (std::optional<APInt> Offset =
            SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
      if (Offset->getSignificantBits() <= 64)
        SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
      else
        return false;
    } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
                                                 SE))
      return false;
  }

  // Merge the DbgValueBuilder generated expressions and the original
  // DIExpression, place the result into a new vector.
  // NOTE(review): the NewExpr declaration is elided from this view.
  if (DVIRec.Expr->getNumElements() == 0) {
    assert(DVIRec.RecoveryExprs.size() == 1 &&
           "Expected only a single recovery expression for an empty "
           "DIExpression.");
    assert(DVIRec.RecoveryExprs[0] &&
           "Expected a SCEVDbgSalvageBuilder for location 0");
    SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
    B->appendToVectors(NewExpr, NewLocationOps);
  }
  for (const auto &Op : DVIRec.Expr->expr_ops()) {
    // Most Ops needn't be updated.
    if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
      Op.appendToVector(NewExpr);
      continue;
    }

    uint64_t LocationArgIndex = Op.getArg(0);
    SCEVDbgValueBuilder *DbgBuilder =
        DVIRec.RecoveryExprs[LocationArgIndex].get();
    // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
    // optimise it away. So just translate the argument to the updated
    // location index.
    if (!DbgBuilder) {
      NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
      assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
             "Expected a positive index for the location-op position.");
      NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
      continue;
    }
    // The location has a recovery expression.
    DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
  }

  UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
  LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
  return true;
}
6909
/// Obtain an expression for the iteration count, then attempt to salvage the
/// dbg.value intrinsics.
/// NOTE(review): the first line of the signature is elided from this view.
    llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
    SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
  if (DVIToUpdate.empty())
    return;

  const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
  assert(SCEVInductionVar &&
         "Anticipated a SCEV for the post-LSR induction variable");

  // Salvage is only attempted for affine add-recurrence IVs.
  if (const SCEVAddRecExpr *IVAddRec =
          dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
    if (!IVAddRec->isAffine())
      return;

    // Prevent translation using excessive resources.
    if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
      return;

    // The iteration count is required to recover location values.
    SCEVDbgValueBuilder IterCountExpr;
    IterCountExpr.pushLocation(LSRInductionVar);
    if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
      return;

    LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
                      << '\n');

    // Salvage each cached record independently; failures leave the record in
    // its restored pre-LSR state.
    for (auto &DVIRec : DVIToUpdate) {
      SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
                 IterCountExpr);
    }
  }
}
6946
6947/// Identify and cache salvageable DVI locations and expressions along with the
6948/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6949/// cacheing and salvaging.
6951 Loop *L, ScalarEvolution &SE,
6952 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6953 for (const auto &B : L->getBlocks()) {
6954 for (auto &I : *B) {
6955 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6956 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6957 continue;
6958
6959 // Ensure that if any location op is undef that the dbg.vlue is not
6960 // cached.
6961 if (DbgVal.isKillLocation())
6962 continue;
6963
6964 // Check that the location op SCEVs are suitable for translation to
6965 // DIExpression.
6966 const auto &HasTranslatableLocationOps =
6967 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6968 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6969 if (!LocOp)
6970 return false;
6971
6972 if (!SE.isSCEVable(LocOp->getType()))
6973 return false;
6974
6975 const SCEV *S = SE.getSCEV(LocOp);
6976 if (SE.containsUndefs(S))
6977 return false;
6978 }
6979 return true;
6980 };
6981
6982 if (!HasTranslatableLocationOps(DbgVal))
6983 continue;
6984
6985 std::unique_ptr<DVIRecoveryRec> NewRec =
6986 std::make_unique<DVIRecoveryRec>(&DbgVal);
6987 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6988 // it. Pre-allocating a vector will enable quick lookups of the builder
6989 // later during the salvage.
6990 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6991 for (const auto LocOp : DbgVal.location_ops()) {
6992 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6993 NewRec->LocationOps.push_back(LocOp);
6994 NewRec->HadLocationArgList = DbgVal.hasArgList();
6995 }
6996 SalvageableDVISCEVs.push_back(std::move(NewRec));
6997 }
6998 }
6999 }
7000}
7001
7002/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7003/// any PHi from the loop header is usable, but may have less chance of
7004/// surviving subsequent transforms.
7006 const LSRInstance &LSR) {
7007
7008 auto IsSuitableIV = [&](PHINode *P) {
7009 if (!SE.isSCEVable(P->getType()))
7010 return false;
7011 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7012 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7013 return false;
7014 };
7015
7016 // For now, just pick the first IV that was generated and inserted by
7017 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7018 // by subsequent transforms.
7019 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7020 if (!IV)
7021 continue;
7022
7023 // There should only be PHI node IVs.
7024 PHINode *P = cast<PHINode>(&*IV);
7025
7026 if (IsSuitableIV(P))
7027 return P;
7028 }
7029
7030 for (PHINode &P : L.getHeader()->phis()) {
7031 if (IsSuitableIV(&P))
7032 return &P;
7033 }
7034 return nullptr;
7035}
7036
7038 DominatorTree &DT, LoopInfo &LI,
7039 const TargetTransformInfo &TTI,
7041 MemorySSA *MSSA) {
7042
7043 // Debug preservation - before we start removing anything identify which DVI
7044 // meet the salvageable criteria and store their DIExpression and SCEVs.
7045 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7046 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7047
7048 bool Changed = false;
7049 std::unique_ptr<MemorySSAUpdater> MSSAU;
7050 if (MSSA)
7051 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7052
7053 // Run the main LSR transformation.
7054 const LSRInstance &Reducer =
7055 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7056 Changed |= Reducer.getChanged();
7057
7058 // Remove any extra phis created by processing inner loops.
7059 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7060 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7062 SCEVExpander Rewriter(SE, "lsr", false);
7063#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7064 Rewriter.setDebugType(DEBUG_TYPE);
7065#endif
7066 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7067 Rewriter.clear();
7068 if (numFolded) {
7069 Changed = true;
7071 MSSAU.get());
7072 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7073 }
7074 }
7075 // LSR may at times remove all uses of an induction variable from a loop.
7076 // The only remaining use is the PHI in the exit block.
7077 // When this is the case, if the exit value of the IV can be calculated using
7078 // SCEV, we can replace the exit block PHI with the final value of the IV and
7079 // skip the updates in each loop iteration.
7080 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7082 SCEVExpander Rewriter(SE, "lsr", true);
7083 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7084 UnusedIndVarInLoop, DeadInsts);
7085 Rewriter.clear();
7086 if (Rewrites) {
7087 Changed = true;
7089 MSSAU.get());
7090 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7091 }
7092 }
7093
7094 if (SalvageableDVIRecords.empty())
7095 return Changed;
7096
7097 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7098 // expressions composed using the derived iteration count.
7099 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7100 for (const auto &L : LI) {
7101 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7102 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7103 else {
7104 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7105 "could not be identified.\n");
7106 }
7107 }
7108
7109 for (auto &Rec : SalvageableDVIRecords)
7110 Rec->clear();
7111 SalvageableDVIRecords.clear();
7112 return Changed;
7113}
7114
7115bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7116 if (skipLoop(L))
7117 return false;
7118
7119 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7120 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7121 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7122 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7123 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7124 *L->getHeader()->getParent());
7125 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7126 *L->getHeader()->getParent());
7127 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7128 *L->getHeader()->getParent());
7129 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7130 MemorySSA *MSSA = nullptr;
7131 if (MSSAAnalysis)
7132 MSSA = &MSSAAnalysis->getMSSA();
7133 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7134}
7135
7138 LPMUpdater &) {
7139 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7140 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7141 return PreservedAnalyses::all();
7142
7143 auto PA = getLoopPassPreservedAnalyses();
7144 if (AR.MSSA)
7145 PA.preserve<MemorySSAAnalysis>();
7146 return PA;
7147}
7148
7149char LoopStrengthReduce::ID = 0;
7150
7151INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7152 "Loop Strength Reduction", false, false)
7158INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7159INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7160 "Loop Strength Reduction", false, false)
7161
7162Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< SCEVUse > &Good, SmallVectorImpl< SCEVUse > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static GlobalValue * ExtractSymbol(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1655
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1546
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1747
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:539
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:397
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
bool isUnconditional() const
Value * getCondition() const
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Value * getCondition() const
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:321
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyze scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< SCEVUse > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAddRecExpr(SCEVUse Start, SCEVUse Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:235
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Use * op_iterator
Definition User.h:254
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
iterator_range< use_iterator > uses()
Definition Value.h:381
A nullable Value handle that is nullable.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1726
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2128
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:553
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
@ UnusedIndVarInLoop
Definition LoopUtils.h:572
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.