LLVM 23.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
/// An arbitrary cutoff on the number of IV users, giving an early opportunity
/// to bail out. It is far beyond the number of users LSR could conceivably
/// solve, so it should not affect generated code, but it stops LSR from
/// burning excessive compile time and stack space on pathological inputs.
static constexpr unsigned MaxIVUsers = 200;

/// Cap on the size of expression that SCEV-based salvaging will attempt to
/// translate into a DIExpression. Chosen so that debuginfo is not excessively
/// increased and the salvaging is not too expensive for the compiler.
static constexpr unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
/// This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  // Debug printing helpers; definitions appear only under
  // !NDEBUG || LLVM_ENABLE_DUMP below.
  void print(raw_ostream &OS) const;
  void dump() const;
};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a compact summary: the number of LSRUses referencing this register.
void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << UsedByIndices.count() << ']';
}

// Convenience wrapper that prints to stderr with a trailing newline.
LLVM_DUMP_METHOD void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing
474/// satisfying a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty. The
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing recurrent expr related with currect loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch.
543static void DoInitialMatch(const SCEV *S, Loop *L,
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
601 DoInitialMatch(S, L, Good, Bad, SE);
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
614 canonicalize(*L);
615}
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
633 if (Scale != 1)
634 return true;
635
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
643 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
644 // loop, we want to swap the reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
656void Formula::canonicalize(const Loop &L) {
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
736void Formula::deleteBaseReg(const SCEV *&S) {
737 if (&S != &BaseRegs.back())
738 std::swap(S, BaseRegs.back());
739 BaseRegs.pop_back();
740}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the formula as a "+"-separated sum of its components:
// base global, base offset, base registers, scaled register, unfolded offset.
void Formula::print(raw_ostream &OS) const {
  ListSeparator Plus(" + ");
  if (BaseGV) {
    OS << Plus;
    BaseGV->printAsOperand(OS, /*PrintType=*/false);
  }
  if (BaseOffset.isNonZero())
    OS << Plus << BaseOffset;

  for (const SCEV *BaseReg : BaseRegs)
    OS << Plus << "reg(" << *BaseReg << ')';

  // Surface violations of the HasBaseReg <-> !BaseRegs.empty() pairing.
  if (HasBaseReg && BaseRegs.empty())
    OS << Plus << "**error: HasBaseReg**";
  else if (!HasBaseReg && !BaseRegs.empty())
    OS << Plus << "**error: !HasBaseReg**";

  if (Scale != 0) {
    OS << Plus << Scale << "*reg(";
    if (ScaledReg)
      OS << *ScaledReg;
    else
      OS << "<unknown>";
    OS << ')';
  }
  if (UnfoldedOffset.isNonZero())
    OS << Plus << "imm(" << UnfoldedOffset << ')';
}

// Convenience wrapper that prints to stderr with a trailing newline.
LLVM_DUMP_METHOD void Formula::dump() const {
  print(errs()); errs() << '\n';
}
793#endif
794
795/// Return true if the given addrec can be sign-extended without changing its
796/// value.
798 Type *WideTy =
800 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
801}
802
803/// Return true if the given add can be sign-extended without changing its
804/// value.
805static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
808 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
809}
810
811/// Return true if the given mul can be sign-extended without changing its
812/// value.
813static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
814 Type *WideTy =
816 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
817 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
818}
819
820/// Return an expression for LHS /s RHS, if it can be determined and if the
821/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
822/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
823/// the multiplication may overflow, which is useful when the result will be
824/// used in a context where the most significant bits are ignored.
825static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
826 ScalarEvolution &SE,
827 bool IgnoreSignificantBits = false) {
828 // Handle the trivial case, which works for any SCEV type.
829 if (LHS == RHS)
830 return SE.getConstant(LHS->getType(), 1);
831
832 // Handle a few RHS special cases.
834 if (RC) {
835 const APInt &RA = RC->getAPInt();
836 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
837 // some folding.
838 if (RA.isAllOnes()) {
839 if (LHS->getType()->isPointerTy())
840 return nullptr;
841 return SE.getMulExpr(LHS, RC);
842 }
843 // Handle x /s 1 as x.
844 if (RA == 1)
845 return LHS;
846 }
847
848 // Check for a division of a constant by a constant.
850 if (!RC)
851 return nullptr;
852 const APInt &LA = C->getAPInt();
853 const APInt &RA = RC->getAPInt();
854 if (LA.srem(RA) != 0)
855 return nullptr;
856 return SE.getConstant(LA.sdiv(RA));
857 }
858
859 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
861 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
862 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
863 IgnoreSignificantBits);
864 if (!Step) return nullptr;
865 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
866 IgnoreSignificantBits);
867 if (!Start) return nullptr;
868 // FlagNW is independent of the start value, step direction, and is
869 // preserved with smaller magnitude steps.
870 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
871 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
872 }
873 return nullptr;
874 }
875
876 // Distribute the sdiv over add operands, if the add doesn't overflow.
878 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
880 for (const SCEV *S : Add->operands()) {
881 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
882 if (!Op) return nullptr;
883 Ops.push_back(Op);
884 }
885 return SE.getAddExpr(Ops);
886 }
887 return nullptr;
888 }
889
890 // Check for a multiply operand that we can pull RHS out of.
892 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
893 // Handle special case C1*X*Y /s C2*X*Y.
894 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
895 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
896 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
897 const SCEVConstant *RC =
898 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
899 if (LC && RC) {
901 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
902 if (LOps == ROps)
903 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
904 }
905 }
906 }
907
909 bool Found = false;
910 for (const SCEV *S : Mul->operands()) {
911 if (!Found)
912 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
913 IgnoreSignificantBits)) {
914 S = Q;
915 Found = true;
916 }
917 Ops.push_back(S);
918 }
919 return Found ? SE.getMulExpr(Ops) : nullptr;
920 }
921 return nullptr;
922 }
923
924 // Otherwise we don't know.
925 return nullptr;
926}
927
928/// If S involves the addition of a constant integer value, return that integer
929/// value, and mutate S to point to a new SCEV with that value excluded.
930static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE) {
931 const APInt *C;
932 if (match(S, m_scev_APInt(C))) {
933 if (C->getSignificantBits() <= 64) {
934 S = SE.getConstant(S->getType(), 0);
935 return Immediate::getFixed(C->getSExtValue());
936 }
937 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
938 SmallVector<SCEVUse, 8> NewOps(Add->operands());
939 Immediate Result = ExtractImmediate(NewOps.front(), SE);
940 if (Result.isNonZero())
941 S = SE.getAddExpr(NewOps);
942 return Result;
943 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
944 SmallVector<SCEVUse, 8> NewOps(AR->operands());
945 Immediate Result = ExtractImmediate(NewOps.front(), SE);
946 if (Result.isNonZero())
947 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
948 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
950 return Result;
951 } else if (EnableVScaleImmediates &&
953 S = SE.getConstant(S->getType(), 0);
954 return Immediate::getScalable(C->getSExtValue());
955 }
956 return Immediate::getZero();
957}
958
/// If S involves the addition of a GlobalValue address, return that symbol, and
/// mutate S to point to a new SCEV with that value excluded.
// NOTE(review): the signature line appears to be missing from this extract
// (presumably `static GlobalValue *ExtractSymbol(SCEVUse &S,
// ScalarEvolution &SE) {`) — verify against upstream.
  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
    // A bare global: strip it and leave a zero of the pointer type behind.
    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
      S = SE.getConstant(GV->getType(), 0);
      return GV;
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // In SCEV canonical form a symbol, if present, is the last operand.
    SmallVector<SCEVUse, 8> NewOps(Add->operands());
    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
    if (Result)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    // For an addrec, only the start operand can carry a symbol.
    SmallVector<SCEVUse, 8> NewOps(AR->operands());
    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
    if (Result)
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
    // NOTE(review): the wrap-flags argument line appears to be missing from
    // this extract (likely `SCEV::FlagAnyWrap);`) — verify against upstream.
    return Result;
  }
  // No extractable symbol found.
  return nullptr;
}
984
/// Returns true if the specified instruction is using the specified value as an
/// address.
// NOTE(review): the first line of this definition appears to be missing from
// this extract (presumably `static bool isAddressUse(const
// TargetTransformInfo &TTI,`) — verify against upstream.
                        Instruction *Inst, Value *OperandVal) {
  // A load always uses its (sole) pointer operand as an address.
  bool isAddress = isa<LoadInst>(Inst);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    // For a store, only the pointer operand is an address; the stored value
    // operand is not.
    if (SI->getPointerOperand() == OperandVal)
      isAddress = true;
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    case Intrinsic::memset:
    case Intrinsic::prefetch:
    case Intrinsic::masked_load:
      if (II->getArgOperand(0) == OperandVal)
        isAddress = true;
      break;
    case Intrinsic::masked_store:
      if (II->getArgOperand(1) == OperandVal)
        isAddress = true;
      break;
    case Intrinsic::memmove:
    case Intrinsic::memcpy:
      // Both the destination (0) and source (1) pointers are addresses.
      if (II->getArgOperand(0) == OperandVal ||
          II->getArgOperand(1) == OperandVal)
        isAddress = true;
      break;
    default: {
      // Ask the target whether this is one of its own memory intrinsics and,
      // if so, which operand is the pointer.
      MemIntrinsicInfo IntrInfo;
      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
        if (IntrInfo.PtrVal == OperandVal)
          isAddress = true;
      }
    }
    }
  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
    if (RMW->getPointerOperand() == OperandVal)
      isAddress = true;
  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    if (CmpX->getPointerOperand() == OperandVal)
      isAddress = true;
  }
  return isAddress;
}
1030
1031/// Return the type of the memory being accessed.
1032static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1033 Instruction *Inst, Value *OperandVal) {
1034 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1035
1036 // First get the type of memory being accessed.
1037 if (Type *Ty = Inst->getAccessType())
1038 AccessTy.MemTy = Ty;
1039
1040 // Then get the pointer address space.
1041 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1042 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1043 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1044 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1045 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1046 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1047 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1048 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1049 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1050 switch (II->getIntrinsicID()) {
1051 case Intrinsic::prefetch:
1052 case Intrinsic::memset:
1053 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1054 AccessTy.MemTy = OperandVal->getType();
1055 break;
1056 case Intrinsic::memmove:
1057 case Intrinsic::memcpy:
1058 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1059 AccessTy.MemTy = OperandVal->getType();
1060 break;
1061 case Intrinsic::masked_load:
1062 AccessTy.AddrSpace =
1063 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1064 break;
1065 case Intrinsic::masked_store:
1066 AccessTy.AddrSpace =
1067 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1068 break;
1069 default: {
1070 MemIntrinsicInfo IntrInfo;
1071 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1072 AccessTy.AddrSpace
1073 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1074 }
1075
1076 break;
1077 }
1078 }
1079 }
1080
1081 return AccessTy;
1082}
1083
1084/// Return true if this AddRec is already a phi in its loop.
1085static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1086 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1087 if (SE.isSCEVable(PN.getType()) &&
1088 (SE.getEffectiveSCEVType(PN.getType()) ==
1089 SE.getEffectiveSCEVType(AR->getType())) &&
1090 SE.getSCEV(&PN) == AR)
1091 return true;
1092 }
1093 return false;
1094}
1095
/// Check if expanding this expression is likely to incur significant cost. This
/// is tricky because SCEV doesn't track which expressions are actually computed
/// by the current IR.
///
/// We currently allow expansion of IV increments that involve adds,
/// multiplication by constants, and AddRecs from existing phis.
///
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
/// obvious multiple of the UDivExpr.
static bool isHighCostExpansion(const SCEV *S,
                                // NOTE(review): a parameter line appears to
                                // be missing from this extract (presumably
                                // `SmallPtrSetImpl<const SCEV *> &Processed,`)
                                // — verify against upstream.
                                ScalarEvolution &SE) {
  // Zero/One operand expressions
  switch (S->getSCEVType()) {
  case scUnknown:
  case scConstant:
  case scVScale:
    return false;
  case scTruncate:
    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
                               Processed, SE);
  case scZeroExtend:
    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
                               Processed, SE);
  case scSignExtend:
    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
                               Processed, SE);
  default:
    break;
  }

  // Memoize: an expression that was already analyzed is not re-flagged.
  if (!Processed.insert(S).second)
    return false;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // An add is cheap iff every operand is cheap.
    for (const SCEV *S : Add->operands()) {
      if (isHighCostExpansion(S, Processed, SE))
        return true;
    }
    return false;
  }

  const SCEV *Op0, *Op1;
  if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
    // Multiplication by a constant is ok
    if (isa<SCEVConstant>(Op0))
      return isHighCostExpansion(Op1, Processed, SE);

    // If we have the value of one operand, check if an existing
    // multiplication already generates this expression.
    if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
      Value *UVal = U->getValue();
      for (User *UR : UVal->users()) {
        // If U is a constant, it may be used by a ConstantExpr.
        // NOTE(review): a line appears to be missing from this extract here
        // (presumably `Instruction *UI = dyn_cast<Instruction>(UR);`) —
        // verify against upstream.
        if (UI && UI->getOpcode() == Instruction::Mul &&
            SE.isSCEVable(UI->getType())) {
          return SE.getSCEV(UI) == S;
        }
      }
    }
  }

  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    // An addrec that already exists as a phi costs nothing to "expand".
    if (isExistingPhi(AR, SE))
      return false;
  }

  // Fow now, consider any other type of expression (div/mul/min/max) high cost.
  return true;
}
1167
namespace {

// Forward declaration so the cost helpers declared below can name LSRUse
// before its full definition later in this file.
class LSRUse;

} // end anonymous namespace

/// Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
/// defines and up to 1 base + 1 scaled + offset.
/// In other words, if \p F has several base registers, this function may
/// still return true. Therefore, users still need to account for
/// additional base registers and/or unfolded offsets to derive an
/// accurate cost model.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 const LSRUse &LU, const Formula &F);

// Get the cost of the scaling factor used in F for LU.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
                                            const LSRUse &LU, const Formula &F,
                                            const Loop &L);
1190
1191namespace {
1192
/// This class is used to measure and compare candidate formulae.
class Cost {
  const Loop *L = nullptr;
  ScalarEvolution *SE = nullptr;
  const TargetTransformInfo *TTI = nullptr;
  // The individual metrics; see TargetTransformInfo::LSRCost for fields.
  TargetTransformInfo::LSRCost C;
  // NOTE(review): a member declaration appears to be missing from this
  // extract here (the `AMK` addressing-mode-kind member referenced by the
  // constructor's init list below) — verify against upstream.

public:
  Cost() = delete;
  Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
       // NOTE(review): the constructor's trailing parameter line (the AMK
       // argument and the `:` introducing the init list) appears to be
       // missing from this extract — verify against upstream.
       L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
    // Start from an all-zero cost; metrics are accumulated by RateFormula.
    C.Insns = 0;
    C.NumRegs = 0;
    C.AddRecCost = 0;
    C.NumIVMuls = 0;
    C.NumBaseAdds = 0;
    C.ImmCost = 0;
    C.SetupCost = 0;
    C.ScaleCost = 0;
  }

  /// Return true if this cost compares strictly better than \p Other.
  bool isLess(const Cost &Other) const;

  /// Force this cost to the worst possible (losing) value.
  void Lose();

#ifndef NDEBUG
  // Once any of the metrics loses, they must all remain losers.
  bool isValid() {
    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
        || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
             & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
  }
#endif

  /// Return true if Lose() has been called on this cost.
  bool isLoser() {
    assert(isValid() && "invalid cost");
    return C.NumRegs == ~0u;
  }

  void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                   const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
                   bool HardwareLoopProfitable,
                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);

  void print(raw_ostream &OS) const;
  void dump() const;

private:
  void RateRegister(const Formula &F, const SCEV *Reg,
                    SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
                    bool HardwareLoopProfitable);
  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                           SmallPtrSetImpl<const SCEV *> &Regs,
                           const LSRUse &LU, bool HardwareLoopProfitable,
                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
1252
/// An operand value in an instruction which is to be replaced with some
/// equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// The instruction which will be updated.
  Instruction *UserInst = nullptr;

  /// The operand of the instruction which will be replaced. The operand may be
  /// used more than once; every instance will be replaced.
  Value *OperandValToReplace = nullptr;

  /// If this user is to use the post-incremented value of an induction
  /// variable, this set is non-empty and holds the loops associated with the
  /// induction variable.
  PostIncLoopSet PostIncLoops;

  /// A constant offset to be added to the LSRUse expression. This allows
  /// multiple fixups to share the same LSRUse with different offsets, for
  /// example in an unrolled loop.
  Immediate Offset = Immediate::getZero();

  LSRFixup() = default;

  /// Return true if this fixup's value is only ever used outside of \p L.
  bool isUseFullyOutsideLoop(const Loop *L) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};
1280
/// This class holds the state that LSR keeps for each use in IVUsers, as well
/// as uses invented by LSR itself. It includes information about what kinds of
/// things can be folded into the user, information about the user itself, and
/// information about how the use may be satisfied. TODO: Represent multiple
/// users of the same expression in common?
class LSRUse {
  // De-duplicates formulae by their (sorted) register sets.
  DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;

public:
  /// An enum for a kind of use, indicating what types of scaled and immediate
  /// operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;

  KindType Kind;
  MemAccessTy AccessTy;

  /// The list of operands which are to be replaced.
  // NOTE(review): the member declaration appears to be missing from this
  // extract here (presumably `SmallVector<LSRFixup, 8> Fixups;`, which the
  // methods below use) — verify against upstream.

  /// Keep track of the min and max offsets of the fixups.
  Immediate MinOffset = Immediate::getFixedMax();
  Immediate MaxOffset = Immediate::getFixedMin();

  /// This records whether all of the fixups using this LSRUse are outside of
  /// the loop, in which case some special-case heuristics may be used.
  bool AllFixupsOutsideLoop = true;

  /// This records whether all of the fixups using this LSRUse are unconditional
  /// within the loop, meaning they will be executed on every path to the loop
  /// latch. This includes fixups before early exits.
  bool AllFixupsUnconditional = true;

  /// RigidFormula is set to true to guarantee that this use will be associated
  /// with a single formula--the one that initially matched. Some SCEV
  /// expressions cannot be expanded. This allows LSR to consider the registers
  /// used by those expressions without the need to expand them later after
  /// changing the formula.
  bool RigidFormula = false;

  /// A list of ways to build a value that can satisfy this user. After the
  /// list is populated, one of these is selected heuristically and used to
  /// formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}

  /// Append a default-constructed fixup and return it for in-place setup.
  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  /// Append \p f, widening the [MinOffset, MaxOffset] range to cover it.
  void pushFixup(LSRFixup &f) {
    Fixups.push_back(f);
    if (Immediate::isKnownGT(f.Offset, MaxOffset))
      MaxOffset = f.Offset;
    if (Immediate::isKnownLT(f.Offset, MinOffset))
      MinOffset = f.Offset;
  }

  bool HasFormulaWithSameRegs(const Formula &F) const;
  float getNotSelectedProbability(const SCEV *Reg) const;
  bool InsertFormula(const Formula &F, const Loop &L);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  void print(raw_ostream &OS) const;
  void dump() const;
};
1360
1361} // end anonymous namespace
1362
// Declared early so the cost-rating code above its definition can call it;
// the optional Fixup instruction lets the target refine its legality answer.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                 bool HasBaseReg, int64_t Scale,
                                 Instruction *Fixup = nullptr);
1368
/// Rough estimate of the preheader setup cost of materializing \p Reg,
/// recursing into its operands up to \p Depth levels.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
  // NOTE(review): the guard line preceding this `return 1;` appears to be
  // missing from this extract (presumably a leaf-node test such as
  // `if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))`) — verify
  // against upstream.
    return 1;
  // Recursion budget exhausted: treat the rest as free.
  if (Depth == 0)
    return 0;
  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
    return getSetupCost(S->getStart(), Depth - 1);
  if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
    return getSetupCost(S->getOperand(), Depth - 1);
  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
    // Sum the setup cost of all operands of an n-ary expression.
    return std::accumulate(S->operands().begin(), S->operands().end(), 0,
                           [&](unsigned i, const SCEV *Reg) {
                             return i + getSetupCost(Reg, Depth - 1);
                           });
  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
    return getSetupCost(S->getLHS(), Depth - 1) +
           getSetupCost(S->getRHS(), Depth - 1);
  return 0;
}
1388
/// Tally up interesting quantities from the given register.
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                        SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
                        bool HardwareLoopProfitable) {
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
    // If this is an addrec for another loop, it should be an invariant
    // with respect to L since L is the innermost loop (at least
    // for now LSR only handles innermost loops).
    if (AR->getLoop() != L) {
      // If the AddRec exists, consider it's register free and leave it alone.
      if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
        return;

      // It is bad to allow LSR for current loop to add induction variables
      // for its sibling loops.
      if (!AR->getLoop()->contains(L)) {
        Lose();
        return;
      }

      // Otherwise, it will be an invariant with respect to Loop L.
      ++C.NumRegs;
      return;
    }

    unsigned LoopCost = 1;
    if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
        TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
      const SCEV *Start;
      const APInt *Step;
      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
        // If the step size matches the base offset, we could use pre-indexed
        // addressing.
        bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
                           F.BaseOffset.isFixed() &&
                           *Step == F.BaseOffset.getFixedValue();
        bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
                            !isa<SCEVConstant>(Start) &&
                            SE->isLoopInvariant(Start, L);
        // We can only pre or post index when the load/store is unconditional.
        if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
          LoopCost = 0;
      }
    }

    // If the loop counts down to zero and we'll be using a hardware loop then
    // the addrec will be combined into the hardware loop instruction.
    if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
        HardwareLoopProfitable)
      LoopCost = 0;
    C.AddRecCost += LoopCost;

    // Add the step value register, if it needs one.
    // TODO: The non-affine case isn't precisely modeled here.
    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
      if (!Regs.count(AR->getOperand(1))) {
        RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
        if (isLoser())
          return;
      }
    }
  }
  ++C.NumRegs;

  // Rough heuristic; favor registers which don't require extra setup
  // instructions in the preheader.
  C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
  // Ensure we don't, even with the recusion limit, produce invalid costs.
  C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);

  C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
  // NOTE(review): the second operand of this `&&` appears to be missing from
  // this extract (presumably `SE->hasComputableLoopEvolution(Reg, L);`) —
  // verify against upstream.
}
1462
1463/// Record this register in the set. If we haven't seen it before, rate
1464/// it. Optional LoserRegs provides a way to declare any formula that refers to
1465/// one of those regs an instant loser.
1466void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1467 SmallPtrSetImpl<const SCEV *> &Regs,
1468 const LSRUse &LU, bool HardwareLoopProfitable,
1469 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1470 if (LoserRegs && LoserRegs->count(Reg)) {
1471 Lose();
1472 return;
1473 }
1474 if (Regs.insert(Reg).second) {
1475 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1476 if (LoserRegs && isLoser())
1477 LoserRegs->insert(Reg);
1478 }
1479}
1480
/// Accumulate this formula's cost metrics into *this, becoming a "loser" as
/// soon as the formula is known to be unprofitable or invalid.
void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                       const DenseSet<const SCEV *> &VisitedRegs,
                       const LSRUse &LU, bool HardwareLoopProfitable,
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  if (isLoser())
    return;
  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
  // Tally up the registers.
  unsigned PrevAddRecCost = C.AddRecCost;
  unsigned PrevNumRegs = C.NumRegs;
  unsigned PrevNumBaseAdds = C.NumBaseAdds;
  if (const SCEV *ScaledReg = F.ScaledReg) {
    // A register already used by a previously-selected formula loses.
    if (VisitedRegs.count(ScaledReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }
  for (const SCEV *BaseReg : F.BaseRegs) {
    if (VisitedRegs.count(BaseReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }

  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.getNumRegs();
  if (NumBaseParts > 1)
    // Do not count the base and a possible second register if the target
    // allows to fold 2 registers.
    C.NumBaseAdds +=
        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
  C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());

  // Accumulate non-free scaling amounts.
  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();

  // Tally up the non-zero immediates.
  for (const LSRFixup &Fixup : LU.Fixups) {
    if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
      Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
      if (F.BaseGV)
        C.ImmCost += 64; // Handle symbolic values conservatively.
                         // TODO: This should probably be the pointer size.
      else if (Offset.isNonZero())
        C.ImmCost +=
            APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();

      // Check with target if this offset with this instruction is
      // specifically not supported.
      if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
          !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                                Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
        C.NumBaseAdds++;
    } else {
      // Incompatible immediate type, increase cost to avoid using
      C.ImmCost += 2048;
    }
  }

  // If we don't count instruction cost exit here.
  if (!InsnsCost) {
    assert(isValid() && "invalid cost");
    return;
  }

  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
  // additional instruction (at least fill).
  // TODO: Need distinguish register class?
  unsigned TTIRegNum = TTI->getNumberOfRegisters(
      TTI->getRegisterClassForType(false, F.getType())) - 1;
  if (C.NumRegs > TTIRegNum) {
    // Cost already exceeded TTIRegNum, then only newly added register can add
    // new instructions.
    if (PrevNumRegs > TTIRegNum)
      C.Insns += (C.NumRegs - PrevNumRegs);
    else
      C.Insns += (C.NumRegs - TTIRegNum);
  }

  // If ICmpZero formula ends with not 0, it could not be replaced by
  // just add or sub. We'll need to compare final result of AddRec.
  // That means we'll need an additional instruction. But if the target can
  // macro-fuse a compare with a branch, don't count this extra instruction.
  // For -10 + {0, +, 1}:
  // i = i + 1;
  // cmp i, 10
  //
  // For {-10, +, 1}:
  // i = i + 1;
  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
      !TTI->canMacroFuseCmp())
    C.Insns++;
  // Each new AddRec adds 1 instruction to calculation.
  C.Insns += (C.AddRecCost - PrevAddRecCost);

  // BaseAdds adds instructions for unfolded registers.
  if (LU.Kind != LSRUse::ICmpZero)
    C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
  assert(isValid() && "invalid cost");
}
1589
1590/// Set this cost to a losing value.
1591void Cost::Lose() {
1592 C.Insns = std::numeric_limits<unsigned>::max();
1593 C.NumRegs = std::numeric_limits<unsigned>::max();
1594 C.AddRecCost = std::numeric_limits<unsigned>::max();
1595 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1596 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1597 C.ImmCost = std::numeric_limits<unsigned>::max();
1598 C.SetupCost = std::numeric_limits<unsigned>::max();
1599 C.ScaleCost = std::numeric_limits<unsigned>::max();
1600}
1601
1602/// Choose the lower cost.
1603bool Cost::isLess(const Cost &Other) const {
1604 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1605 C.Insns != Other.C.Insns)
1606 return C.Insns < Other.C.Insns;
1607 return TTI->isLSRCostLess(C, Other.C);
1608}
1609
1610#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the accumulated cost metrics in a human-readable form; zero-valued
/// metrics are omitted, and the instruction count is shown only when
/// instruction-cost accounting is enabled.
void Cost::print(raw_ostream &OS) const {
  if (InsnsCost)
    OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
  if (C.AddRecCost != 0)
    OS << ", with addrec cost " << C.AddRecCost;
  if (C.NumIVMuls != 0)
    OS << ", plus " << C.NumIVMuls << " IV mul"
       << (C.NumIVMuls == 1 ? "" : "s");
  if (C.NumBaseAdds != 0)
    OS << ", plus " << C.NumBaseAdds << " base add"
       << (C.NumBaseAdds == 1 ? "" : "s");
  if (C.ScaleCost != 0)
    OS << ", plus " << C.ScaleCost << " scale cost";
  if (C.ImmCost != 0)
    OS << ", plus " << C.ImmCost << " imm cost";
  if (C.SetupCost != 0)
    OS << ", plus " << C.SetupCost << " setup cost";
}
1630
1631LLVM_DUMP_METHOD void Cost::dump() const {
1632 print(errs()); errs() << '\n';
1633}
1634#endif
1635
1636/// Test whether this fixup always uses its value outside of the given loop.
1637bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1638 // PHI nodes use their value in their incoming blocks.
1639 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1640 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1641 if (PN->getIncomingValue(i) == OperandValToReplace &&
1642 L->contains(PN->getIncomingBlock(i)))
1643 return false;
1644 return true;
1645 }
1646
1647 return !L->contains(UserInst);
1648}
1649
1650#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this fixup (user instruction, replaced operand, post-inc loops and
/// offset) in a human-readable form.
void LSRFixup::print(raw_ostream &OS) const {
  OS << "UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    UserInst->printAsOperand(OS, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);

  for (const Loop *PIL : PostIncLoops) {
    OS << ", PostIncLoop=";
    PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
  }

  // Zero offsets are the common case; only print non-zero ones.
  if (Offset.isNonZero())
    OS << ", Offset=" << Offset;
}
1673
1674LLVM_DUMP_METHOD void LSRFixup::dump() const {
1675 print(errs()); errs() << '\n';
1676}
1677#endif
1678
/// Test whether this use as a formula which has the same registers as the given
/// formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
  // NOTE(review): the declaration of `Key` appears to be missing from this
  // extract (presumably a SmallVector<const SCEV *, 4> initialized from
  // F.BaseRegs) — verify against upstream.
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  llvm::sort(Key);
  return Uniquifier.count(Key);
}
1688
1689/// The function returns a probability of selecting formula without Reg.
1690float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1691 unsigned FNum = 0;
1692 for (const Formula &F : Formulae)
1693 if (F.referencesReg(Reg))
1694 FNum++;
1695 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1696}
1697
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
  assert(F.isCanonical(L) && "Invalid canonical representation");

  // A rigid use keeps only the formula that initially matched.
  if (!Formulae.empty() && RigidFormula)
    return false;

  // NOTE(review): the declaration of `Key` appears to be missing from this
  // extract (presumably a SmallVector<const SCEV *, 4> initialized from
  // F.BaseRegs) — verify against upstream.
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  llvm::sort(Key);

  // Reject a formula whose register set duplicates an existing one.
  if (!Uniquifier.insert(Key).second)
    return false;

  // Using a register to hold the value of 0 is not profitable.
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
         "Zero allocated in a scaled register!");
#ifndef NDEBUG
  for (const SCEV *BaseReg : F.BaseRegs)
    assert(!BaseReg->isZero() && "Zero allocated in a base register!");
#endif

  // Add the formula to the list.
  Formulae.push_back(F);

  // Record registers now being used by this use.
  Regs.insert_range(F.BaseRegs);
  if (F.ScaledReg)
    Regs.insert(F.ScaledReg);

  return true;
}
1732
1733/// Remove the given formula from this use's list.
1734void LSRUse::DeleteFormula(Formula &F) {
1735 if (&F != &Formulae.back())
1736 std::swap(F, Formulae.back());
1737 Formulae.pop_back();
1738}
1739
1740/// Recompute the Regs field, and update RegUses.
1741void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1742 // Now that we've filtered out some formulae, recompute the Regs set.
1743 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1744 Regs.clear();
1745 for (const Formula &F : Formulae) {
1746 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1747 Regs.insert_range(F.BaseRegs);
1748 }
1749
1750 // Update the RegTracker.
1751 for (const SCEV *S : OldRegs)
1752 if (!Regs.count(S))
1753 RegUses.dropRegister(S, LUIdx);
1754}
1755
1756#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this use (kind, access type, fixup offsets, and fixup placement
/// flags) in a human-readable form.
void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy.MemTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else {
      OS << *AccessTy.MemTy;
    }

    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
  }

  OS << ", Offsets={";
  bool NeedComma = false;
  for (const LSRFixup &Fixup : Fixups) {
    if (NeedComma) OS << ',';
    OS << Fixup.Offset;
    NeedComma = true;
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (AllFixupsUnconditional)
    OS << ", all-fixups-unconditional";
}
1789
1790LLVM_DUMP_METHOD void LSRUse::dump() const {
1791 print(errs()); errs() << '\n';
1792}
1793#endif
1794
// NOTE(review): the first line of this definition appears to be missing from
// this extract (presumably `static bool isAMCompletelyFolded(const
// TargetTransformInfo &TTI,`) — verify against upstream.
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                 bool HasBaseReg, int64_t Scale,
                                 Instruction *Fixup /* = nullptr */) {
  switch (Kind) {
  case LSRUse::Address: {
    // Split the immediate into fixed and scalable parts for the TTI
    // addressing-mode legality query.
    int64_t FixedOffset =
        BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
    int64_t ScalableOffset =
        BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
                                     HasBaseReg, Scale, AccessTy.AddrSpace,
                                     Fixup, ScalableOffset);
  }
  case LSRUse::ICmpZero:
    // There's not even a target hook for querying whether it would be legal to
    // fold a GV into an ICmp.
    if (BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (Scale != 0 && Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (BaseOffset.isNonZero()) {
      // We don't have an interface to query whether the target supports
      // icmpzero against scalable quantities yet.
      if (BaseOffset.isScalable())
        return false;

      // We have one of:
      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
      // Offs is the ICmp immediate.
      if (Scale == 0)
        // The cast does the right thing with
        // std::numeric_limits<int64_t>::min().
        BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
      return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
    }

    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !BaseGV && Scale == 0 && BaseOffset.isZero();

  case LSRUse::Special:
    // Special case Basic to handle -1 scales.
    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}
1858
// NOTE(review): the first line of this definition appears to be missing from
// this extract (presumably `static bool isAMCompletelyFolded(const
// TargetTransformInfo &TTI,`) — verify against upstream.
                                 Immediate MinOffset, Immediate MaxOffset,
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                 bool HasBaseReg, int64_t Scale) {
  // Mixing fixed and scalable offsets is never foldable.
  if (BaseOffset.isNonZero() &&
      (BaseOffset.isScalable() != MinOffset.isScalable() ||
       BaseOffset.isScalable() != MaxOffset.isScalable()))
    return false;
  // Check for overflow.
  int64_t Base = BaseOffset.getKnownMinValue();
  int64_t Min = MinOffset.getKnownMinValue();
  int64_t Max = MaxOffset.getKnownMinValue();
  if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
    return false;
  MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
  if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
    return false;
  MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());

  // The addressing mode must be foldable at both offset extremes; anything
  // between then folds as well.
  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
                              HasBaseReg, Scale) &&
         isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
                              HasBaseReg, Scale);
}
1884
1886 Immediate MinOffset, Immediate MaxOffset,
1887 LSRUse::KindType Kind, MemAccessTy AccessTy,
1888 const Formula &F, const Loop &L) {
1889 // For the purpose of isAMCompletelyFolded either having a canonical formula
1890 // or a scale not equal to zero is correct.
1891 // Problems may arise from non canonical formulae having a scale == 0.
1892 // Strictly speaking it would best to just rely on canonical formulae.
1893 // However, when we generate the scaled formulae, we first check that the
1894 // scaling factor is profitable before computing the actual ScaledReg for
1895 // compile time sake.
1896 assert((F.isCanonical(L) || F.Scale != 0));
  // Unpack the formula's components and defer to the range-checking overload.
1897 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1898 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1899}
1900
1901/// Test whether we know how to expand the current formula.
1902static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1903 Immediate MaxOffset, LSRUse::KindType Kind,
1904 MemAccessTy AccessTy, GlobalValue *BaseGV,
1905 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1906 // We know how to expand completely foldable formulae.
1907 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1908 BaseOffset, HasBaseReg, Scale) ||
1909 // Or formulae that use a base register produced by a sum of base
1910 // registers.
1911 (Scale == 1 &&
1912 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1913 BaseGV, BaseOffset, true, 0));
1914}
1915
1916static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1917 Immediate MaxOffset, LSRUse::KindType Kind,
1918 MemAccessTy AccessTy, const Formula &F) {
1919 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1920 F.BaseOffset, F.HasBaseReg, F.Scale);
1921}
1922
1924 Immediate Offset) {
  // Scalable offsets (multiples of vscale) have a dedicated TTI legality
  // hook; fixed offsets use the ordinary add-immediate query.
1925 if (Offset.isScalable())
1926 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1927
1928 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1929}
1930
1932 const LSRUse &LU, const Formula &F) {
1933 // Target may want to look at the user instructions.
1934 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
    // Every fixup's effective offset (formula offset plus the fixup's own
    // delta) must fold for that fixup's specific user instruction.
1935 for (const LSRFixup &Fixup : LU.Fixups)
1936 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1937 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1938 F.Scale, Fixup.UserInst))
1939 return false;
1940 return true;
1941 }
1942
  // Otherwise a single range check over [MinOffset, MaxOffset] suffices.
1943 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1944 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1945 F.Scale);
1946}
1947
1949 const LSRUse &LU, const Formula &F,
1950 const Loop &L) {
  // No scaled register means there is no scaling cost to account for.
1951 if (!F.Scale)
1952 return 0;
1953
1954 // If the use is not completely folded in that instruction, we will have to
1955 // pay an extra cost only for scale != 1.
1956 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1957 LU.AccessTy, F, L))
1958 return F.Scale != 1;
1959
1960 switch (LU.Kind) {
1961 case LSRUse::Address: {
1962 // Check the scaling factor cost with both the min and max offsets.
    // Only one of the scalable/fixed pairs is populated, matching the
    // kind of F.BaseOffset; the other pair stays zero.
1963 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1964 if (F.BaseOffset.isScalable()) {
1965 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1966 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1967 } else {
1968 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1969 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1970 }
1971 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1972 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1973 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1974 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1975 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1976 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1977
1978 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1979 "Legal addressing mode has an illegal cost!");
    // Be conservative: charge the worse of the two endpoint costs.
1980 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1981 }
1982 case LSRUse::ICmpZero:
1983 case LSRUse::Basic:
1984 case LSRUse::Special:
1985 // The use is completely folded, i.e., everything is folded into the
1986 // instruction.
1987 return 0;
1988 }
1989
1990 llvm_unreachable("Invalid LSRUse Kind!");
1991}
1992
1994 LSRUse::KindType Kind, MemAccessTy AccessTy,
1995 GlobalValue *BaseGV, Immediate BaseOffset,
1996 bool HasBaseReg) {
1997 // Fast-path: zero is always foldable.
1998 if (BaseOffset.isZero() && !BaseGV)
1999 return true;
2000
2001 // Conservatively, create an address with an immediate and a
2002 // base and a scale.
  // ICmpZero uses fold a -1 scale (register moves to the other icmp operand).
2003 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2004
2005 // Canonicalize a scale of 1 to a base register if the formula doesn't
2006 // already have a base register.
2007 if (!HasBaseReg && Scale == 1) {
2008 Scale = 0;
2009 HasBaseReg = true;
2010 }
2011
2012 // FIXME: Try with + without a scale? Maybe based on TTI?
2013 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2014 // default for many architectures, not just AArch64 SVE. More investigation
2015 // needed later to determine if this should be used more widely than just
2016 // on scalable types.
2017 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2018 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2019 Scale = 0;
2020
2021 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2022 HasBaseReg, Scale);
2023}
2024
2026 ScalarEvolution &SE, Immediate MinOffset,
2027 Immediate MaxOffset, LSRUse::KindType Kind,
2028 MemAccessTy AccessTy, const SCEV *S,
2029 bool HasBaseReg) {
2030 // Fast-path: zero is always foldable.
2031 if (S->isZero()) return true;
2032
2033 // Conservatively, create an address with an immediate and a
2034 // base and a scale.
  // Peel the immediate and symbol off a working copy of the expression.
2035 SCEVUse SCopy = S;
2036 Immediate BaseOffset = ExtractImmediate(SCopy, SE);
2037 GlobalValue *BaseGV = ExtractSymbol(SCopy, SE);
2038
2039 // If there's anything else involved, it's not foldable.
2040 if (!SCopy->isZero())
2041 return false;
2042
2043 // Fast-path: zero is always foldable.
2044 if (BaseOffset.isZero() && !BaseGV)
2045 return true;
2046
  // Conservatively reject scalable immediates here.
2047 if (BaseOffset.isScalable())
2048 return false;
2049
2050 // Conservatively, create an address with an immediate and a
2051 // base and a scale.
2052 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2053
2054 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2055 BaseOffset, HasBaseReg, Scale);
2056}
2057
2058namespace {
2059
2060/// An individual increment in a Chain of IV increments. Relate an IV user to
2061/// an expression that computes the IV it uses from the IV used by the previous
2062/// link in the Chain.
2063///
2064/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2065/// original IVOperand. The head of the chain's IVOperand is only valid during
2066/// chain collection, before LSR replaces IV users. During chain generation,
2067/// IncExpr can be used to find the new IVOperand that computes the same
2068/// expression.
2069struct IVInc {
  // The instruction that consumes the IV value at this link.
2070 Instruction *UserInst;
  // The IV-derived operand of UserInst (see the validity caveat for chain
  // heads in the comment above).
2071 Value* IVOperand;
  // SCEV increment relative to the previous link (absolute for the head).
2072 const SCEV *IncExpr;
2073
2074 IVInc(Instruction *U, Value *O, const SCEV *E)
2075 : UserInst(U), IVOperand(O), IncExpr(E) {}
2076};
2077
2078// The list of IV increments in program order. We typically add the head of a
2079// chain without finding subsequent links.
2080struct IVChain {
  // SCEV expression the chain's increments are expressed relative to.
2082 const SCEV *ExprBase = nullptr;
2083
2084 IVChain() = default;
2085 IVChain(const IVInc &Head, const SCEV *Base)
2086 : Incs(1, Head), ExprBase(Base) {}
2087
2088 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2089
2090 // Return the first increment in the chain.
  // Note: iteration deliberately skips element 0, which is the chain head
  // rather than an increment (see hasIncs below).
2091 const_iterator begin() const {
2092 assert(!Incs.empty());
2093 return std::next(Incs.begin());
2094 }
2095 const_iterator end() const {
2096 return Incs.end();
2097 }
2098
2099 // Returns true if this chain contains any increments.
  // Size 1 means only the head has been recorded; >= 2 means real links exist.
2100 bool hasIncs() const { return Incs.size() >= 2; }
2101
2102 // Add an IVInc to the end of this chain.
2103 void add(const IVInc &X) { Incs.push_back(X); }
2104
2105 // Returns the last UserInst in the chain.
2106 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2107
2108 // Returns true if IncExpr can be profitably added to this chain.
2109 bool isProfitableIncrement(const SCEV *OperExpr,
2110 const SCEV *IncExpr,
2111 ScalarEvolution&);
2112};
2113
2114/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2115/// between FarUsers that definitely cross IV increments and NearUsers that may
2116/// be used between IV increments.
2117struct ChainUsers {
  // Users known to be separated from the chain by an IV increment.
2118 SmallPtrSet<Instruction*, 4> FarUsers;
  // Users that may execute between IV increments.
2119 SmallPtrSet<Instruction*, 4> NearUsers;
2120};
2121
2122/// This class holds state for the main loop strength reduction logic.
2123class LSRInstance {
  // Analyses and context this LSR instance operates over.
2124 IVUsers &IU;
2125 ScalarEvolution &SE;
2126 DominatorTree &DT;
2127 LoopInfo &LI;
2128 AssumptionCache &AC;
2129 TargetLibraryInfo &TLI;
2130 const TargetTransformInfo &TTI;
2131 Loop *const L;
2132 MemorySSAUpdater *MSSAU;
  // SCEV-to-IR expander; mutable so it can be used from const member functions.
2134 mutable SCEVExpander Rewriter;
2135 bool Changed = false;
2136 bool HardwareLoopProfitable = false;
2137
2138 /// This is the insert position that the current loop's induction variable
2139 /// increment should be placed. In simple loops, this is the latch block's
2140 /// terminator. But in more complicated cases, this is a position which will
2141 /// dominate all the in-loop post-increment users.
2142 Instruction *IVIncInsertPos = nullptr;
2143
2144 /// Interesting factors between use strides.
2145 ///
2146 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2147 /// default, a SmallDenseSet, because we need to use the full range of
2148 /// int64_ts, and there's currently no good way of doing that with
2149 /// SmallDenseSet.
2150 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2151
2152 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2153 /// the solution is not profitable.
2154 Cost BaselineCost;
2155
2156 /// Interesting use types, to facilitate truncation reuse.
2157 SmallSetVector<Type *, 4> Types;
2158
2159 /// The list of interesting uses.
2161
2162 /// Track which uses use which register candidates.
2163 RegUseTracker RegUses;
2164
2165 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2166 // have more than a few IV increment chains in a loop. Missing a Chain falls
2167 // back to normal LSR behavior for those uses.
2168 static const unsigned MaxChains = 8;
2169
2170 /// IV users can form a chain of IV increments.
2172
2173 /// IV users that belong to profitable IVChains.
2174 SmallPtrSet<Use*, MaxChains> IVIncSet;
2175
2176 /// Induction variables that were generated and inserted by the SCEV Expander.
2177 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2178
2179 // Inserting instructions in the loop and using them as PHI's input could
2180 // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
2181 // corresponding incoming block is not loop exiting). So collect all such
2182 // instructions to form LCSSA for them later.
2183 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2184
  // Early IR-level optimizations performed before the main LSR solve.
2185 void OptimizeShadowIV();
2186 bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2187 Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
2188 void OptimizeLoopTermCond();
2189
  // IV-increment chain collection and code generation.
2190 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2191 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2192 void FinalizeChain(IVChain &Chain);
2193 void CollectChains();
2194 void GenerateIVChain(const IVChain &Chain,
2195 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2196
2197 void CollectInterestingTypesAndFactors();
2198 void CollectFixupsAndInitialFormulae();
2199
2200 // Support for sharing of LSRUses between LSRFixups.
2201 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2202 UseMapTy UseMap;
2203
2204 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2205 LSRUse::KindType Kind, MemAccessTy AccessTy);
2206
2207 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2208 MemAccessTy AccessTy);
2209
2210 void DeleteUse(LSRUse &LU, size_t LUIdx);
2211
2212 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2213
2214 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2215 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2216 void CountRegisters(const Formula &F, size_t LUIdx);
2217 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2218 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2219
2220 void CollectLoopInvariantFixupsAndFormulae();
2221
  // Formula-generation passes that populate each use's candidate set.
2222 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2223 unsigned Depth = 0);
2224
2225 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2226 const Formula &Base, unsigned Depth,
2227 size_t Idx, bool IsScaledReg = false);
2228 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2229 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2230 const Formula &Base, size_t Idx,
2231 bool IsScaledReg = false);
2232 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2233 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2234 const Formula &Base,
2235 const SmallVectorImpl<Immediate> &Worklist,
2236 size_t Idx, bool IsScaledReg = false);
2237 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2238 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2239 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2240 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2241 void GenerateCrossUseConstantOffsets();
2242 void GenerateAllReuseFormulae();
2243
2244 void FilterOutUndesirableDedicatedRegisters();
2245
  // Heuristics that prune the candidate-formula search space before solving.
2246 size_t EstimateSearchSpaceComplexity() const;
2247 void NarrowSearchSpaceByDetectingSupersets();
2248 void NarrowSearchSpaceByCollapsingUnrolledCode();
2249 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2250 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2251 void NarrowSearchSpaceByFilterPostInc();
2252 void NarrowSearchSpaceByDeletingCostlyFormulas();
2253 void NarrowSearchSpaceByPickingWinnerRegs();
2254 void NarrowSearchSpaceUsingHeuristics();
2255
2256 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2257 Cost &SolutionCost,
2258 SmallVectorImpl<const Formula *> &Workspace,
2259 const Cost &CurCost,
2260 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2261 DenseSet<const SCEV *> &VisitedRegs) const;
2262 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2263
2265 HoistInsertPosition(BasicBlock::iterator IP,
2266 const SmallVectorImpl<Instruction *> &Inputs) const;
2267 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2268 const LSRFixup &LF,
2269 const LSRUse &LU) const;
2270
  // Rewriting: materialize the chosen formulae and replace the original uses.
2271 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2273 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2274 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2275 const Formula &F,
2276 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2277 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2278 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2279 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2280
2281public:
2282 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2283 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2284 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2285
2286 bool getChanged() const { return Changed; }
2287 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2288 return ScalarEvolutionIVs;
2289 }
2290
2291 void print_factors_and_types(raw_ostream &OS) const;
2292 void print_fixups(raw_ostream &OS) const;
2293 void print_uses(raw_ostream &OS) const;
2294 void print(raw_ostream &OS) const;
2295 void dump() const;
2296};
2297
2298} // end anonymous namespace
2299
2300/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2301/// the cast operation.
2302void LSRInstance::OptimizeShadowIV() {
2303 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2304 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2305 return;
2306
  // Advance UI before inspecting CandidateUI, since a successful rewrite
  // erases the candidate's user instruction.
2307 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2308 UI != E; /* empty */) {
2309 IVUsers::const_iterator CandidateUI = UI;
2310 ++UI;
2311 Instruction *ShadowUse = CandidateUI->getUser();
2312 Type *DestTy = nullptr;
2313 bool IsSigned = false;
2314
2315 /* If shadow use is a int->float cast then insert a second IV
2316 to eliminate this cast.
2317
2318 for (unsigned i = 0; i < n; ++i)
2319 foo((double)i);
2320
2321 is transformed into
2322
2323 double d = 0.0;
2324 for (unsigned i = 0; i < n; ++i, ++d)
2325 foo(d);
2326 */
2327 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2328 IsSigned = false;
2329 DestTy = UCast->getDestTy();
2330 }
2331 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2332 IsSigned = true;
2333 DestTy = SCast->getDestTy();
2334 }
2335 if (!DestTy) continue;
2336
2337 // If target does not support DestTy natively then do not apply
2338 // this transformation.
2339 if (!TTI.isTypeLegal(DestTy)) continue;
2340
2341 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2342 if (!PH) continue;
2343 if (PH->getNumIncomingValues() != 2) continue;
2344
2345 // If the calculation in integers overflows, the result in FP type will
2346 // differ. So we only can do this transformation if we are guaranteed to not
2347 // deal with overflowing values
2348 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2349 if (!AR) continue;
2350 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2351 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2352
  // The FP destination type must be able to represent every value of the
  // integer IV exactly, i.e. the IV must fit in the mantissa.
2353 Type *SrcTy = PH->getType();
2354 int Mantissa = DestTy->getFPMantissaWidth();
2355 if (Mantissa == -1) continue;
2356 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2357 continue;
2358
  // Identify which incoming value is the loop entry and which is the latch.
2359 unsigned Entry, Latch;
2360 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2361 Entry = 0;
2362 Latch = 1;
2363 } else {
2364 Entry = 1;
2365 Latch = 0;
2366 }
2367
2368 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2369 if (!Init) continue;
2370 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2371 (double)Init->getSExtValue() :
2372 (double)Init->getZExtValue());
2373
2374 BinaryOperator *Incr =
2376 if (!Incr) continue;
2377 if (Incr->getOpcode() != Instruction::Add
2378 && Incr->getOpcode() != Instruction::Sub)
2379 continue;
2380
2381 /* Initialize new IV, double d = 0.0 in above example. */
2382 ConstantInt *C = nullptr;
2383 if (Incr->getOperand(0) == PH)
2385 else if (Incr->getOperand(1) == PH)
2387 else
2388 continue;
2389
2390 if (!C) continue;
2391
2392 // Ignore negative constants, as the code below doesn't handle them
2393 // correctly. TODO: Remove this restriction.
2394 if (!C->getValue().isStrictlyPositive())
2395 continue;
2396
2397 /* Add new PHINode. */
2398 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2399 NewPH->setDebugLoc(PH->getDebugLoc());
2400
2401 /* create new increment. '++d' in above example. */
2402 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2403 BinaryOperator *NewIncr = BinaryOperator::Create(
2404 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2405 : Instruction::FSub,
2406 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2407 NewIncr->setDebugLoc(Incr->getDebugLoc());
2408
2409 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2410 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2411
2412 /* Remove cast operation */
2413 ShadowUse->replaceAllUsesWith(NewPH);
2414 ShadowUse->eraseFromParent();
2415 Changed = true;
  // Only one shadow IV is introduced per invocation.
2416 break;
2417 }
2418}
2419
2420/// If Cond has an operand that is an expression of an IV, set the IV user and
2421/// stride information and return true, otherwise return false.
2422bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2423 for (IVStrideUse &U : IU)
2424 if (U.getUser() == Cond) {
2425 // NOTE: we could handle setcc instructions with multiple uses here, but
2426 // InstCombine does it as well for simple uses, it's not clear that it
2427 // occurs enough in real life to handle.
2428 CondUse = &U;
2429 return true;
2430 }
2431 return false;
2432}
2433
2434/// Rewrite the loop's terminating condition if it uses a max computation.
2435///
2436/// This is a narrow solution to a specific, but acute, problem. For loops
2437/// like this:
2438///
2439/// i = 0;
2440/// do {
2441/// p[i] = 0.0;
2442/// } while (++i < n);
2443///
2444/// the trip count isn't just 'n', because 'n' might not be positive. And
2445/// unfortunately this can come up even for loops where the user didn't use
2446/// a C do-while loop. For example, seemingly well-behaved top-test loops
2447/// will commonly be lowered like this:
2448///
2449/// if (n > 0) {
2450/// i = 0;
2451/// do {
2452/// p[i] = 0.0;
2453/// } while (++i < n);
2454/// }
2455///
2456/// and then it's possible for subsequent optimization to obscure the if
2457/// test in such a way that indvars can't find it.
2458///
2459/// When indvars can't find the if test in loops like this, it creates a
2460/// max expression, which allows it to give the loop a canonical
2461/// induction variable:
2462///
2463/// i = 0;
2464/// max = n < 1 ? 1 : n;
2465/// do {
2466/// p[i] = 0.0;
2467/// } while (++i != max);
2468///
2469/// Canonical induction variables are necessary because the loop passes
2470/// are designed around them. The most obvious example of this is the
2471/// LoopInfo analysis, which doesn't remember trip count values. It
2472/// expects to be able to rediscover the trip count each time it is
2473/// needed, and it does this using a simple analysis that only succeeds if
2474/// the loop has a canonical induction variable.
2475///
2476/// However, when it comes time to generate code, the maximum operation
2477/// can be quite costly, especially if it's inside of an outer loop.
2478///
2479/// This function solves this problem by detecting this type of loop and
2480/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2481/// the instructions for the maximum computation.
///
/// Returns the (possibly new) loop-terminating compare instruction; the
/// original Cond is returned unchanged when the pattern does not match.
2482Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
2483 // Check that the loop matches the pattern we're looking for.
2484 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2485 Cond->getPredicate() != CmpInst::ICMP_NE)
2486 return Cond;
2487
2488 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2489 if (!Sel || !Sel->hasOneUse()) return Cond;
2490
2491 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2492 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2493 return Cond;
2494 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2495
2496 // Add one to the backedge-taken count to get the trip count.
2497 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2498 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2499
2500 // Check for a max calculation that matches the pattern. There's no check
2501 // for ICMP_ULE here because the comparison would be with zero, which
2502 // isn't interesting.
2503 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2504 const SCEVNAryExpr *Max = nullptr;
2505 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2506 Pred = ICmpInst::ICMP_SLE;
2507 Max = S;
2508 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2509 Pred = ICmpInst::ICMP_SLT;
2510 Max = S;
2511 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2512 Pred = ICmpInst::ICMP_ULT;
2513 Max = U;
2514 } else {
2515 // No match; bail.
2516 return Cond;
2517 }
2518
2519 // To handle a max with more than two operands, this optimization would
2520 // require additional checking and setup.
2521 if (Max->getNumOperands() != 2)
2522 return Cond;
2523
2524 const SCEV *MaxLHS = Max->getOperand(0);
2525 const SCEV *MaxRHS = Max->getOperand(1);
2526
2527 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2528 // for a comparison with 1. For <= and >=, a comparison with zero.
2529 if (!MaxLHS ||
2530 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2531 return Cond;
2532
2533 // Check the relevant induction variable for conformance to
2534 // the pattern.
2535 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2536 if (!match(IV,
2538 return Cond;
2539
2540 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2541 "Loop condition operand is an addrec in a different loop!");
2542
2543 // Check the right operand of the select, and remember it, as it will
2544 // be used in the new comparison instruction.
2545 Value *NewRHS = nullptr;
2546 if (ICmpInst::isTrueWhenEqual(Pred)) {
2547 // Look for n+1, and grab n.
2548 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2549 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2550 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2551 NewRHS = BO->getOperand(0);
2552 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2553 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2554 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2555 NewRHS = BO->getOperand(0);
2556 if (!NewRHS)
2557 return Cond;
2558 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2559 NewRHS = Sel->getOperand(1);
2560 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2561 NewRHS = Sel->getOperand(2);
2562 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2563 NewRHS = SU->getValue();
2564 else
2565 // Max doesn't match expected pattern.
2566 return Cond;
2567
2568 // Determine the new comparison opcode. It may be signed or unsigned,
2569 // and the original comparison may be either equality or inequality.
2570 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2571 Pred = CmpInst::getInversePredicate(Pred);
2572
2573 // Ok, everything looks ok to change the condition into an SLT or SGE and
2574 // delete the max calculation.
2575 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2576 Cond->getOperand(0), NewRHS, "scmp");
2577
2578 // Delete the max calculation instructions.
2579 NewCond->setDebugLoc(Cond->getDebugLoc());
2580 Cond->replaceAllUsesWith(NewCond);
  // Keep the IVUsers record in sync with the replacement compare.
2581 CondUse->setUser(NewCond);
2583 Cond->eraseFromParent();
2584 Sel->eraseFromParent();
  // Erase the comparison that fed the select if nothing else uses it,
  // salvaging its debug info first.
2585 if (Cmp->use_empty()) {
2586 salvageDebugInfo(*Cmp);
2587 Cmp->eraseFromParent();
2588 }
2589 return NewCond;
2590}
2591
2592/// Change loop terminating condition to use the postinc iv when possible.
2593void
2594LSRInstance::OptimizeLoopTermCond() {
2595 SmallPtrSet<Instruction *, 4> PostIncs;
2596
2597 // We need a different set of heuristics for rotated and non-rotated loops.
2598 // If a loop is rotated then the latch is also the backedge, so inserting
2599 // post-inc expressions just before the latch is ideal. To reduce live ranges
2600 // it also makes sense to rewrite terminating conditions to use post-inc
2601 // expressions.
2602 //
2603 // If the loop is not rotated then the latch is not a backedge; the latch
2604 // check is done in the loop head. Adding post-inc expressions before the
2605 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2606 // in the loop body. In this case we do *not* want to use post-inc expressions
2607 // in the latch check, and we want to insert post-inc expressions before
2608 // the backedge.
2609 BasicBlock *LatchBlock = L->getLoopLatch();
2610 SmallVector<BasicBlock*, 8> ExitingBlocks;
2611 L->getExitingBlocks(ExitingBlocks);
2612 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2613 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2614 IVIncInsertPos = LatchBlock->getTerminator();
2615 return;
2616 }
2617
2618 // Otherwise treat this as a rotated loop.
2619 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2620 // Get the terminating condition for the loop if possible. If we
2621 // can, we want to change it to use a post-incremented version of its
2622 // induction variable, to allow coalescing the live ranges for the IV into
2623 // one register value.
2624
2625 CondBrInst *TermBr = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
2626 if (!TermBr)
2627 continue;
2628
2630 // If the argument to TermBr is an extractelement, then the source of that
2631 // instruction is what's generated the condition.
2633 if (Extract)
2634 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2635 // FIXME: We could do more here, like handling logical operations where one
2636 // side is a cmp that uses an induction variable.
2637 if (!Cond)
2638 continue;
2639
2640 // Search IVUsesByStride to find Cond's IVUse if there is one.
2641 IVStrideUse *CondUse = nullptr;
2642 if (!FindIVUserForCond(Cond, CondUse))
2643 continue;
2644
2645 // If the trip count is computed in terms of a max (due to ScalarEvolution
2646 // being unable to find a sufficient guard, for example), change the loop
2647 // comparison to use SLT or ULT instead of NE.
2648 // One consequence of doing this now is that it disrupts the count-down
2649 // optimization. That's not always a bad thing though, because in such
2650 // cases it may still be worthwhile to avoid a max.
2651 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2652 Cond = OptimizeMax(Cmp, CondUse);
2653
2654 // If this exiting block dominates the latch block, it may also use
2655 // the post-inc value if it won't be shared with other uses.
2656 // Check for dominance.
2657 if (!DT.dominates(ExitingBlock, LatchBlock))
2658 continue;
2659
2660 // Conservatively avoid trying to use the post-inc value in non-latch
2661 // exits if there may be pre-inc users in intervening blocks.
2662 if (LatchBlock != ExitingBlock)
2663 for (const IVStrideUse &UI : IU)
2664 // Test if the use is reachable from the exiting block. This dominator
2665 // query is a conservative approximation of reachability.
2666 if (&UI != CondUse &&
2667 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2668 // Conservatively assume there may be reuse if the quotient of their
2669 // strides could be a legal scale.
2670 const SCEV *A = IU.getStride(*CondUse, L);
2671 const SCEV *B = IU.getStride(UI, L);
2672 if (!A || !B) continue;
2673 if (SE.getTypeSizeInBits(A->getType()) !=
2674 SE.getTypeSizeInBits(B->getType())) {
2675 if (SE.getTypeSizeInBits(A->getType()) >
2676 SE.getTypeSizeInBits(B->getType()))
2677 B = SE.getSignExtendExpr(B, A->getType());
2678 else
2679 A = SE.getSignExtendExpr(A, B->getType());
2680 }
2681 if (const SCEVConstant *D =
2683 const ConstantInt *C = D->getValue();
2684 // Stride of one or negative one can have reuse with non-addresses.
2685 if (C->isOne() || C->isMinusOne())
2686 goto decline_post_inc;
2687 // Avoid weird situations.
2688 if (C->getValue().getSignificantBits() >= 64 ||
2689 C->getValue().isMinSignedValue())
2690 goto decline_post_inc;
2691 // Check for possible scaled-address reuse.
2692 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2693 MemAccessTy AccessTy =
2694 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2695 int64_t Scale = C->getSExtValue();
2696 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2697 /*BaseOffset=*/0,
2698 /*HasBaseReg=*/true, Scale,
2699 AccessTy.AddrSpace))
2700 goto decline_post_inc;
2701 Scale = -Scale;
2702 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2703 /*BaseOffset=*/0,
2704 /*HasBaseReg=*/true, Scale,
2705 AccessTy.AddrSpace))
2706 goto decline_post_inc;
2707 }
2708 }
2709 }
2710
2711 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2712 << *Cond << '\n');
2713
2714 // It's possible for the setcc instruction to be anywhere in the loop, and
2715 // possible for it to have multiple users. If it is not immediately before
2716 // the exiting block branch, move it.
2717 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2718 !Extract) {
2719 if (Cond->hasOneUse()) {
2720 Cond->moveBefore(TermBr->getIterator());
2721 } else {
2722 // Clone the terminating condition and insert into the loopend.
2723 Instruction *OldCond = Cond;
2724 Cond = Cond->clone();
2725 Cond->setName(L->getHeader()->getName() + ".termcond");
2726 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2727
2728 // Clone the IVUse, as the old use still exists!
2729 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2730 TermBr->replaceUsesOfWith(OldCond, Cond);
2731 }
2732 }
2733
2734 // If we get to here, we know that we can transform the setcc instruction to
2735 // use the post-incremented version of the IV, allowing us to coalesce the
2736 // live ranges for the IV correctly.
2737 CondUse->transformToPostInc(L);
2738 Changed = true;
2739
2740 PostIncs.insert(Cond);
2741 decline_post_inc:;
2742 }
2743
2744 // Determine an insertion point for the loop induction variable increment. It
2745 // must dominate all the post-inc comparisons we just set up, and it must
2746 // dominate the loop latch edge.
2747 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2748 for (Instruction *Inst : PostIncs)
2749 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2750}
2751
2752/// Determine if the given use can accommodate a fixup at the given offset and
2753/// other details. If so, update the use and return true.
2754bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2755 bool HasBaseReg, LSRUse::KindType Kind,
2756 MemAccessTy AccessTy) {
2757 Immediate NewMinOffset = LU.MinOffset;
2758 Immediate NewMaxOffset = LU.MaxOffset;
2759 MemAccessTy NewAccessTy = AccessTy;
2760
2761 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2762 // something conservative, however this can pessimize in the case that one of
2763 // the uses will have all its uses outside the loop, for example.
2764 if (LU.Kind != Kind)
2765 return false;
2766
2767 // Check for a mismatched access type, and fall back conservatively as needed.
2768 // TODO: Be less conservative when the type is similar and can use the same
2769 // addressing modes.
2770 if (Kind == LSRUse::Address) {
2771 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2772 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2773 AccessTy.AddrSpace);
2774 }
2775 }
2776
2777 // Conservatively assume HasBaseReg is true for now.
2778 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2779 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2780 LU.MaxOffset - NewOffset, HasBaseReg))
2781 return false;
2782 NewMinOffset = NewOffset;
2783 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2784 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2785 NewOffset - LU.MinOffset, HasBaseReg))
2786 return false;
2787 NewMaxOffset = NewOffset;
2788 }
2789
2790 // FIXME: We should be able to handle some level of scalable offset support
2791 // for 'void', but in order to get basic support up and running this is
2792 // being left out.
2793 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2794 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2795 return false;
2796
2797 // Update the use.
2798 LU.MinOffset = NewMinOffset;
2799 LU.MaxOffset = NewMaxOffset;
2800 LU.AccessTy = NewAccessTy;
2801 return true;
2802}
2803
2804/// Return an LSRUse index and an offset value for a fixup which needs the given
2805/// expression, with the given kind and optional access type. Either reuse an
2806/// existing use or create a new one, as needed.
2807std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2808 LSRUse::KindType Kind,
2809 MemAccessTy AccessTy) {
2810 const SCEV *Copy = Expr;
2811 SCEVUse ExprUse = Expr;
2812 Immediate Offset = ExtractImmediate(ExprUse, SE);
2813 Expr = ExprUse;
2814
2815 // Basic uses can't accept any offset, for example.
2816 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2817 Offset, /*HasBaseReg=*/ true)) {
2818 Expr = Copy;
2819 Offset = Immediate::getFixed(0);
2820 }
2821
2822 std::pair<UseMapTy::iterator, bool> P =
2823 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2824 if (!P.second) {
2825 // A use already existed with this base.
2826 size_t LUIdx = P.first->second;
2827 LSRUse &LU = Uses[LUIdx];
2828 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2829 // Reuse this use.
2830 return std::make_pair(LUIdx, Offset);
2831 }
2832
2833 // Create a new use.
2834 size_t LUIdx = Uses.size();
2835 P.first->second = LUIdx;
2836 Uses.push_back(LSRUse(Kind, AccessTy));
2837 LSRUse &LU = Uses[LUIdx];
2838
2839 LU.MinOffset = Offset;
2840 LU.MaxOffset = Offset;
2841 return std::make_pair(LUIdx, Offset);
2842}
2843
2844/// Delete the given use from the Uses list.
2845void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2846 if (&LU != &Uses.back())
2847 std::swap(LU, Uses.back());
2848 Uses.pop_back();
2849
2850 // Update RegUses.
2851 RegUses.swapAndDropUse(LUIdx, Uses.size());
2852}
2853
2854/// Look for a use distinct from OrigLU which is has a formula that has the same
2855/// registers as the given formula.
2856LSRUse *
2857LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2858 const LSRUse &OrigLU) {
2859 // Search all uses for the formula. This could be more clever.
2860 for (LSRUse &LU : Uses) {
2861 // Check whether this use is close enough to OrigLU, to see whether it's
2862 // worthwhile looking through its formulae.
2863 // Ignore ICmpZero uses because they may contain formulae generated by
2864 // GenerateICmpZeroScales, in which case adding fixup offsets may
2865 // be invalid.
2866 if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero &&
2867 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2868 LU.HasFormulaWithSameRegs(OrigF)) {
2869 // Scan through this use's formulae.
2870 for (const Formula &F : LU.Formulae) {
2871 // Check to see if this formula has the same registers and symbols
2872 // as OrigF.
2873 if (F.BaseRegs == OrigF.BaseRegs &&
2874 F.ScaledReg == OrigF.ScaledReg &&
2875 F.BaseGV == OrigF.BaseGV &&
2876 F.Scale == OrigF.Scale &&
2877 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2878 if (F.BaseOffset.isZero())
2879 return &LU;
2880 // This is the formula where all the registers and symbols matched;
2881 // there aren't going to be any others. Since we declined it, we
2882 // can skip the rest of the formulae and proceed to the next LSRUse.
2883 break;
2884 }
2885 }
2886 }
2887 }
2888
2889 // Nothing looked good.
2890 return nullptr;
2891}
2892
2893void LSRInstance::CollectInterestingTypesAndFactors() {
2894 SmallSetVector<const SCEV *, 4> Strides;
2895
2896 // Collect interesting types and strides.
2898 for (const IVStrideUse &U : IU) {
2899 const SCEV *Expr = IU.getExpr(U);
2900 if (!Expr)
2901 continue;
2902
2903 // Collect interesting types.
2904 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2905
2906 // Add strides for mentioned loops.
2907 Worklist.push_back(Expr);
2908 do {
2909 const SCEV *S = Worklist.pop_back_val();
2910 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2911 if (AR->getLoop() == L)
2912 Strides.insert(AR->getStepRecurrence(SE));
2913 Worklist.push_back(AR->getStart());
2914 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2915 append_range(Worklist, Add->operands());
2916 }
2917 } while (!Worklist.empty());
2918 }
2919
2920 // Compute interesting factors from the set of interesting strides.
2921 for (SmallSetVector<const SCEV *, 4>::const_iterator
2922 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2923 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2924 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2925 const SCEV *OldStride = *I;
2926 const SCEV *NewStride = *NewStrideIter;
2927
2928 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2929 SE.getTypeSizeInBits(NewStride->getType())) {
2930 if (SE.getTypeSizeInBits(OldStride->getType()) >
2931 SE.getTypeSizeInBits(NewStride->getType()))
2932 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2933 else
2934 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2935 }
2936 if (const SCEVConstant *Factor =
2937 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2938 SE, true))) {
2939 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2940 Factors.insert(Factor->getAPInt().getSExtValue());
2941 } else if (const SCEVConstant *Factor =
2943 NewStride,
2944 SE, true))) {
2945 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2946 Factors.insert(Factor->getAPInt().getSExtValue());
2947 }
2948 }
2949
2950 // If all uses use the same type, don't bother looking for truncation-based
2951 // reuse.
2952 if (Types.size() == 1)
2953 Types.clear();
2954
2955 LLVM_DEBUG(print_factors_and_types(dbgs()));
2956}
2957
2958/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2959/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2960/// IVStrideUses, we could partially skip this.
2961static User::op_iterator
2963 Loop *L, ScalarEvolution &SE) {
2964 for(; OI != OE; ++OI) {
2965 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2966 if (!SE.isSCEVable(Oper->getType()))
2967 continue;
2968
2969 if (const SCEVAddRecExpr *AR =
2971 if (AR->getLoop() == L)
2972 break;
2973 }
2974 }
2975 }
2976 return OI;
2977}
2978
2979/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2980/// a convenient helper.
2982 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2983 return Trunc->getOperand(0);
2984 return Oper;
2985}
2986
/// Return an approximation of this SCEV expression's "base", or NULL for any
/// constant. Returning the expression itself is conservative. Returning a
/// deeper subexpression is more precise and valid as long as it isn't less
/// complex than another subexpression. For expressions involving multiple
/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
/// IVInc==b-a.
///
/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
/// SCEVUnknown, we simply return the rightmost SCEV operand.
static const SCEV *getExprBase(const SCEV *S) {
  switch (S->getSCEVType()) {
  default: // including scUnknown.
    return S;
  case scConstant:
  case scVScale:
    // Constants have no base.
    return nullptr;
  case scTruncate:
    // Casts are transparent: the base of a cast is the base of its operand.
    return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
  case scZeroExtend:
    return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
  case scSignExtend:
    return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
  case scAddExpr: {
    // Skip over scaled operands (scMulExpr) to follow add operands as long as
    // there's nothing more complex.
    // FIXME: not sure if we want to recognize negation.
    const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
    // Operands are sorted by complexity, so walk from the right (most complex
    // end) looking for the first unscaled operand.
    for (const SCEV *SubExpr : reverse(Add->operands())) {
      if (SubExpr->getSCEVType() == scAddExpr)
        return getExprBase(SubExpr);

      if (SubExpr->getSCEVType() != scMulExpr)
        return SubExpr;
    }
    return S; // all operands are scaled, be conservative.
  }
  case scAddRecExpr:
    // The base of a recurrence is the base of its start value.
    return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
  }
  llvm_unreachable("Unknown SCEV kind!");
}
3029
3030/// Return true if the chain increment is profitable to expand into a loop
3031/// invariant value, which may require its own register. A profitable chain
3032/// increment will be an offset relative to the same base. We allow such offsets
3033/// to potentially be used as chain increment as long as it's not obviously
3034/// expensive to expand using real instructions.
3035bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3036 const SCEV *IncExpr,
3037 ScalarEvolution &SE) {
3038 // Aggressively form chains when -stress-ivchain.
3039 if (StressIVChain)
3040 return true;
3041
3042 // Do not replace a constant offset from IV head with a nonconstant IV
3043 // increment.
3044 if (!isa<SCEVConstant>(IncExpr)) {
3045 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3046 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3047 return false;
3048 }
3049
3050 SmallPtrSet<const SCEV*, 8> Processed;
3051 return !isHighCostExpansion(IncExpr, Processed, SE);
3052}
3053
3054/// Return true if the number of registers needed for the chain is estimated to
3055/// be less than the number required for the individual IV users. First prohibit
3056/// any IV users that keep the IV live across increments (the Users set should
3057/// be empty). Next count the number and type of increments in the chain.
3058///
3059/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3060/// effectively use postinc addressing modes. Only consider it profitable it the
3061/// increments can be computed in fewer registers when chained.
3062///
3063/// TODO: Consider IVInc free if it's already used in another chains.
3064static bool isProfitableChain(IVChain &Chain,
3066 ScalarEvolution &SE,
3067 const TargetTransformInfo &TTI) {
3068 if (StressIVChain)
3069 return true;
3070
3071 if (!Chain.hasIncs())
3072 return false;
3073
3074 if (!Users.empty()) {
3075 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3076 for (Instruction *Inst
3077 : Users) { dbgs() << " " << *Inst << "\n"; });
3078 return false;
3079 }
3080 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3081
3082 // The chain itself may require a register, so initialize cost to 1.
3083 int cost = 1;
3084
3085 // A complete chain likely eliminates the need for keeping the original IV in
3086 // a register. LSR does not currently know how to form a complete chain unless
3087 // the header phi already exists.
3088 if (isa<PHINode>(Chain.tailUserInst())
3089 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3090 --cost;
3091 }
3092 const SCEV *LastIncExpr = nullptr;
3093 unsigned NumConstIncrements = 0;
3094 unsigned NumVarIncrements = 0;
3095 unsigned NumReusedIncrements = 0;
3096
3097 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3098 return true;
3099
3100 for (const IVInc &Inc : Chain) {
3101 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3102 return true;
3103 if (Inc.IncExpr->isZero())
3104 continue;
3105
3106 // Incrementing by zero or some constant is neutral. We assume constants can
3107 // be folded into an addressing mode or an add's immediate operand.
3108 if (isa<SCEVConstant>(Inc.IncExpr)) {
3109 ++NumConstIncrements;
3110 continue;
3111 }
3112
3113 if (Inc.IncExpr == LastIncExpr)
3114 ++NumReusedIncrements;
3115 else
3116 ++NumVarIncrements;
3117
3118 LastIncExpr = Inc.IncExpr;
3119 }
3120 // An IV chain with a single increment is handled by LSR's postinc
3121 // uses. However, a chain with multiple increments requires keeping the IV's
3122 // value live longer than it needs to be if chained.
3123 if (NumConstIncrements > 1)
3124 --cost;
3125
3126 // Materializing increment expressions in the preheader that didn't exist in
3127 // the original code may cost a register. For example, sign-extended array
3128 // indices can produce ridiculous increments like this:
3129 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3130 cost += NumVarIncrements;
3131
3132 // Reusing variable increments likely saves a register to hold the multiple of
3133 // the stride.
3134 cost -= NumReusedIncrements;
3135
3136 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3137 << "\n");
3138
3139 return cost < 0;
3140}
3141
/// Add this IV user to an existing chain or make it the head of a new chain.
/// Also updates the chain's near/far user bookkeeping in ChainUsersVec.
void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
  // When IVs are used as types of varying widths, they are generally converted
  // to a wider type with some uses remaining narrow under a (free) trunc.
  Value *const NextIV = getWideOperand(IVOper);
  const SCEV *const OperExpr = SE.getSCEV(NextIV);
  const SCEV *const OperExprBase = getExprBase(OperExpr);

  // Visit all existing chains. Check if its IVOper can be computed as a
  // profitable loop invariant increment from the last link in the Chain.
  unsigned ChainIdx = 0, NChains = IVChainVec.size();
  const SCEV *LastIncExpr = nullptr;
  for (; ChainIdx < NChains; ++ChainIdx) {
    IVChain &Chain = IVChainVec[ChainIdx];

    // Prune the solution space aggressively by checking that both IV operands
    // are expressions that operate on the same unscaled SCEVUnknown. This
    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
    // first avoids creating extra SCEV expressions.
    if (!StressIVChain && Chain.ExprBase != OperExprBase)
      continue;

    // Only chain operands of matching width.
    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
    if (PrevIV->getType() != NextIV->getType())
      continue;

    // A phi node terminates a chain.
    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
      continue;

    // The increment must be loop-invariant so it can be kept in a register.
    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
    if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
      continue;

    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
      LastIncExpr = IncExpr;
      break;
    }
  }
  // If we haven't found a chain, create a new one, unless we hit the max. Don't
  // bother for phi nodes, because they must be last in the chain.
  if (ChainIdx == NChains) {
    if (isa<PHINode>(UserInst))
      return;
    if (NChains >= MaxChains && !StressIVChain) {
      LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
      return;
    }
    LastIncExpr = OperExpr;
    // IVUsers may have skipped over sign/zero extensions. We don't currently
    // attempt to form chains involving extensions unless they can be hoisted
    // into this loop's AddRec.
    if (!isa<SCEVAddRecExpr>(LastIncExpr))
      return;
    ++NChains;
    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
                                 OperExprBase));
    ChainUsersVec.resize(NChains);
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
                      << ") IV=" << *LastIncExpr << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
                      << ") IV+" << *LastIncExpr << "\n");
    // Add this IV user to the end of the chain.
    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
  }
  IVChain &Chain = IVChainVec[ChainIdx];

  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
  // This chain's NearUsers become FarUsers once the chain grows by a nonzero
  // increment, since they now sit before an earlier point of the IV.
  if (!LastIncExpr->isZero()) {
    ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
    NearUsers.clear();
  }

  // All other uses of IVOperand become near uses of the chain.
  // We currently ignore intermediate values within SCEV expressions, assuming
  // they will eventually be used by the current chain, or can be computed
  // from one of the chain increments. To be more precise we could
  // transitively follow its user and only add leaf IV users to the set.
  for (User *U : IVOper->users()) {
    Instruction *OtherUse = dyn_cast<Instruction>(U);
    if (!OtherUse)
      continue;
    // Uses in the chain will no longer be uses if the chain is formed.
    // Include the head of the chain in this iteration (not Chain.begin()).
    IVChain::const_iterator IncIter = Chain.Incs.begin();
    IVChain::const_iterator IncEnd = Chain.Incs.end();
    for( ; IncIter != IncEnd; ++IncIter) {
      if (IncIter->UserInst == OtherUse)
        break;
    }
    if (IncIter != IncEnd)
      continue;

    // Skip non-leaf IV users; they're accounted for by IVUsers itself.
    if (SE.isSCEVable(OtherUse->getType())
        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
        && IU.isIVUserOrOperand(OtherUse)) {
      continue;
    }
    NearUsers.insert(OtherUse);
  }

  // Since this user is part of the chain, it's no longer considered a use
  // of the chain.
  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
}
3252
/// Populate the vector of Chains.
///
/// This decreases ILP at the architecture level. Targets with ample registers,
/// multiple memory ports, and no register renaming probably don't want
/// this. However, such targets should probably disable LSR altogether.
///
/// The job of LSR is to make a reasonable choice of induction variables across
/// the loop. Subsequent passes can easily "unchain" computation exposing more
/// ILP *within the loop* if the target wants it.
///
/// Finding the best IV chain is potentially a scheduling problem. Since LSR
/// will not reorder memory operations, it will recognize this as a chain, but
/// will generate redundant IV increments. Ideally this would be corrected later
/// by a smart scheduler:
///        = A[i]
///        = A[i+x]
/// A[i]   =
/// A[i+x] =
///
/// TODO: Walk the entire domtree within this loop, not just the path to the
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
  LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
  SmallVector<ChainUsers, 8> ChainUsersVec;

  // Build the latch-to-header path through the dominator tree.
  // NOTE(review): assumes the loop has a single latch — confirm this is
  // guaranteed before this point.
  SmallVector<BasicBlock *,8> LatchPath;
  BasicBlock *LoopHeader = L->getHeader();
  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
    LatchPath.push_back(Rung->getBlock());
  }
  LatchPath.push_back(LoopHeader);

  // Walk the instruction stream from the loop header to the loop latch.
  for (BasicBlock *BB : reverse(LatchPath)) {
    for (Instruction &I : *BB) {
      // Skip instructions that weren't seen by IVUsers analysis.
      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
        continue;

      // Skip ephemeral values, as they don't produce real code.
      if (IU.isEphemeral(&I))
        continue;

      // Ignore users that are part of a SCEV expression. This way we only
      // consider leaf IV Users. This effectively rediscovers a portion of
      // IVUsers analysis but in program order this time.
      if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
        continue;

      // Remove this instruction from any NearUsers set it may be in.
      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
           ChainIdx < NChains; ++ChainIdx) {
        ChainUsersVec[ChainIdx].NearUsers.erase(&I);
      }
      // Search for operands that can be chained. Each distinct IV operand of
      // this instruction is offered to ChainInstruction once.
      SmallPtrSet<Instruction*, 4> UniqueOperands;
      User::op_iterator IVOpEnd = I.op_end();
      User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
      while (IVOpIter != IVOpEnd) {
        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
        if (UniqueOperands.insert(IVOpInst).second)
          ChainInstruction(&I, IVOpInst, ChainUsersVec);
        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
      }
    } // Continue walking down the instructions.
  } // Continue walking down the domtree.
  // Visit phi backedges to determine if the chain can generate the IV postinc.
  for (PHINode &PN : L->getHeader()->phis()) {
    if (!SE.isSCEVable(PN.getType()))
      continue;

    Instruction *IncV =
        dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
    if (IncV)
      ChainInstruction(&PN, IncV, ChainUsersVec);
  }
  // Remove any unprofitable chains, compacting the profitable ones to the
  // front of IVChainVec.
  unsigned ChainIdx = 0;
  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
       UsersIdx < NChains; ++UsersIdx) {
    if (!isProfitableChain(IVChainVec[UsersIdx],
                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
      continue;
    // Preserve the chain at UsesIdx.
    if (ChainIdx != UsersIdx)
      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
    FinalizeChain(IVChainVec[ChainIdx]);
    ++ChainIdx;
  }
  IVChainVec.resize(ChainIdx);
}
3346
3347void LSRInstance::FinalizeChain(IVChain &Chain) {
3348 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3349 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3350
3351 for (const IVInc &Inc : Chain) {
3352 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3353 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3354 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3355 IVIncSet.insert(UseI);
3356 }
3357}
3358
3359/// Return true if the IVInc can be folded into an addressing mode.
3360static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3361 Value *Operand, const TargetTransformInfo &TTI) {
3362 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3363 Immediate IncOffset = Immediate::getZero();
3364 if (IncConst) {
3365 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3366 return false;
3367 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3368 } else {
3369 // Look for mul(vscale, constant), to detect a scalable offset.
3370 const APInt *C;
3371 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3372 C->getSignificantBits() > 64)
3373 return false;
3374 IncOffset = Immediate::getScalable(C->getSExtValue());
3375 }
3376
3377 if (!isAddressUse(TTI, UserInst, Operand))
3378 return false;
3379
3380 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3381 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3382 IncOffset, /*HasBaseReg=*/false))
3383 return false;
3384
3385 return true;
3386}
3387
3388/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3389/// user's operand from the previous IV user's operand.
3390void LSRInstance::GenerateIVChain(const IVChain &Chain,
3391 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3392 // Find the new IVOperand for the head of the chain. It may have been replaced
3393 // by LSR.
3394 const IVInc &Head = Chain.Incs[0];
3395 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3396 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3397 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3398 IVOpEnd, L, SE);
3399 Value *IVSrc = nullptr;
3400 while (IVOpIter != IVOpEnd) {
3401 IVSrc = getWideOperand(*IVOpIter);
3402
3403 // If this operand computes the expression that the chain needs, we may use
3404 // it. (Check this after setting IVSrc which is used below.)
3405 //
3406 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3407 // narrow for the chain, so we can no longer use it. We do allow using a
3408 // wider phi, assuming the LSR checked for free truncation. In that case we
3409 // should already have a truncate on this operand such that
3410 // getSCEV(IVSrc) == IncExpr.
3411 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3412 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3413 break;
3414 }
3415 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3416 }
3417 if (IVOpIter == IVOpEnd) {
3418 // Gracefully give up on this chain.
3419 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3420 return;
3421 }
3422 assert(IVSrc && "Failed to find IV chain source");
3423
3424 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3425 Type *IVTy = IVSrc->getType();
3426 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3427 const SCEV *LeftOverExpr = nullptr;
3428 const SCEV *Accum = SE.getZero(IntTy);
3430 Bases.emplace_back(Accum, IVSrc);
3431
3432 for (const IVInc &Inc : Chain) {
3433 Instruction *InsertPt = Inc.UserInst;
3434 if (isa<PHINode>(InsertPt))
3435 InsertPt = L->getLoopLatch()->getTerminator();
3436
3437 // IVOper will replace the current IV User's operand. IVSrc is the IV
3438 // value currently held in a register.
3439 Value *IVOper = IVSrc;
3440 if (!Inc.IncExpr->isZero()) {
3441 // IncExpr was the result of subtraction of two narrow values, so must
3442 // be signed.
3443 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3444 Accum = SE.getAddExpr(Accum, IncExpr);
3445 LeftOverExpr = LeftOverExpr ?
3446 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3447 }
3448
3449 // Look through each base to see if any can produce a nice addressing mode.
3450 bool FoundBase = false;
3451 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3452 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3453 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3454 if (!Remainder->isZero()) {
3455 Rewriter.clearPostInc();
3456 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3457 const SCEV *IVOperExpr =
3458 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3459 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3460 } else {
3461 IVOper = MapIVOper;
3462 }
3463
3464 FoundBase = true;
3465 break;
3466 }
3467 }
3468 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3469 // Expand the IV increment.
3470 Rewriter.clearPostInc();
3471 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3472 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3473 SE.getUnknown(IncV));
3474 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3475
3476 // If an IV increment can't be folded, use it as the next IV value.
3477 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3478 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3479 Bases.emplace_back(Accum, IVOper);
3480 IVSrc = IVOper;
3481 LeftOverExpr = nullptr;
3482 }
3483 }
3484 Type *OperTy = Inc.IVOperand->getType();
3485 if (IVTy != OperTy) {
3486 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3487 "cannot extend a chained IV");
3488 IRBuilder<> Builder(InsertPt);
3489 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3490 }
3491 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3492 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3493 DeadInsts.emplace_back(OperandIsInstr);
3494 }
3495 // If LSR created a new, wider phi, we may also replace its postinc. We only
3496 // do this if we also found a wide value for the head of the chain.
3497 if (isa<PHINode>(Chain.tailUserInst())) {
3498 for (PHINode &Phi : L->getHeader()->phis()) {
3499 if (Phi.getType() != IVSrc->getType())
3500 continue;
3502 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3503 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3504 continue;
3505 Value *IVOper = IVSrc;
3506 Type *PostIncTy = PostIncV->getType();
3507 if (IVTy != PostIncTy) {
3508 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3509 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3510 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3511 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3512 }
3513 Phi.replaceUsesOfWith(PostIncV, IVOper);
3514 DeadInsts.emplace_back(PostIncV);
3515 }
3516 }
3517}
3518
3519void LSRInstance::CollectFixupsAndInitialFormulae() {
3520 CondBrInst *ExitBranch = nullptr;
3521 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3522
3523 // For calculating baseline cost
3524 SmallPtrSet<const SCEV *, 16> Regs;
3525 DenseSet<const SCEV *> VisitedRegs;
3526 DenseSet<size_t> VisitedLSRUse;
3527
3528 for (const IVStrideUse &U : IU) {
3529 Instruction *UserInst = U.getUser();
3530 // Skip IV users that are part of profitable IV Chains.
3531 User::op_iterator UseI =
3532 find(UserInst->operands(), U.getOperandValToReplace());
3533 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3534 if (IVIncSet.count(UseI)) {
3535 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3536 continue;
3537 }
3538
3539 LSRUse::KindType Kind = LSRUse::Basic;
3540 MemAccessTy AccessTy;
3541 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3542 Kind = LSRUse::Address;
3543 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3544 }
3545
3546 const SCEV *S = IU.getExpr(U);
3547 if (!S)
3548 continue;
3549 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3550
3551 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3552 // (N - i == 0), and this allows (N - i) to be the expression that we work
3553 // with rather than just N or i, so we can consider the register
3554 // requirements for both N and i at the same time. Limiting this code to
3555 // equality icmps is not a problem because all interesting loops use
3556 // equality icmps, thanks to IndVarSimplify.
3557 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3558 // If CI can be saved in some target, like replaced inside hardware loop
3559 // in PowerPC, no need to generate initial formulae for it.
3560 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3561 continue;
3562 if (CI->isEquality()) {
3563 // Swap the operands if needed to put the OperandValToReplace on the
3564 // left, for consistency.
3565 Value *NV = CI->getOperand(1);
3566 if (NV == U.getOperandValToReplace()) {
3567 CI->setOperand(1, CI->getOperand(0));
3568 CI->setOperand(0, NV);
3569 NV = CI->getOperand(1);
3570 Changed = true;
3571 }
3572
3573 // x == y --> x - y == 0
3574 const SCEV *N = SE.getSCEV(NV);
3575 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3576 (!NV->getType()->isPointerTy() ||
3577 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3578 // S is normalized, so normalize N before folding it into S
3579 // to keep the result normalized.
3580 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3581 if (!N)
3582 continue;
3583 Kind = LSRUse::ICmpZero;
3584 S = SE.getMinusSCEV(N, S);
3585 } else if (L->isLoopInvariant(NV) &&
3586 (!isa<Instruction>(NV) ||
3587 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3588 !NV->getType()->isPointerTy()) {
3589 // If we can't generally expand the expression (e.g. it contains
3590 // a divide), but it is already at a loop invariant point before the
3591 // loop, wrap it in an unknown (to prevent the expander from trying
3592 // to re-expand in a potentially unsafe way.) The restriction to
3593 // integer types is required because the unknown hides the base, and
3594 // SCEV can't compute the difference of two unknown pointers.
3595 N = SE.getUnknown(NV);
3596 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3597 if (!N)
3598 continue;
3599 Kind = LSRUse::ICmpZero;
3600 S = SE.getMinusSCEV(N, S);
3602 }
3603
3604 // -1 and the negations of all interesting strides (except the negation
3605 // of -1) are now also interesting.
3606 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3607 if (Factors[i] != -1)
3608 Factors.insert(-(uint64_t)Factors[i]);
3609 Factors.insert(-1);
3610 }
3611 }
3612
3613 // Get or create an LSRUse.
3614 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3615 size_t LUIdx = P.first;
3616 Immediate Offset = P.second;
3617 LSRUse &LU = Uses[LUIdx];
3618
3619 // Record the fixup.
3620 LSRFixup &LF = LU.getNewFixup();
3621 LF.UserInst = UserInst;
3622 LF.OperandValToReplace = U.getOperandValToReplace();
3623 LF.PostIncLoops = TmpPostIncLoops;
3624 LF.Offset = Offset;
3625 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3626 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3627
3628 // Create SCEV as Formula for calculating baseline cost
3629 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3630 Formula F;
3631 F.initialMatch(S, L, SE);
3632 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3633 HardwareLoopProfitable);
3634 VisitedLSRUse.insert(LUIdx);
3635 }
3636
3637 // If this is the first use of this LSRUse, give it a formula.
3638 if (LU.Formulae.empty()) {
3639 InsertInitialFormula(S, LU, LUIdx);
3640 CountRegisters(LU.Formulae.back(), LUIdx);
3641 }
3642 }
3643
3644 LLVM_DEBUG(print_fixups(dbgs()));
3645}
3646
3647/// Insert a formula for the given expression into the given use, separating out
3648/// loop-variant portions from loop-invariant and loop-computable portions.
3649void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3650 size_t LUIdx) {
3651 // Mark uses whose expressions cannot be expanded.
3652 if (!Rewriter.isSafeToExpand(S))
3653 LU.RigidFormula = true;
3654
3655 Formula F;
3656 F.initialMatch(S, L, SE);
3657 bool Inserted = InsertFormula(LU, LUIdx, F);
3658 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3659}
3660
3661/// Insert a simple single-register formula for the given expression into the
3662/// given use.
3663void
3664LSRInstance::InsertSupplementalFormula(const SCEV *S,
3665 LSRUse &LU, size_t LUIdx) {
3666 Formula F;
3667 F.BaseRegs.push_back(S);
3668 F.HasBaseReg = true;
3669 bool Inserted = InsertFormula(LU, LUIdx, F);
3670 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3671}
3672
3673/// Note which registers are used by the given formula, updating RegUses.
3674void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3675 if (F.ScaledReg)
3676 RegUses.countRegister(F.ScaledReg, LUIdx);
3677 for (const SCEV *BaseReg : F.BaseRegs)
3678 RegUses.countRegister(BaseReg, LUIdx);
3679}
3680
3681/// If the given formula has not yet been inserted, add it to the list, and
3682/// return true. Return false otherwise.
3683bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3684 // Do not insert formula that we will not be able to expand.
3685 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3686 "Formula is illegal");
3687
3688 if (!LU.InsertFormula(F, *L))
3689 return false;
3690
3691 CountRegisters(F, LUIdx);
3692 return true;
3693}
3694
3695/// Test whether this fixup will be executed each time the corresponding IV
3696/// increment instruction is executed.
3697bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3698 // If the fixup block dominates the IV increment block then there is no path
3699 // through the loop to the increment that doesn't pass through the fixup.
3700 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3701}
3702
/// Check for other uses of loop-invariant values which we're tracking. These
/// other uses will pin these values in registers, making them less profitable
/// for elimination.
/// TODO: This currently misses non-constant addrec step registers.
/// TODO: Should this give more weight to users inside the loop?
void
LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
  // Seed the worklist with every register LSR is currently tracking; the loop
  // below walks into their subexpressions looking for loop-invariant values
  // with additional out-of-expression users.
  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
  SmallPtrSet<const SCEV *, 32> Visited;

  // Don't collect outside uses if we are favoring postinc - the instructions in
  // the loop are more important than the ones outside of it.
  if (AMK == TTI::AMK_PostIndexed)
    return;

  while (!Worklist.empty()) {
    const SCEV *S = Worklist.pop_back_val();

    // Don't process the same SCEV twice
    if (!Visited.insert(S).second)
      continue;

    // Recurse into composite expressions; only SCEVUnknowns (opaque values)
    // are examined for extra users below.
    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
      append_range(Worklist, N->operands());
    else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
      Worklist.push_back(C->getOperand());
    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
      Worklist.push_back(D->getLHS());
      Worklist.push_back(D->getRHS());
    } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
      const Value *V = US->getValue();
      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
        // Look for instructions defined outside the loop.
        if (L->contains(Inst)) continue;
      } else if (isa<Constant>(V))
        // Constants can be re-materialized.
        continue;
      for (const Use &U : V->uses()) {
        const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
        // Ignore non-instructions.
        if (!UserInst)
          continue;
        // Don't bother if the instruction is an EHPad.
        if (UserInst->isEHPad())
          continue;
        // Ignore instructions in other functions (as can happen with
        // Constants).
        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
          continue;
        // Ignore instructions not dominated by the loop.
        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
          UserInst->getParent() :
          cast<PHINode>(UserInst)->getIncomingBlock(
        // NOTE(review): the continuation line carrying the argument to
        // getIncomingBlock (the incoming-value operand index) is missing from
        // this dump — restore it from upstream before building.
        if (!DT.dominates(L->getHeader(), UseBB))
          continue;
        // Don't bother if the instruction is in a BB which ends in an EHPad.
        if (UseBB->getTerminator()->isEHPad())
          continue;

        // Ignore cases in which the currently-examined value could come from
        // a basic block terminated with an EHPad. This checks all incoming
        // blocks of the phi node since it is possible that the same incoming
        // value comes from multiple basic blocks, only some of which may end
        // in an EHPad. If any of them do, a subsequent rewrite attempt by this
        // pass would try to insert instructions into an EHPad, hitting an
        // assertion.
        if (isa<PHINode>(UserInst)) {
          const auto *PhiNode = cast<PHINode>(UserInst);
          bool HasIncompatibleEHPTerminatedBlock = false;
          llvm::Value *ExpectedValue = U;
          for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
            if (PhiNode->getIncomingValue(I) == ExpectedValue) {
              if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
                HasIncompatibleEHPTerminatedBlock = true;
                break;
              }
            }
          }
          if (HasIncompatibleEHPTerminatedBlock) {
            continue;
          }
        }

        // Don't bother rewriting PHIs in catchswitch blocks.
        if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
          continue;
        // Ignore uses which are part of other SCEV expressions, to avoid
        // analyzing them multiple times.
        if (SE.isSCEVable(UserInst->getType())) {
          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
          // If the user is a no-op, look through to its uses.
          if (!isa<SCEVUnknown>(UserS))
            continue;
          if (UserS == US) {
            Worklist.push_back(
                SE.getUnknown(const_cast<Instruction *>(UserInst)));
            continue;
          }
        }
        // Ignore icmp instructions which are already being analyzed.
        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
          unsigned OtherIdx = !U.getOperandNo();
          Value *OtherOp = ICI->getOperand(OtherIdx);
          if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
            continue;
        }

        // Do not consider uses inside lifetime intrinsics. These are not
        // actually materialized.
        if (UserInst->isLifetimeStartOrEnd())
          continue;

        // Record a Basic use of the invariant value for this outside user.
        std::pair<size_t, Immediate> P =
            getUse(S, LSRUse::Basic, MemAccessTy());
        size_t LUIdx = P.first;
        Immediate Offset = P.second;
        LSRUse &LU = Uses[LUIdx];
        LSRFixup &LF = LU.getNewFixup();
        LF.UserInst = const_cast<Instruction *>(UserInst);
        LF.OperandValToReplace = U;
        LF.Offset = Offset;
        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
        LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
        InsertSupplementalFormula(US, LU, LUIdx);
        // NOTE(review): Uses.size() - 1 equals LUIdx only when getUse created
        // a fresh use — verify against upstream whether LUIdx was intended.
        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
        // One fixup per tracked value is enough to pin it in a register.
        break;
      }
    }
  }
}
3834
/// Split S into subexpressions which can be pulled out into separate
/// registers. If C is non-null, multiply each subexpression by C.
///
/// Return remainder expression after factoring the subexpressions captured by
/// Ops. If Ops is complete, return NULL.
// NOTE(review): this dump is missing the parameter line declaring the Ops
// output vector (between the C and L parameters) — restore it from upstream
// before building.
static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
                                   const Loop *L,
                                   ScalarEvolution &SE,
                                   unsigned Depth = 0) {
  // Arbitrarily cap recursion to protect compile time.
  if (Depth >= 3)
    return S;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // Break out add operands.
    for (const SCEV *S : Add->operands()) {
      const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
      if (Remainder)
        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
    }
    return nullptr;
  }
  const SCEV *Start, *Step;
  const SCEVConstant *Op0;
  const SCEV *Op1;
  if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
    // Split a non-zero base out of an addrec.
    if (Start->isZero())
      return S;

    const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
    // Split the non-zero AddRec unless it is part of a nested recurrence that
    // does not pertain to this loop.
    if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
                      !isa<SCEVAddRecExpr>(Remainder))) {
      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
      Remainder = nullptr;
    }
    if (Remainder != Start) {
      // Rebuild the addrec with whatever portion of the start was not
      // captured into Ops (zero if all of it was captured).
      if (!Remainder)
        Remainder = SE.getConstant(S->getType(), 0);
      return SE.getAddRecExpr(Remainder, Step,
                              cast<SCEVAddRecExpr>(S)->getLoop(),
                              // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
      // NOTE(review): the final argument line of this getAddRecExpr call (the
      // wrap-flags argument and closing parenthesis) is missing from this
      // dump — restore it from upstream before building.
    }
  } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
    // Break (C * (a + b + c)) into C*a + C*b + C*c.
    C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
    const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
    if (Remainder)
      Ops.push_back(SE.getMulExpr(C, Remainder));
    return nullptr;
  }
  return S;
}
3892
/// Return true if the SCEV represents a value that may end up as a
/// post-increment operation.
// NOTE(review): the first line of this function's signature (the "static bool
// mayUsePostIncMode(const TargetTransformInfo &TTI," line) is missing from
// this dump — restore it from upstream before building.
                             LSRUse &LU, const SCEV *S, const Loop *L,
                             ScalarEvolution &SE) {
  // Post-inc addressing only applies to address uses of int/int-vector type.
  if (LU.Kind != LSRUse::Address ||
      !LU.AccessTy.getType()->isIntOrIntVectorTy())
    return false;
  const SCEV *Start;
  // Only affine addrecs with a constant step are candidates.
  if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
    return false;
  // Check if a post-indexed load/store can be used.
  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
      TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
    // A constant start could be folded as an immediate; only a non-constant
    // loop-invariant start benefits from post-inc formation here.
    if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
      return true;
  }
  return false;
}
3912
/// Helper function for LSRInstance::GenerateReassociations.
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
                                             const Formula &Base,
                                             unsigned Depth, size_t Idx,
                                             bool IsScaledReg) {
  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
  // Don't generate reassociations for the base register of a value that
  // may generate a post-increment operator. The reason is that the
  // reassociations cause extra base+register formula to be created,
  // and possibly chosen, but the post-increment is more efficient.
  if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
    return;
  // NOTE(review): the declaration of the AddOps vector is missing from this
  // dump (it should precede this call) — restore it from upstream.
  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
  if (Remainder)
    AddOps.push_back(Remainder);

  // A single operand means nothing was split out; there is nothing to
  // reassociate.
  if (AddOps.size() == 1)
    return;

  // NOTE(review): the first line of this for-statement (declaring the
  // iterator J = AddOps.begin()) is missing from this dump — restore it
  // from upstream before building.
                           JE = AddOps.end();
       J != JE; ++J) {
    // Loop-variant "unknown" values are uninteresting; we won't be able to
    // do anything meaningful with them.
    if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
      continue;

    // Don't pull a constant into a register if the constant could be folded
    // into an immediate field.
    if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
                         LU.AccessTy, *J, Base.getNumRegs() > 1))
      continue;

    // Collect all operands except *J.
    SmallVector<SCEVUse, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
    InnerAddOps.append(std::next(J), std::as_const(AddOps).end());

    // Don't leave just a constant behind in a register if the constant could
    // be folded into an immediate field.
    if (InnerAddOps.size() == 1 &&
        isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
                         LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
      continue;

    const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
    if (InnerSum->isZero())
      continue;
    Formula F = Base;

    // Scalable unfolded offsets cannot be combined with the fixed-offset
    // arithmetic below.
    if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
      continue;

    // Add the remaining pieces of the add back into the new formula.
    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                InnerSumSC->getValue()->getZExtValue())) {
      F.UnfoldedOffset =
          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                              InnerSumSC->getValue()->getZExtValue());
      if (IsScaledReg) {
        F.ScaledReg = nullptr;
        F.Scale = 0;
      } else
        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
    } else if (IsScaledReg)
      F.ScaledReg = InnerSum;
    else
      F.BaseRegs[Idx] = InnerSum;

    // Add J as its own register, or an unfolded immediate.
    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                SC->getValue()->getZExtValue()))
      F.UnfoldedOffset =
          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                              SC->getValue()->getZExtValue());
    else
      F.BaseRegs.push_back(*J);
    // We may have changed the number of register in base regs, adjust the
    // formula accordingly.
    F.canonicalize(*L);

    if (InsertFormula(LU, LUIdx, F))
      // If that formula hadn't been seen before, recurse to find more like
      // it.
      // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
      // Because just Depth is not enough to bound compile time.
      // This means that every time AddOps.size() is greater 16^x we will add
      // x to Depth.
      GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
                             Depth + 1 + (Log2_32(AddOps.size()) >> 2));
  }
}
4009
4010/// Split out subexpressions from adds and the bases of addrecs.
4011void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4012 Formula Base, unsigned Depth) {
4013 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4014 // Arbitrarily cap recursion to protect compile time.
4015 if (Depth >= 3)
4016 return;
4017
4018 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4019 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4020
4021 if (Base.Scale == 1)
4022 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4023 /* Idx */ -1, /* IsScaledReg */ true);
4024}
4025
/// Generate a formula consisting of all of the loop-dominating registers added
/// into a single register.
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                       Formula Base) {
  // This method is only interesting on a plurality of registers.
  if (Base.BaseRegs.size() + (Base.Scale == 1) +
          (Base.UnfoldedOffset.isNonZero()) <=
      1)
    return;

  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
  // processing the formula.
  Base.unscale();
  // NOTE(review): the declaration of the Ops vector is missing from this
  // dump (it should appear here) — restore it from upstream before building.
  Formula NewBase = Base;
  NewBase.BaseRegs.clear();
  Type *CombinedIntegerType = nullptr;
  // Partition the base registers: those that dominate the loop header and
  // have no evolution in this loop can be summed into one register; the rest
  // stay in NewBase.
  for (const SCEV *BaseReg : Base.BaseRegs) {
    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
        !SE.hasComputableLoopEvolution(BaseReg, L)) {
      if (!CombinedIntegerType)
        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
      Ops.push_back(BaseReg);
    }
    else
      NewBase.BaseRegs.push_back(BaseReg);
  }

  // If no register is relevant, we're done.
  if (Ops.size() == 0)
    return;

  // Utility function for generating the required variants of the combined
  // registers.
  auto GenerateFormula = [&](const SCEV *Sum) {
    Formula F = NewBase;

    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
    // opportunity to fold something. For now, just ignore such cases
    // rather than proceed with zero in a register.
    if (Sum->isZero())
      return;

    F.BaseRegs.push_back(Sum);
    F.canonicalize(*L);
    (void)InsertFormula(LU, LUIdx, F);
  };

  // If we collected at least two registers, generate a formula combining them.
  if (Ops.size() > 1) {
    SmallVector<SCEVUse, 4> OpsCopy(Ops); // Don't let SE modify Ops.
    GenerateFormula(SE.getAddExpr(OpsCopy));
  }

  // If we have an unfolded offset, generate a formula combining it with the
  // registers collected.
  if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
    assert(CombinedIntegerType && "Missing a type for the unfolded offset");
    Ops.push_back(SE.getConstant(CombinedIntegerType,
                                 NewBase.UnfoldedOffset.getFixedValue(), true));
    NewBase.UnfoldedOffset = Immediate::getFixed(0);
    GenerateFormula(SE.getAddExpr(Ops));
  }
}
4090
4091/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4092void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4093 const Formula &Base, size_t Idx,
4094 bool IsScaledReg) {
4095 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4096 GlobalValue *GV = ExtractSymbol(G, SE);
4097 if (G->isZero() || !GV)
4098 return;
4099 Formula F = Base;
4100 F.BaseGV = GV;
4101 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4102 return;
4103 if (IsScaledReg)
4104 F.ScaledReg = G;
4105 else
4106 F.BaseRegs[Idx] = G;
4107 (void)InsertFormula(LU, LUIdx, F);
4108}
4109
4110/// Generate reuse formulae using symbolic offsets.
4111void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4112 Formula Base) {
4113 // We can't add a symbolic offset if the address already contains one.
4114 if (Base.BaseGV) return;
4115
4116 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4117 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4118 if (Base.Scale == 1)
4119 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4120 /* IsScaledReg */ true);
4121}
4122
/// Helper function for LSRInstance::GenerateConstantOffsets.
void LSRInstance::GenerateConstantOffsetsImpl(
    LSRUse &LU, unsigned LUIdx, const Formula &Base,
    const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {

  // Move Offset out of the register (into BaseOffset) and insert the
  // resulting formula if it is legal for the target.
  auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
    Formula F = Base;
    if (!Base.BaseOffset.isCompatibleImmediate(Offset))
      return;
    F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);

    if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
      // Add the offset to the base register.
      const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
      const SCEV *NewG = SE.getAddExpr(NewOffset, G);
      // If it cancelled out, drop the base register, otherwise update it.
      if (NewG->isZero()) {
        if (IsScaledReg) {
          F.Scale = 0;
          F.ScaledReg = nullptr;
        } else
          F.deleteBaseReg(F.BaseRegs[Idx]);
        F.canonicalize(*L);
      } else if (IsScaledReg)
        F.ScaledReg = NewG;
      else
        F.BaseRegs[Idx] = NewG;

      (void)InsertFormula(LU, LUIdx, F);
    }
  };

  SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];

  // With constant offsets and constant steps, we can generate pre-inc
  // accesses by having the offset equal the step. So, for access #0 with a
  // step of 8, we generate a G - 8 base which would require the first access
  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
  // for itself and hopefully becomes the base for other accesses. This means
  // that a single pre-indexed access can be generated to become the new
  // base pointer for each iteration of the loop, resulting in no extra add/sub
  // instructions for pointer updating.
  if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
    const APInt *StepInt;
    if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
      int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
                                           : StepInt->getZExtValue();

      // Offset - Step turns the first access into a pre-indexed update.
      for (Immediate Offset : Worklist) {
        if (Offset.isFixed()) {
          Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
          GenerateOffset(G, Offset);
        }
      }
    }
  }
  // The plain variants: shift each worklist offset out of the register.
  for (Immediate Offset : Worklist)
    GenerateOffset(G, Offset);

  // Also try folding an immediate already present inside G into BaseOffset.
  // ExtractImmediate strips the immediate out of G as a side effect.
  Immediate Imm = ExtractImmediate(G, SE);
  if (G->isZero() || Imm.isZero() ||
      !Base.BaseOffset.isCompatibleImmediate(Imm))
    return;
  Formula F = Base;
  F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
    return;
  if (IsScaledReg) {
    F.ScaledReg = G;
  } else {
    F.BaseRegs[Idx] = G;
    // We may generate non canonical Formula if G is a recurrent expr reg
    // related with current loop while F.ScaledReg is not.
    F.canonicalize(*L);
  }
  (void)InsertFormula(LU, LUIdx, F);
}
4200
/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // TODO: For now, just add the min and max offset, because it usually isn't
  // worthwhile looking at everything inbetween.
  // NOTE(review): the declaration of the Worklist vector is missing from this
  // dump (it should appear here) — restore it from upstream before building.
  Worklist.push_back(LU.MinOffset);
  if (LU.MaxOffset != LU.MinOffset)
    Worklist.push_back(LU.MaxOffset);

  // Try each base register, then the scaled register when the scale is 1.
  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
  if (Base.Scale == 1)
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
                                /* IsScaledReg */ true);
}
4217
/// For ICmpZero, check to see if we can scale up the comparison. For example, x
/// == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
                                         Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;

  // Don't do this if there is more than one offset.
  if (LU.MinOffset != LU.MaxOffset) return;

  // Check if transformation is valid. It is illegal to multiply pointer.
  if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
    return;
  for (const SCEV *BaseReg : Base.BaseRegs)
    if (BaseReg->getType()->isPointerTy())
      return;
  assert(!Base.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    // Check that Factor can be represented by IntTy
    if (!ConstantInt::isValueValidForType(IntTy, Factor))
      continue;
    // Check that the multiplication doesn't overflow.
    if (Base.BaseOffset.isMin() && Factor == -1)
      continue;
    // Not supporting scalable immediates.
    if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
      continue;
    Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
    assert(Factor != 0 && "Zero factor not expected!");
    // Division round-trip detects signed-multiply overflow.
    if (NewBaseOffset.getFixedValue() / Factor !=
        Base.BaseOffset.getFixedValue())
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
      continue;

    // Check that multiplying with the use offset doesn't overflow.
    Immediate Offset = LU.MinOffset;
    if (Offset.isMin() && Factor == -1)
      continue;
    Offset = Offset.mulUnsigned(Factor);
    if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
      continue;

    Formula F = Base;
    F.BaseOffset = NewBaseOffset;

    // Check that this scale is legal.
    if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
      continue;

    // Compensate for the use having MinOffset built into it.
    F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);

    const SCEV *FactorS = SE.getConstant(IntTy, Factor);

    // Check that multiplying with each base register doesn't overflow.
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // Check that multiplying with the unfolded offset doesn't overflow.
    if (F.UnfoldedOffset.isNonZero()) {
      if (F.UnfoldedOffset.isMin() && Factor == -1)
        continue;
      F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
      if (F.UnfoldedOffset.getFixedValue() / Factor !=
          Base.UnfoldedOffset.getFixedValue())
        continue;
      // If the offset will be truncated, check that it is in bounds.
      // NOTE(review): the first line of this range check (the
      // "if (!ConstantInt::isValueValidForType(" line) is missing from this
      // dump — restore it from upstream before building.
          IntTy, F.UnfoldedOffset.getFixedValue()))
        continue;
    }

    // If we make it here and it's legal, add it.
    (void)InsertFormula(LU, LUIdx, F);
  next:;
  }
}
4318
/// Generate stride factor reuse formulae by making use of scaled-offset address
/// modes, for example.
///
/// For each interesting stride in Factors, try to rewrite one of Base's
/// addrec base registers as (reg / Factor) * Factor, moving the divided
/// register into the formula's scaled-register slot so that the target's
/// scaled-index addressing modes can absorb the multiplication.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  // Try to unscale the formula to generate a better scale.
  if (Base.Scale != 0 && !Base.unscale())
    return;

  assert(Base.Scale == 0 && "unscale did not did its job!");

  // Check each interesting stride. Note that Base is a by-value copy, so
  // mutating its Scale/HasBaseReg fields here is local to this call.
  for (int64_t Factor : Factors) {
    Base.Scale = Factor;
    Base.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                    Base)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      // NOTE: this mutates LU.Kind, which persists for all later queries on
      // this use.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
                     LU.AccessTy, Base) &&
          LU.AllFixupsOutsideLoop)
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
        Base.BaseOffset.isZero() && !Base.BaseGV)
      continue;
    // For each addrec base reg, if its loop is current loop, apply the scale.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
          if (!Quotient->isZero()) {
            // TODO: This could be optimized to avoid all the copying.
            Formula F = Base;
            F.ScaledReg = Quotient;
            F.deleteBaseReg(F.BaseRegs[i]);
            // The canonical representation of 1*reg is reg, which is already in
            // Base. In that case, do not try to insert the formula, it will be
            // rejected anyway.
            if (F.Scale == 1 && (F.BaseRegs.empty() ||
                                 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
              continue;
            // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
            // non canonical Formula with ScaledReg's loop not being L.
            if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
              F.canonicalize(*L);
            (void)InsertFormula(LU, LUIdx, F);
          }
      }
    }
  }
}
4386
4387/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4388/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4389/// perform the extension/truncate and normalize again, as the normalized form
4390/// can result in folds that are not valid in the post-inc use contexts. The
4391/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4392static const SCEV *
4394 const SCEV *Expr, Type *ToTy,
4395 ScalarEvolution &SE) {
4396 const SCEV *Result = nullptr;
4397 for (auto &L : Loops) {
4398 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4399 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4400 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4401 if (!New || (Result && New != Result))
4402 return nullptr;
4403 Result = New;
4404 }
4405
4406 assert(Result && "failed to create expression");
4407 return Result;
4408}
4409
4410/// Generate reuse formulae from different IV types.
4411void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4412 // Don't bother truncating symbolic values.
4413 if (Base.BaseGV) return;
4414
4415 // Determine the integer type for the base formula.
4416 Type *DstTy = Base.getType();
4417 if (!DstTy) return;
4418 if (DstTy->isPointerTy())
4419 return;
4420
4421 // It is invalid to extend a pointer type so exit early if ScaledReg or
4422 // any of the BaseRegs are pointers.
4423 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4424 return;
4425 if (any_of(Base.BaseRegs,
4426 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4427 return;
4428
4430 for (auto &LF : LU.Fixups)
4431 Loops.push_back(LF.PostIncLoops);
4432
4433 for (Type *SrcTy : Types) {
4434 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4435 Formula F = Base;
4436
4437 // Sometimes SCEV is able to prove zero during ext transform. It may
4438 // happen if SCEV did not do all possible transforms while creating the
4439 // initial node (maybe due to depth limitations), but it can do them while
4440 // taking ext.
4441 if (F.ScaledReg) {
4442 const SCEV *NewScaledReg =
4443 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4444 if (!NewScaledReg || NewScaledReg->isZero())
4445 continue;
4446 F.ScaledReg = NewScaledReg;
4447 }
4448 bool HasZeroBaseReg = false;
4449 for (const SCEV *&BaseReg : F.BaseRegs) {
4450 const SCEV *NewBaseReg =
4451 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4452 if (!NewBaseReg || NewBaseReg->isZero()) {
4453 HasZeroBaseReg = true;
4454 break;
4455 }
4456 BaseReg = NewBaseReg;
4457 }
4458 if (HasZeroBaseReg)
4459 continue;
4460
4461 // TODO: This assumes we've done basic processing on all uses and
4462 // have an idea what the register usage is.
4463 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4464 continue;
4465
4466 F.canonicalize(*L);
4467 (void)InsertFormula(LU, LUIdx, F);
4468 }
4469 }
4470}
4471
4472namespace {
4473
4474/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4475/// modifications so that the search phase doesn't have to worry about the data
4476/// structures moving underneath it.
4477struct WorkItem {
4478 size_t LUIdx;
4479 Immediate Imm;
4480 const SCEV *OrigReg;
4481
4482 WorkItem(size_t LI, Immediate I, const SCEV *R)
4483 : LUIdx(LI), Imm(I), OrigReg(R) {}
4484
4485 void print(raw_ostream &OS) const;
4486 void dump() const;
4487};
4488
4489} // end anonymous namespace
4490
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this work item (register, use index, and offset) for debug output.
void WorkItem::print(raw_ostream &OS) const {
  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
     << " , add offset " << Imm;
}

/// Dump this work item to stderr, followed by a newline.
LLVM_DUMP_METHOD void WorkItem::dump() const {
  print(errs()); errs() << '\n';
}
#endif
4501
4502/// Look for registers which are a constant distance apart and try to form reuse
4503/// opportunities between them.
4504void LSRInstance::GenerateCrossUseConstantOffsets() {
4505 // Group the registers by their value without any added constant offset.
4506 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4507
4508 DenseMap<const SCEV *, ImmMapTy> Map;
4509 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4511 for (const SCEV *Use : RegUses) {
4512 SCEVUse Reg = Use; // Make a copy for ExtractImmediate to modify.
4513 Immediate Imm = ExtractImmediate(Reg, SE);
4514 auto Pair = Map.try_emplace(Reg);
4515 if (Pair.second)
4516 Sequence.push_back(Reg);
4517 Pair.first->second.insert(std::make_pair(Imm, Use));
4518 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4519 }
4520
4521 // Now examine each set of registers with the same base value. Build up
4522 // a list of work to do and do the work in a separate step so that we're
4523 // not adding formulae and register counts while we're searching.
4524 SmallVector<WorkItem, 32> WorkItems;
4525 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4526 UniqueItems;
4527 for (const SCEV *Reg : Sequence) {
4528 const ImmMapTy &Imms = Map.find(Reg)->second;
4529
4530 // It's not worthwhile looking for reuse if there's only one offset.
4531 if (Imms.size() == 1)
4532 continue;
4533
4534 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4535 for (const auto &Entry
4536 : Imms) dbgs()
4537 << ' ' << Entry.first;
4538 dbgs() << '\n');
4539
4540 // Examine each offset.
4541 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4542 J != JE; ++J) {
4543 const SCEV *OrigReg = J->second;
4544
4545 Immediate JImm = J->first;
4546 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4547
4548 if (!isa<SCEVConstant>(OrigReg) &&
4549 UsedByIndicesMap[Reg].count() == 1) {
4550 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4551 << '\n');
4552 continue;
4553 }
4554
4555 // Conservatively examine offsets between this orig reg a few selected
4556 // other orig regs.
4557 Immediate First = Imms.begin()->first;
4558 Immediate Last = std::prev(Imms.end())->first;
4559 if (!First.isCompatibleImmediate(Last)) {
4560 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4561 << "\n");
4562 continue;
4563 }
4564 // Only scalable if both terms are scalable, or if one is scalable and
4565 // the other is 0.
4566 bool Scalable = First.isScalable() || Last.isScalable();
4567 int64_t FI = First.getKnownMinValue();
4568 int64_t LI = Last.getKnownMinValue();
4569 // Compute (First + Last) / 2 without overflow using the fact that
4570 // First + Last = 2 * (First + Last) + (First ^ Last).
4571 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4572 // If the result is negative and FI is odd and LI even (or vice versa),
4573 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4574 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4575 ImmMapTy::const_iterator OtherImms[] = {
4576 Imms.begin(), std::prev(Imms.end()),
4577 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4578 for (const auto &M : OtherImms) {
4579 if (M == J || M == JE) continue;
4580 if (!JImm.isCompatibleImmediate(M->first))
4581 continue;
4582
4583 // Compute the difference between the two.
4584 Immediate Imm = JImm.subUnsigned(M->first);
4585 for (unsigned LUIdx : UsedByIndices.set_bits())
4586 // Make a memo of this use, offset, and register tuple.
4587 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4588 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4589 }
4590 }
4591 }
4592
4593 Map.clear();
4594 Sequence.clear();
4595 UsedByIndicesMap.clear();
4596 UniqueItems.clear();
4597
4598 // Now iterate through the worklist and add new formulae.
4599 for (const WorkItem &WI : WorkItems) {
4600 size_t LUIdx = WI.LUIdx;
4601 LSRUse &LU = Uses[LUIdx];
4602 Immediate Imm = WI.Imm;
4603 const SCEV *OrigReg = WI.OrigReg;
4604
4605 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4606 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4607 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4608
4609 // TODO: Use a more targeted data structure.
4610 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4611 Formula F = LU.Formulae[L];
4612 // FIXME: The code for the scaled and unscaled registers looks
4613 // very similar but slightly different. Investigate if they
4614 // could be merged. That way, we would not have to unscale the
4615 // Formula.
4616 F.unscale();
4617 // Use the immediate in the scaled register.
4618 if (F.ScaledReg == OrigReg) {
4619 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4620 continue;
4621 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4622 // Don't create 50 + reg(-50).
4623 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4624 if (F.referencesReg(S))
4625 continue;
4626 Formula NewF = F;
4627 NewF.BaseOffset = Offset;
4628 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4629 NewF))
4630 continue;
4631 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4632
4633 // If the new scale is a constant in a register, and adding the constant
4634 // value to the immediate would produce a value closer to zero than the
4635 // immediate itself, then the formula isn't worthwhile.
4636 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4637 // FIXME: Do we need to do something for scalable immediates here?
4638 // A scalable SCEV won't be constant, but we might still have
4639 // something in the offset? Bail out for now to be safe.
4640 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4641 continue;
4642 if (C->getValue()->isNegative() !=
4643 (NewF.BaseOffset.isLessThanZero()) &&
4644 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4645 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4646 continue;
4647 }
4648
4649 // OK, looks good.
4650 NewF.canonicalize(*this->L);
4651 (void)InsertFormula(LU, LUIdx, NewF);
4652 } else {
4653 // Use the immediate in a base register.
4654 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4655 const SCEV *BaseReg = F.BaseRegs[N];
4656 if (BaseReg != OrigReg)
4657 continue;
4658 Formula NewF = F;
4659 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4660 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4661 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4662 continue;
4663 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4664 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4665 LU.Kind, LU.AccessTy, NewF)) {
4666 if (AMK == TTI::AMK_PostIndexed &&
4667 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4668 continue;
4669 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4670 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4671 continue;
4672 NewF = F;
4673 NewF.UnfoldedOffset = NewUnfoldedOffset;
4674 }
4675 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4676
4677 // If the new formula has a constant in a register, and adding the
4678 // constant value to the immediate would produce a value closer to
4679 // zero than the immediate itself, then the formula isn't worthwhile.
4680 for (const SCEV *NewReg : NewF.BaseRegs)
4681 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4682 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4683 goto skip_formula;
4684 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4685 .abs()
4686 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4687 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4688 .countr_zero() >=
4690 NewF.BaseOffset.getFixedValue()))
4691 goto skip_formula;
4692 }
4693
4694 // Ok, looks good.
4695 NewF.canonicalize(*this->L);
4696 (void)InsertFormula(LU, LUIdx, NewF);
4697 break;
4698 skip_formula:;
4699 }
4700 }
4701 }
4702 }
4703}
4704
4705/// Generate formulae for each use.
4706void
4707LSRInstance::GenerateAllReuseFormulae() {
4708 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4709 // queries are more precise.
4710 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4711 LSRUse &LU = Uses[LUIdx];
4712 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4713 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4714 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4715 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4716 }
4717 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4718 LSRUse &LU = Uses[LUIdx];
4719 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4720 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4721 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4722 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4723 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4724 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4725 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4726 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4727 }
4728 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4729 LSRUse &LU = Uses[LUIdx];
4730 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4731 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4732 }
4733
4734 GenerateCrossUseConstantOffsets();
4735
4736 LLVM_DEBUG(dbgs() << "\n"
4737 "After generating reuse formulae:\n";
4738 print_uses(dbgs()));
4739}
4740
4741/// If there are multiple formulae with the same set of registers used
4742/// by other uses, pick the best one and delete the others.
4743void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4744 DenseSet<const SCEV *> VisitedRegs;
4745 SmallPtrSet<const SCEV *, 16> Regs;
4746 SmallPtrSet<const SCEV *, 16> LoserRegs;
4747#ifndef NDEBUG
4748 bool ChangedFormulae = false;
4749#endif
4750
4751 // Collect the best formula for each unique set of shared registers. This
4752 // is reset for each use.
4753 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4754
4755 BestFormulaeTy BestFormulae;
4756
4757 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4758 LSRUse &LU = Uses[LUIdx];
4759 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4760 dbgs() << '\n');
4761
4762 bool Any = false;
4763 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4764 FIdx != NumForms; ++FIdx) {
4765 Formula &F = LU.Formulae[FIdx];
4766
4767 // Some formulas are instant losers. For example, they may depend on
4768 // nonexistent AddRecs from other loops. These need to be filtered
4769 // immediately, otherwise heuristics could choose them over others leading
4770 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4771 // avoids the need to recompute this information across formulae using the
4772 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4773 // the corresponding bad register from the Regs set.
4774 Cost CostF(L, SE, TTI, AMK);
4775 Regs.clear();
4776 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4777 &LoserRegs);
4778 if (CostF.isLoser()) {
4779 // During initial formula generation, undesirable formulae are generated
4780 // by uses within other loops that have some non-trivial address mode or
4781 // use the postinc form of the IV. LSR needs to provide these formulae
4782 // as the basis of rediscovering the desired formula that uses an AddRec
4783 // corresponding to the existing phi. Once all formulae have been
4784 // generated, these initial losers may be pruned.
4785 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4786 dbgs() << "\n");
4787 }
4788 else {
4790 for (const SCEV *Reg : F.BaseRegs) {
4791 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4792 Key.push_back(Reg);
4793 }
4794 if (F.ScaledReg &&
4795 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4796 Key.push_back(F.ScaledReg);
4797 // Unstable sort by host order ok, because this is only used for
4798 // uniquifying.
4799 llvm::sort(Key);
4800
4801 std::pair<BestFormulaeTy::const_iterator, bool> P =
4802 BestFormulae.insert(std::make_pair(Key, FIdx));
4803 if (P.second)
4804 continue;
4805
4806 Formula &Best = LU.Formulae[P.first->second];
4807
4808 Cost CostBest(L, SE, TTI, AMK);
4809 Regs.clear();
4810 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4811 HardwareLoopProfitable);
4812 if (CostF.isLess(CostBest))
4813 std::swap(F, Best);
4814 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4815 dbgs() << "\n"
4816 " in favor of formula ";
4817 Best.print(dbgs()); dbgs() << '\n');
4818 }
4819#ifndef NDEBUG
4820 ChangedFormulae = true;
4821#endif
4822 LU.DeleteFormula(F);
4823 --FIdx;
4824 --NumForms;
4825 Any = true;
4826 }
4827
4828 // Now that we've filtered out some formulae, recompute the Regs set.
4829 if (Any)
4830 LU.RecomputeRegs(LUIdx, RegUses);
4831
4832 // Reset this to prepare for the next use.
4833 BestFormulae.clear();
4834 }
4835
4836 LLVM_DEBUG(if (ChangedFormulae) {
4837 dbgs() << "\n"
4838 "After filtering out undesirable candidates:\n";
4839 print_uses(dbgs());
4840 });
4841}
4842
4843/// Estimate the worst-case number of solutions the solver might have to
4844/// consider. It almost never considers this many solutions because it prune the
4845/// search space, but the pruning isn't always sufficient.
4846size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4847 size_t Power = 1;
4848 for (const LSRUse &LU : Uses) {
4849 size_t FSize = LU.Formulae.size();
4850 if (FSize >= ComplexityLimit) {
4851 Power = ComplexityLimit;
4852 break;
4853 }
4854 Power *= FSize;
4855 if (Power >= ComplexityLimit)
4856 break;
4857 }
4858 return Power;
4859}
4860
4861/// When one formula uses a superset of the registers of another formula, it
4862/// won't help reduce register pressure (though it may not necessarily hurt
4863/// register pressure); remove it to simplify the system.
4864void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4865 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4866 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4867
4868 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4869 "which use a superset of registers used by other "
4870 "formulae.\n");
4871
4872 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4873 LSRUse &LU = Uses[LUIdx];
4874 bool Any = false;
4875 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4876 Formula &F = LU.Formulae[i];
4877 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4878 continue;
4879 // Look for a formula with a constant or GV in a register. If the use
4880 // also has a formula with that same value in an immediate field,
4881 // delete the one that uses a register.
4883 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4884 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4885 Formula NewF = F;
4886 //FIXME: Formulas should store bitwidth to do wrapping properly.
4887 // See PR41034.
4888 NewF.BaseOffset =
4889 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4890 (uint64_t)C->getValue()->getSExtValue());
4891 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4892 (I - F.BaseRegs.begin()));
4893 if (LU.HasFormulaWithSameRegs(NewF)) {
4894 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4895 dbgs() << '\n');
4896 LU.DeleteFormula(F);
4897 --i;
4898 --e;
4899 Any = true;
4900 break;
4901 }
4902 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4903 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4904 if (!F.BaseGV) {
4905 Formula NewF = F;
4906 NewF.BaseGV = GV;
4907 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4908 (I - F.BaseRegs.begin()));
4909 if (LU.HasFormulaWithSameRegs(NewF)) {
4910 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4911 dbgs() << '\n');
4912 LU.DeleteFormula(F);
4913 --i;
4914 --e;
4915 Any = true;
4916 break;
4917 }
4918 }
4919 }
4920 }
4921 }
4922 if (Any)
4923 LU.RecomputeRegs(LUIdx, RegUses);
4924 }
4925
4926 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4927 }
4928}
4929
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by assuming that uses separated "
                "by a constant offset will use the same registers.\n");

  // This is especially useful for unrolled loops.

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (const Formula &F : LU.Formulae) {
      // Only formulae with a nonzero constant offset and a scale of 0 or 1
      // are candidates for merging into another use.
      if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
        continue;
      assert((LU.Kind == LSRUse::Address || LU.Kind == LSRUse::ICmpZero) &&
             "Only address and cmp uses expected to have nonzero BaseOffset");

      // Find another use whose formulae differ from F only by the offset.
      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
      if (!LUThatHas)
        continue;

      // Give up if the combined offset range would not be legal for the
      // merged use.
      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
                              LU.Kind, LU.AccessTy))
        continue;

      LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');

      // The merged use is only outside the loop / unconditional if both
      // original uses were.
      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;

      // Transfer the fixups of LU to LUThatHas.
      for (LSRFixup &Fixup : LU.Fixups) {
        Fixup.Offset += F.BaseOffset;
        LUThatHas->pushFixup(Fixup);
        LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
      }

#ifndef NDEBUG
      Type *FixupType = LUThatHas->Fixups[0].OperandValToReplace->getType();
      for (LSRFixup &Fixup : LUThatHas->Fixups)
        assert(Fixup.OperandValToReplace->getType() == FixupType &&
               "Expected all fixups to have the same type");
#endif

      // Delete formulae from the new use which are no longer legal.
      bool Any = false;
      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
        Formula &F = LUThatHas->Formulae[i];
        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
          LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          LUThatHas->DeleteFormula(F);
          --i;
          --e;
          Any = true;
        }
      }

      if (Any)
        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);

      // Delete the old use. DeleteUse compacts Uses, so step the outer index
      // back to revisit the element now occupying this slot.
      DeleteUse(LU, LUIdx);
      --LUIdx;
      --NumUses;
      break;
    }
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5005
5006/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5007/// we've done more filtering, as it may be able to find more formulae to
5008/// eliminate.
5009void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5010 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5011 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5012
5013 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5014 "undesirable dedicated registers.\n");
5015
5016 FilterOutUndesirableDedicatedRegisters();
5017
5018 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5019 }
5020}
5021
/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
/// Pick the best one and delete the others.
/// This narrowing heuristic is to keep as many formulae with different
/// Scale and ScaledReg pair as possible while narrowing the search space.
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by choosing the best Formula "
                "from the Formulae with the same Scale and ScaledReg.\n");

  // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;

  BestFormulaeTy BestFormulae;
#ifndef NDEBUG
  bool ChangedFormulae = false;
#endif
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
               dbgs() << '\n');

    // Return true if Formula FA is better than Formula FB.
    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
      // First we will try to choose the Formula with fewer new registers.
      // For a register used by current Formula, the more the register is
      // shared among LSRUses, the less we increase the register number
      // counter of the formula.
      size_t FARegNum = 0;
      for (const SCEV *Reg : FA.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FARegNum += (NumUses - UsedByIndices.count() + 1);
      }
      size_t FBRegNum = 0;
      for (const SCEV *Reg : FB.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FBRegNum += (NumUses - UsedByIndices.count() + 1);
      }
      if (FARegNum != FBRegNum)
        return FARegNum < FBRegNum;

      // If the new register numbers are the same, choose the Formula with
      // less Cost.
      Cost CostFA(L, SE, TTI, AMK);
      Cost CostFB(L, SE, TTI, AMK);
      Regs.clear();
      CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      Regs.clear();
      CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      return CostFA.isLess(CostFB);
    };

    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      // Formulae without a scaled register are not subject to this filter.
      if (!F.ScaledReg)
        continue;
      // First formula seen for this (ScaledReg, Scale) pair becomes the
      // provisional best; later duplicates compete against it.
      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
      if (P.second)
        continue;

      Formula &Best = LU.Formulae[P.first->second];
      // Keep the winner in the Best slot; the loser (now in F) is deleted.
      if (IsBetterThan(F, Best))
        std::swap(F, Best);
      LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                 dbgs() << "\n"
                          "    in favor of formula ";
                 Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
      ChangedFormulae = true;
#endif
      LU.DeleteFormula(F);
      // Revisit this index: deletion compacts the formula list, so another
      // formula now occupies slot FIdx.
      --FIdx;
      --NumForms;
      Any = true;
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Reset this to prepare for the next use.
    BestFormulae.clear();
  }

  LLVM_DEBUG(if (ChangedFormulae) {
    dbgs() << "\n"
              "After filtering out undesirable candidates:\n";
    print_uses(dbgs());
  });
}
5123
/// If we are over the complexity limit, filter out any post-inc preferring
/// variables to only post-inc values.
void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
  // This narrowing only applies when the target prefers post-indexed
  // addressing modes.
  if (AMK != TTI::AMK_PostIndexed)
    return;
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
                       "Narrowing the search space by choosing the lowest "
                       "register Formula for PostInc Uses.\n");

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];

    // Only address uses can benefit from post-inc addressing.
    if (LU.Kind != LSRUse::Address)
      continue;
    // Skip uses whose access type supports neither post-indexed loads nor
    // post-indexed stores on this target.
    if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
        !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
      continue;

    // Find the minimum register count over this use's formulae...
    size_t MinRegs = std::numeric_limits<size_t>::max();
    for (const Formula &F : LU.Formulae)
      MinRegs = std::min(F.getNumRegs(), MinRegs);

    // ...and delete every formula that needs more registers than that.
    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      if (F.getNumRegs() > MinRegs) {
        LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                   dbgs() << "\n");
        LU.DeleteFormula(F);
        // Revisit this index: deletion compacts the list, so a different
        // formula now occupies slot FIdx.
        --FIdx;
        --NumForms;
        Any = true;
      }
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Stop early once the complexity estimate drops back under the limit.
    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
      break;
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5171
5172/// The function delete formulas with high registers number expectation.
5173/// Assuming we don't know the value of each formula (already delete
5174/// all inefficient), generate probability of not selecting for each
5175/// register.
5176/// For example,
5177/// Use1:
5178/// reg(a) + reg({0,+,1})
5179/// reg(a) + reg({-1,+,1}) + 1
5180/// reg({a,+,1})
5181/// Use2:
5182/// reg(b) + reg({0,+,1})
5183/// reg(b) + reg({-1,+,1}) + 1
5184/// reg({b,+,1})
5185/// Use3:
5186/// reg(c) + reg(b) + reg({0,+,1})
5187/// reg(c) + reg({b,+,1})
5188///
5189/// Probability of not selecting
5190/// Use1 Use2 Use3
5191/// reg(a) (1/3) * 1 * 1
5192/// reg(b) 1 * (1/3) * (1/2)
5193/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5194/// reg({-1,+,1}) (2/3) * (2/3) * 1
5195/// reg({a,+,1}) (2/3) * 1 * 1
5196/// reg({b,+,1}) 1 * (2/3) * (2/3)
5197/// reg(c) 1 * 1 * 0
5198///
5199/// Now count registers number mathematical expectation for each formula:
5200/// Note that for each use we exclude probability if not selecting for the use.
5201/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
/// probability 1/3 of not selecting for Use1).
5203/// Use1:
5204/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5205/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5206/// reg({a,+,1}) 1
5207/// Use2:
5208/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5209/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5210/// reg({b,+,1}) 2/3
5211/// Use3:
5212/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5213/// reg(c) + reg({b,+,1}) 1 + 2/3
void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;
  // Ok, we have too many formulae on our hands to conveniently handle.
  // Use a rough heuristic to thin out the list.

  // Set of Regs which will be 100% used in final solution.
  // Used in each formula of a solution (in example above this is reg(c)).
  // We can skip them in calculations.
  SmallPtrSet<const SCEV *, 4> UniqRegs;
  LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

  // Map each register to its probability of not being selected, computed as
  // the product of the per-use not-selected probabilities over all uses that
  // reference the register.
  DenseMap <const SCEV *, float> RegNumMap;
  for (const SCEV *Reg : RegUses) {
    if (UniqRegs.count(Reg))
      continue;
    float PNotSel = 1;
    for (const LSRUse &LU : Uses) {
      if (!LU.Regs.count(Reg))
        continue;
      float P = LU.getNotSelectedProbability(Reg);
      if (P != 0.0)
        PNotSel *= P;
      else
        // P == 0 means every formula of this use references Reg, so Reg is
        // guaranteed to appear in the final solution; treat it as free.
        UniqRegs.insert(Reg);
    }
    RegNumMap.insert(std::make_pair(Reg, PNotSel));
  }

  LLVM_DEBUG(
      dbgs() << "Narrowing the search space by deleting costly formulas\n");

  // Delete formulas where registers number expectation is high.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    // If nothing to delete - continue.
    if (LU.Formulae.size() < 2)
      continue;
    // This is temporary solution to test performance. Float should be
    // replaced with round independent type (based on integers) to avoid
    // different results for different target builds.
    float FMinRegNum = LU.Formulae[0].getNumRegs();
    float FMinARegNum = LU.Formulae[0].getNumRegs();
    size_t MinIdx = 0;
    for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
      Formula &F = LU.Formulae[i];
      // FRegNum accumulates the expected register count of F; FARegNum counts
      // only addrec registers and is used as a tie-breaker below.
      float FRegNum = 0;
      float FARegNum = 0;
      for (const SCEV *BaseReg : F.BaseRegs) {
        if (UniqRegs.count(BaseReg))
          continue;
        // Divide out this use's own not-selected factor from the global
        // product (see the function comment for the derivation).
        FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
        if (isa<SCEVAddRecExpr>(BaseReg))
          FARegNum +=
              RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
      }
      // The scaled register, if any, contributes the same way.
      if (const SCEV *ScaledReg = F.ScaledReg) {
        if (!UniqRegs.count(ScaledReg)) {
          FRegNum +=
              RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
          if (isa<SCEVAddRecExpr>(ScaledReg))
            FARegNum +=
                RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
        }
      }
      // Track the formula with the lowest expectation; ties are broken by the
      // lower expected addrec register count.
      if (FMinRegNum > FRegNum ||
          (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
        FMinRegNum = FRegNum;
        FMinARegNum = FARegNum;
        MinIdx = i;
      }
    }
    LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
               dbgs() << " with min reg num " << FMinRegNum << '\n');
    // Keep only the cheapest formula: move it to slot 0 and drop the rest.
    if (MinIdx != 0)
      std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
    while (LU.Formulae.size() != 1) {
      LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
                 dbgs() << '\n');
      LU.Formulae.pop_back();
    }
    LU.RecomputeRegs(LUIdx, RegUses);
    assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
    Formula &F = LU.Formulae[0];
    LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
    // When we choose the formula, the regs become unique.
    UniqRegs.insert_range(F.BaseRegs);
    if (F.ScaledReg)
      UniqRegs.insert(F.ScaledReg);
  }
  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5307
// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
// whether the addressing offset +C would be legal where the negative offset
// -C is not.
5312 ScalarEvolution &SE, const SCEV *Best,
5313 const SCEV *Reg,
5314 MemAccessTy AccessType) {
5315 if (Best->getType() != Reg->getType() ||
5317 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5318 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5319 return false;
5320 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5321 if (!Diff)
5322 return false;
5323
5324 return TTI.isLegalAddressingMode(
5325 AccessType.MemTy, /*BaseGV=*/nullptr,
5326 /*BaseOffset=*/Diff->getSExtValue(),
5327 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5328 !TTI.isLegalAddressingMode(
5329 AccessType.MemTy, /*BaseGV=*/nullptr,
5330 /*BaseOffset=*/-Diff->getSExtValue(),
5331 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5332}
5333
5334/// Pick a register which seems likely to be profitable, and then in any use
5335/// which has any reference to that register, delete all formulae which do not
5336/// reference that register.
5337void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5338 // With all other options exhausted, loop until the system is simple
5339 // enough to handle.
5340 SmallPtrSet<const SCEV *, 4> Taken;
5341 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5342 // Ok, we have too many of formulae on our hands to conveniently handle.
5343 // Use a rough heuristic to thin out the list.
5344 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5345
5346 // Pick the register which is used by the most LSRUses, which is likely
5347 // to be a good reuse register candidate.
5348 const SCEV *Best = nullptr;
5349 unsigned BestNum = 0;
5350 for (const SCEV *Reg : RegUses) {
5351 if (Taken.count(Reg))
5352 continue;
5353 if (!Best) {
5354 Best = Reg;
5355 BestNum = RegUses.getUsedByIndices(Reg).count();
5356 } else {
5357 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5358 if (Count > BestNum) {
5359 Best = Reg;
5360 BestNum = Count;
5361 }
5362
5363 // If the scores are the same, but the Reg is simpler for the target
5364 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5365 // handle +C but not -C), opt for the simpler formula.
5366 if (Count == BestNum) {
5367 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5368 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5370 Uses[LUIdx].AccessTy)) {
5371 Best = Reg;
5372 BestNum = Count;
5373 }
5374 }
5375 }
5376 }
5377 assert(Best && "Failed to find best LSRUse candidate");
5378
5379 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5380 << " will yield profitable reuse.\n");
5381 Taken.insert(Best);
5382
5383 // In any use with formulae which references this register, delete formulae
5384 // which don't reference it.
5385 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5386 LSRUse &LU = Uses[LUIdx];
5387 if (!LU.Regs.count(Best)) continue;
5388
5389 bool Any = false;
5390 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5391 Formula &F = LU.Formulae[i];
5392 if (!F.referencesReg(Best)) {
5393 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5394 LU.DeleteFormula(F);
5395 --e;
5396 --i;
5397 Any = true;
5398 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5399 continue;
5400 }
5401 }
5402
5403 if (Any)
5404 LU.RecomputeRegs(LUIdx, RegUses);
5405 }
5406
5407 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5408 }
5409}
5410
5411/// If there are an extraordinary number of formulae to choose from, use some
5412/// rough heuristics to prune down the number of formulae. This keeps the main
5413/// solver from taking an extraordinary amount of time in some worst-case
5414/// scenarios.
5415void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5416 NarrowSearchSpaceByDetectingSupersets();
5417 NarrowSearchSpaceByCollapsingUnrolledCode();
5418 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5420 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5421 NarrowSearchSpaceByFilterPostInc();
5422 if (LSRExpNarrow)
5423 NarrowSearchSpaceByDeletingCostlyFormulas();
5424 else
5425 NarrowSearchSpaceByPickingWinnerRegs();
5426}
5427
5428/// This is the recursive solver.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost, and then compare. compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  // Each recursion level picks a formula for one use; Workspace records the
  // choices made so far, so the next use to solve is at index Workspace.size().
  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (const SCEV *S : CurRegs)
    if (LU.Regs.count(S))
      ReqRegs.insert(S);

  // Scratch cost and register set, reused across candidate formulae.
  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost(L, SE, TTI, AMK);
  for (const Formula &F : LU.Formulae) {
    // Ignore formulae which may not be ideal in terms of register reuse of
    // ReqRegs. The formula should use all required registers before
    // introducing new ones.
    // This can sometimes (notably when trying to favour postinc) lead to
    // sub-optimal decisions. There it is best left to the cost modelling to
    // get correct.
    if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
      for (const SCEV *Reg : ReqRegs) {
        if ((F.ScaledReg && F.ScaledReg == Reg) ||
            is_contained(F.BaseRegs, Reg)) {
          --NumReqRegsToFind;
          if (NumReqRegsToFind == 0)
            break;
        }
      }
      if (NumReqRegsToFind != 0) {
        // If none of the formulae satisfied the required registers, then we could
        // clear ReqRegs and try again. Currently, we simply give up in this case.
        continue;
      }
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
    if (NewCost.isLess(SolutionCost)) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        // More uses remain; recurse to choose formulae for them.
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        // After exploring the subtree rooted at a single-register formula for
        // the first use, record that register in VisitedRegs.
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        // A formula has been chosen for every use; this is a complete, and
        // strictly cheaper, solution.
        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
                   dbgs() << ".\nRegs:\n";
                   for (const SCEV *S : NewRegs) dbgs()
                      << "- " << *S << "\n";
                   dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  }
}
5508
5509/// Choose one formula from each use. Return the results in the given Solution
5510/// vector.
5511void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5513 Cost SolutionCost(L, SE, TTI, AMK);
5514 SolutionCost.Lose();
5515 Cost CurCost(L, SE, TTI, AMK);
5516 SmallPtrSet<const SCEV *, 16> CurRegs;
5517 DenseSet<const SCEV *> VisitedRegs;
5518 Workspace.reserve(Uses.size());
5519
5520 // SolveRecurse does all the work.
5521 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5522 CurRegs, VisitedRegs);
5523 if (Solution.empty()) {
5524 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5525 return;
5526 }
5527
5528 // Ok, we've now made all our decisions.
5529 LLVM_DEBUG(dbgs() << "\n"
5530 "The chosen solution requires ";
5531 SolutionCost.print(dbgs()); dbgs() << ":\n";
5532 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5533 dbgs() << " ";
5534 Uses[i].print(dbgs());
5535 dbgs() << "\n"
5536 " ";
5537 Solution[i]->print(dbgs());
5538 dbgs() << '\n';
5539 });
5540
5541 assert(Solution.size() == Uses.size() && "Malformed solution!");
5542
5543 const bool EnableDropUnprofitableSolution = [&] {
5545 case cl::BOU_TRUE:
5546 return true;
5547 case cl::BOU_FALSE:
5548 return false;
5549 case cl::BOU_UNSET:
5551 }
5552 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5553 }();
5554
5555 if (BaselineCost.isLess(SolutionCost)) {
5556 if (!EnableDropUnprofitableSolution)
5557 LLVM_DEBUG(
5558 dbgs() << "Baseline is more profitable than chosen solution, "
5559 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5560 else {
5561 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5562 "solution, dropping LSR solution.\n";);
5563 Solution.clear();
5564 }
5565 }
5566}
5567
5568/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
5569/// we can go while still being dominated by the input positions. This helps
5570/// canonicalize the insert position, which encourages sharing.
5572LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5573 const SmallVectorImpl<Instruction *> &Inputs)
5574 const {
5575 Instruction *Tentative = &*IP;
5576 while (true) {
5577 bool AllDominate = true;
5578 Instruction *BetterPos = nullptr;
5579 // Don't bother attempting to insert before a catchswitch, their basic block
5580 // cannot have other non-PHI instructions.
5581 if (isa<CatchSwitchInst>(Tentative))
5582 return IP;
5583
5584 for (Instruction *Inst : Inputs) {
5585 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5586 AllDominate = false;
5587 break;
5588 }
5589 // Attempt to find an insert position in the middle of the block,
5590 // instead of at the end, so that it can be used for other expansions.
5591 if (Tentative->getParent() == Inst->getParent() &&
5592 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5593 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5594 }
5595 if (!AllDominate)
5596 break;
5597 if (BetterPos)
5598 IP = BetterPos->getIterator();
5599 else
5600 IP = Tentative->getIterator();
5601
5602 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5603 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5604
5605 BasicBlock *IDom;
5606 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5607 if (!Rung) return IP;
5608 Rung = Rung->getIDom();
5609 if (!Rung) return IP;
5610 IDom = Rung->getBlock();
5611
5612 // Don't climb into a loop though.
5613 const Loop *IDomLoop = LI.getLoopFor(IDom);
5614 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5615 if (IDomDepth <= IPLoopDepth &&
5616 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5617 break;
5618 }
5619
5620 Tentative = IDom->getTerminator();
5621 }
5622
5623 return IP;
5624}
5625
5626/// Determine an input position which will be dominated by the operands and
5627/// which will dominate the result.
BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
    BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
  // Collect some instructions which must be dominated by the
  // expanding replacement. These must be dominated by any operands that
  // will be required in the expansion.
  SmallVector<Instruction *, 4> Inputs;
  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
    Inputs.push_back(I);
  // For ICmpZero, the other icmp operand is also consumed by the expansion.
  if (LU.Kind == LSRUse::ICmpZero)
    if (Instruction *I =
          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
      Inputs.push_back(I);
  if (LF.PostIncLoops.count(L)) {
    if (LF.isUseFullyOutsideLoop(L))
      Inputs.push_back(L->getLoopLatch()->getTerminator());
    else
      Inputs.push_back(IVIncInsertPos);
  }
  // The expansion must also be dominated by the increment positions of any
  // loops for which it is using post-inc mode.
  for (const Loop *PIL : LF.PostIncLoops) {
    if (PIL == L) continue;

    // Be dominated by the loop exit.
    SmallVector<BasicBlock *, 4> ExitingBlocks;
    PIL->getExitingBlocks(ExitingBlocks);
    if (!ExitingBlocks.empty()) {
      // Fold all exiting blocks down to their nearest common dominator.
      BasicBlock *BB = ExitingBlocks[0];
      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
      Inputs.push_back(BB->getTerminator());
    }
  }

  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
         "Insertion point must be a normal instruction");

  // Then, climb up the immediate dominator tree as far as we can go while
  // still being dominated by the input positions.
  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);

  // Don't insert instructions before PHI nodes.
  while (isa<PHINode>(IP)) ++IP;

  // Ignore landingpad instructions.
  while (IP->isEHPad()) ++IP;

  // Set IP below instructions recently inserted by SCEVExpander. This keeps the
  // IP consistent across expansions and allows the previously inserted
  // instructions to be reused by subsequent expansion.
  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
    ++IP;

  return IP;
}
5683
5684/// Emit instructions for the leading candidate expression for this LSRUse (this
5685/// is called "expanding").
5686Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5687 const Formula &F, BasicBlock::iterator IP,
5688 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5689 if (LU.RigidFormula)
5690 return LF.OperandValToReplace;
5691
5692 // Determine an input position which will be dominated by the operands and
5693 // which will dominate the result.
5694 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5695 Rewriter.setInsertPoint(&*IP);
5696
5697 // Inform the Rewriter if we have a post-increment use, so that it can
5698 // perform an advantageous expansion.
5699 Rewriter.setPostInc(LF.PostIncLoops);
5700
5701 // This is the type that the user actually needs.
5702 Type *OpTy = LF.OperandValToReplace->getType();
5703 // This will be the type that we'll initially expand to.
5704 Type *Ty = F.getType();
5705 if (!Ty)
5706 // No type known; just expand directly to the ultimate type.
5707 Ty = OpTy;
5708 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5709 // Expand directly to the ultimate type if it's the right size.
5710 Ty = OpTy;
5711 // This is the type to do integer arithmetic in.
5712 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5713
5714 // Build up a list of operands to add together to form the full base.
5716
5717 // Expand the BaseRegs portion.
5718 for (const SCEV *Reg : F.BaseRegs) {
5719 assert(!Reg->isZero() && "Zero allocated in a base register!");
5720
5721 // If we're expanding for a post-inc user, make the post-inc adjustment.
5722 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5723 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5724 }
5725
5726 // Expand the ScaledReg portion.
5727 Value *ICmpScaledV = nullptr;
5728 if (F.Scale != 0) {
5729 const SCEV *ScaledS = F.ScaledReg;
5730
5731 // If we're expanding for a post-inc user, make the post-inc adjustment.
5732 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5733 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5734
5735 if (LU.Kind == LSRUse::ICmpZero) {
5736 // Expand ScaleReg as if it was part of the base regs.
5737 if (F.Scale == 1)
5738 Ops.push_back(
5739 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5740 else {
5741 // An interesting way of "folding" with an icmp is to use a negated
5742 // scale, which we'll implement by inserting it into the other operand
5743 // of the icmp.
5744 assert(F.Scale == -1 &&
5745 "The only scale supported by ICmpZero uses is -1!");
5746 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5747 }
5748 } else {
5749 // Otherwise just expand the scaled register and an explicit scale,
5750 // which is expected to be matched as part of the address.
5751
5752 // Flush the operand list to suppress SCEVExpander hoisting address modes.
5753 // Unless the addressing mode will not be folded.
5754 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5755 isAMCompletelyFolded(TTI, LU, F)) {
5756 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5757 Ops.clear();
5758 Ops.push_back(SE.getUnknown(FullV));
5759 }
5760 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5761 if (F.Scale != 1)
5762 ScaledS =
5763 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5764 Ops.push_back(ScaledS);
5765 }
5766 }
5767
5768 // Expand the GV portion.
5769 if (F.BaseGV) {
5770 // Flush the operand list to suppress SCEVExpander hoisting.
5771 if (!Ops.empty()) {
5772 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5773 Ops.clear();
5774 Ops.push_back(SE.getUnknown(FullV));
5775 }
5776 Ops.push_back(SE.getUnknown(F.BaseGV));
5777 }
5778
5779 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5780 // unfolded offsets. LSR assumes they both live next to their uses.
5781 if (!Ops.empty()) {
5782 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5783 Ops.clear();
5784 Ops.push_back(SE.getUnknown(FullV));
5785 }
5786
5787 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5788 // out at this point, or should we generate a SCEV adding together mixed
5789 // offsets?
5790 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5791 "Expanding mismatched offsets\n");
5792 // Expand the immediate portion.
5793 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5794 if (Offset.isNonZero()) {
5795 if (LU.Kind == LSRUse::ICmpZero) {
5796 // The other interesting way of "folding" with an ICmpZero is to use a
5797 // negated immediate.
5798 if (!ICmpScaledV) {
5799 // TODO: Avoid implicit trunc?
5800 // See https://github.com/llvm/llvm-project/issues/112510.
5801 ICmpScaledV = ConstantInt::getSigned(
5802 IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
5803 } else {
5804 Ops.push_back(SE.getUnknown(ICmpScaledV));
5805 ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
5806 /*ImplicitTrunc=*/true);
5807 }
5808 } else {
5809 // Just add the immediate values. These again are expected to be matched
5810 // as part of the address.
5811 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5812 }
5813 }
5814
5815 // Expand the unfolded offset portion.
5816 Immediate UnfoldedOffset = F.UnfoldedOffset;
5817 if (UnfoldedOffset.isNonZero()) {
5818 // Just add the immediate values.
5819 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5820 }
5821
5822 // Emit instructions summing all the operands.
5823 const SCEV *FullS = Ops.empty() ?
5824 SE.getConstant(IntTy, 0) :
5825 SE.getAddExpr(Ops);
5826 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5827
5828 // We're done expanding now, so reset the rewriter.
5829 Rewriter.clearPostInc();
5830
5831 // An ICmpZero Formula represents an ICmp which we're handling as a
5832 // comparison against zero. Now that we've expanded an expression for that
5833 // form, update the ICmp's other operand.
5834 if (LU.Kind == LSRUse::ICmpZero) {
5835 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5836 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5837 DeadInsts.emplace_back(OperandIsInstr);
5838 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5839 "a scale at the same time!");
5840 if (F.Scale == -1) {
5841 if (ICmpScaledV->getType() != OpTy) {
5843 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5844 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5845 ICmpScaledV = Cast;
5846 }
5847 CI->setOperand(1, ICmpScaledV);
5848 } else {
5849 // A scale of 1 means that the scale has been expanded as part of the
5850 // base regs.
5851 assert((F.Scale == 0 || F.Scale == 1) &&
5852 "ICmp does not support folding a global value and "
5853 "a scale at the same time!");
5854 // TODO: Avoid implicit trunc?
5855 // See https://github.com/llvm/llvm-project/issues/112510.
5857 -(uint64_t)Offset.getFixedValue(),
5858 /*ImplicitTrunc=*/true);
5859 if (C->getType() != OpTy) {
5861 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5862 CI->getDataLayout());
5863 assert(C && "Cast of ConstantInt should have folded");
5864 }
5865
5866 CI->setOperand(1, C);
5867 }
5868 }
5869
5870 return FullV;
5871}
5872
5873/// Helper for Rewrite. PHI nodes are special because the use of their operands
5874/// effectively happens in their predecessor blocks, so the expression may need
5875/// to be expanded in multiple places.
5876void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5877 const LSRFixup &LF, const Formula &F,
5878 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5879 DenseMap<BasicBlock *, Value *> Inserted;
5880
5881 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5882 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5883 bool needUpdateFixups = false;
5884 BasicBlock *BB = PN->getIncomingBlock(i);
5885
5886 // If this is a critical edge, split the edge so that we do not insert
5887 // the code on all predecessor/successor paths. We do this unless this
5888 // is the canonical backedge for this loop, which complicates post-inc
5889 // users.
5890 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5893 BasicBlock *Parent = PN->getParent();
5894 Loop *PNLoop = LI.getLoopFor(Parent);
5895 if (!PNLoop || Parent != PNLoop->getHeader()) {
5896 // Split the critical edge.
5897 BasicBlock *NewBB = nullptr;
5898 if (!Parent->isLandingPad()) {
5899 NewBB =
5900 SplitCriticalEdge(BB, Parent,
5901 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5902 .setMergeIdenticalEdges()
5903 .setKeepOneInputPHIs());
5904 } else {
5906 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5907 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5908 NewBB = NewBBs[0];
5909 }
5910 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5911 // phi predecessors are identical. The simple thing to do is skip
5912 // splitting in this case rather than complicate the API.
5913 if (NewBB) {
5914 // If PN is outside of the loop and BB is in the loop, we want to
5915 // move the block to be immediately before the PHI block, not
5916 // immediately after BB.
5917 if (L->contains(BB) && !L->contains(PN))
5918 NewBB->moveBefore(PN->getParent());
5919
5920 // Splitting the edge can reduce the number of PHI entries we have.
5921 e = PN->getNumIncomingValues();
5922 BB = NewBB;
5923 i = PN->getBasicBlockIndex(BB);
5924
5925 needUpdateFixups = true;
5926 }
5927 }
5928 }
5929
5930 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5931 Inserted.try_emplace(BB);
5932 if (!Pair.second)
5933 PN->setIncomingValue(i, Pair.first->second);
5934 else {
5935 Value *FullV =
5936 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5937
5938 // If this is reuse-by-noop-cast, insert the noop cast.
5939 Type *OpTy = LF.OperandValToReplace->getType();
5940 if (FullV->getType() != OpTy)
5941 FullV = CastInst::Create(
5942 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5943 LF.OperandValToReplace->getType(), "tmp",
5944 BB->getTerminator()->getIterator());
5945
5946 // If the incoming block for this value is not in the loop, it means the
5947 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5948 // the inserted value.
5949 if (auto *I = dyn_cast<Instruction>(FullV))
5950 if (L->contains(I) && !L->contains(BB))
5951 InsertedNonLCSSAInsts.insert(I);
5952
5953 PN->setIncomingValue(i, FullV);
5954 Pair.first->second = FullV;
5955 }
5956
5957 // If LSR splits critical edge and phi node has other pending
5958 // fixup operands, we need to update those pending fixups. Otherwise
5959 // formulae will not be implemented completely and some instructions
5960 // will not be eliminated.
5961 if (needUpdateFixups) {
5962 for (LSRUse &LU : Uses)
5963 for (LSRFixup &Fixup : LU.Fixups)
5964 // If fixup is supposed to rewrite some operand in the phi
5965 // that was just updated, it may be already moved to
5966 // another phi node. Such fixup requires update.
5967 if (Fixup.UserInst == PN) {
5968 // Check if the operand we try to replace still exists in the
5969 // original phi.
5970 bool foundInOriginalPHI = false;
5971 for (const auto &val : PN->incoming_values())
5972 if (val == Fixup.OperandValToReplace) {
5973 foundInOriginalPHI = true;
5974 break;
5975 }
5976
5977 // If fixup operand found in original PHI - nothing to do.
5978 if (foundInOriginalPHI)
5979 continue;
5980
5981 // Otherwise it might be moved to another PHI and requires update.
5982 // If fixup operand not found in any of the incoming blocks that
5983 // means we have already rewritten it - nothing to do.
5984 for (const auto &Block : PN->blocks())
5985 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5986 ++I) {
5987 PHINode *NewPN = cast<PHINode>(I);
5988 for (const auto &val : NewPN->incoming_values())
5989 if (val == Fixup.OperandValToReplace)
5990 Fixup.UserInst = NewPN;
5991 }
5992 }
5993 }
5994 }
5995}
5996
5997/// Emit instructions for the leading candidate expression for this LSRUse (this
5998/// is called "expanding"), and update the UserInst to reference the newly
5999/// expanded value.
6000void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6001 const Formula &F,
6002 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6003 // First, find an insertion point that dominates UserInst. For PHI nodes,
6004 // find the nearest block which dominates all the relevant uses.
6005 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6006 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6007 } else {
6008 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6009
6010 // If this is reuse-by-noop-cast, insert the noop cast.
6011 Type *OpTy = LF.OperandValToReplace->getType();
6012 if (FullV->getType() != OpTy) {
6013 Instruction *Cast =
6014 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6015 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6016 FullV = Cast;
6017 }
6018
6019 // Update the user. ICmpZero is handled specially here (for now) because
6020 // Expand may have updated one of the operands of the icmp already, and
6021 // its new value may happen to be equal to LF.OperandValToReplace, in
6022 // which case doing replaceUsesOfWith leads to replacing both operands
6023 // with the same value. TODO: Reorganize this.
6024 if (LU.Kind == LSRUse::ICmpZero)
6025 LF.UserInst->setOperand(0, FullV);
6026 else
6027 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6028 }
6029
6030 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6031 DeadInsts.emplace_back(OperandIsInstr);
6032}
6033
// Determine where to insert the transformed IV increment instruction for this
// fixup. By default this is the default insert position, but if this is a
// postincrement opportunity then we try to insert it in the same block as the
// fixup user instruction, as this is needed for a postincrement instruction to
// be generated.
                                 const LSRFixup &Fixup, const LSRUse &LU,
                                 Instruction *IVIncInsertPos,
                                 DominatorTree &DT) {
  // Only address uses can be postincremented.
  if (LU.Kind != LSRUse::Address)
    return IVIncInsertPos;

  // Don't try to postincrement unless the target reports a post-indexed
  // load/store as legal for this user's type.
  Instruction *I = Fixup.UserInst;
  Type *Ty = I->getType();
  if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
      !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
    return IVIncInsertPos;

  // It's only legal to hoist to the user block if it dominates the default
  // insert position.
  BasicBlock *HoistBlock = I->getParent();
  BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
  if (!DT.dominates(I, IVIncBlock))
    return IVIncInsertPos;

  return HoistBlock->getTerminator();
}
6063
/// Rewrite all the fixup locations with new values, following the chosen
/// solution.
void LSRInstance::ImplementSolution(
    const SmallVectorImpl<const Formula *> &Solution) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.

  // Mark phi nodes that terminate chains so the expander tries to reuse them.
  for (const IVChain &Chain : IVChainVec) {
    if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
      Rewriter.setChainedPhi(PN);
  }

  // Expand the new value definitions and update the users.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
    for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
      // Each fixup may be rewritten at a different IV-increment insert
      // position (e.g. to enable post-increment addressing).
      Instruction *InsertPos =
          getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
      Rewriter.setIVIncInsertPos(L, InsertPos);
      Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
      Changed = true;
    }

  // Restore LCSSA form for any values the expander inserted outside the loop.
  auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
  formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);

  for (const IVChain &Chain : IVChainVec) {
    GenerateIVChain(Chain, DeadInsts);
    Changed = true;
  }

  // Remember the expander-inserted IVs that are still attached to a block.
  for (const WeakVH &IV : Rewriter.getInsertedIVs())
    if (IV && dyn_cast<Instruction>(&*IV)->getParent())
      ScalarEvolutionIVs.push_back(IV);

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

      &TLI, MSSAU);

  // In our cost analysis above, we assume that each addrec consumes exactly
  // one register, and arrange to have increments inserted just before the
  // latch to maximize the chance this is true. However, if we reused
  // existing IVs, we now need to move the increments to match our
  // expectations. Otherwise, our cost modeling results in us having a
  // chosen a non-optimal result for the actual schedule. (And yes, this
  // scheduling decision does impact later codegen.)
  for (PHINode &PN : L->getHeader()->phis()) {
    BinaryOperator *BO = nullptr;
    Value *Start = nullptr, *Step = nullptr;
    if (!matchSimpleRecurrence(&PN, BO, Start, Step))
      continue;

    switch (BO->getOpcode()) {
    case Instruction::Sub:
      if (BO->getOperand(0) != &PN)
        // sub is non-commutative - match handling elsewhere in LSR
        continue;
      break;
    case Instruction::Add:
      break;
    default:
      continue;
    };

    if (!isa<Constant>(Step))
      // If not a constant step, might increase register pressure
      // (We assume constants have been canonicalized to RHS)
      continue;

    if (BO->getParent() == IVIncInsertPos->getParent())
      // Only bother moving across blocks. Isel can handle block local case.
      continue;

    // Can we legally schedule inc at the desired point?
    if (!llvm::all_of(BO->uses(),
                      [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
      continue;
    BO->moveBefore(IVIncInsertPos->getIterator());
    Changed = true;
  }


}
6151
// Top-level driver: analyzes the loop, collects uses and candidate formulae,
// solves for the cheapest assignment, and implements the solution. Bails out
// early when the loop is not in a workable form or the problem is too large.
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                         DominatorTree &DT, LoopInfo &LI,
                         const TargetTransformInfo &TTI, AssumptionCache &AC,
                         TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
      MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
                            : TTI.getPreferredAddressingMode(L, &SE)),
      Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm())
    return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  // If there's too much analysis to be done, bail early. We won't be able to
  // model the problem anyway.
  unsigned NumUsers = 0;
  for (const IVStrideUse &U : IU) {
    if (++NumUsers > MaxIVUsers) {
      (void)U;
      LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
                        << "\n");
      return;
    }
    // Bail out if we have a PHI on an EHPad that gets a value from a
    // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
    // no good place to stick any instructions.
    if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
      auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
      if (isa<FuncletPadInst>(FirstNonPHI) ||
          isa<CatchSwitchInst>(FirstNonPHI))
        for (BasicBlock *PredBB : PN->blocks())
          if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
            return;
    }
  }

  LLVM_DEBUG(dbgs() << "\nLSR on loop ";
             L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
             dbgs() << ":\n");

  // Check if we expect this loop to use a hardware loop instruction, which will
  // be used when calculating the costs of formulas.
  HardwareLoopInfo HWLoopInfo(L);
  HardwareLoopProfitable =
      TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);

  // Configure SCEVExpander already now, so the correct mode is used for
  // isSafeToExpand() checks.
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
  Rewriter.setDebugType(DEBUG_TYPE);
#endif
  Rewriter.disableCanonicalMode();
  Rewriter.enableLSRMode();

  // First, perform some low-level loop optimizations.
  OptimizeShadowIV();
  OptimizeLoopTermCond();

  // If loop preparation eliminates all interesting IV users, bail.
  if (IU.empty()) return;

  // Skip nested loops until we can model them better with formulae.
  if (!L->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
    return;
  }

  // Start collecting data and preparing for the solver.
  // If number of registers is not the major cost, we cannot benefit from the
  // current profitable chain optimization which is based on number of
  // registers.
  // FIXME: add profitable chain optimization for other kinds major cost, for
  // example number of instructions.
  if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
    CollectChains();
  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  if (Uses.empty())
    return;

  LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
             print_uses(dbgs()));
  LLVM_DEBUG(dbgs() << "The baseline solution requires ";
             BaselineCost.print(dbgs()); dbgs() << "\n");

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  Solve(Solution);

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

  if (Solution.empty())
    return;

#ifndef NDEBUG
  // Formulae should be legal.
  for (const LSRUse &LU : Uses) {
    for (const Formula &F : LU.Formulae)
      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                        F) && "Illegal formula generated!");
  };
#endif

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution);
}
6272
6273#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6274void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6275 if (Factors.empty() && Types.empty()) return;
6276
6277 OS << "LSR has identified the following interesting factors and types: ";
6278 ListSeparator LS;
6279
6280 for (int64_t Factor : Factors)
6281 OS << LS << '*' << Factor;
6282
6283 for (Type *Ty : Types)
6284 OS << LS << '(' << *Ty << ')';
6285 OS << '\n';
6286}
6287
6288void LSRInstance::print_fixups(raw_ostream &OS) const {
6289 OS << "LSR is examining the following fixup sites:\n";
6290 for (const LSRUse &LU : Uses)
6291 for (const LSRFixup &LF : LU.Fixups) {
6292 dbgs() << " ";
6293 LF.print(OS);
6294 OS << '\n';
6295 }
6296}
6297
6298void LSRInstance::print_uses(raw_ostream &OS) const {
6299 OS << "LSR is examining the following uses:\n";
6300 for (const LSRUse &LU : Uses) {
6301 dbgs() << " ";
6302 LU.print(OS);
6303 OS << '\n';
6304 for (const Formula &F : LU.Formulae) {
6305 OS << " ";
6306 F.print(OS);
6307 OS << '\n';
6308 }
6309 }
6310}
6311
/// Print the full state of this LSRInstance: interesting factors and types,
/// fixup sites, and uses with their candidate formulae.
void LSRInstance::print(raw_ostream &OS) const {
  print_factors_and_types(OS);
  print_fixups(OS);
  print_uses(OS);
}
6317
/// Dump this LSRInstance's state to stderr; intended for use from a debugger.
LLVM_DUMP_METHOD void LSRInstance::dump() const {
  print(errs()); errs() << '\n';
}
6321#endif
6322
namespace {

/// Legacy pass-manager wrapper that runs Loop Strength Reduction on each loop.
class LoopStrengthReduce : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid

  LoopStrengthReduce();

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;
};

} // end anonymous namespace
6337
// Legacy pass constructor; delegates pass identification to LoopPass(ID).
LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
}
6341
// Declare the analyses this pass needs and those it keeps up to date.
void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG. However, we do update
  // many analyses if they are around.

  AU.addRequired<LoopInfoWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addPreserved<ScalarEvolutionWrapperPass>();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetLibraryInfoWrapperPass>();
  // Requiring LoopSimplify a second time here prevents IVUsers from running
  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
  AU.addRequired<IVUsersWrapperPass>();
  AU.addPreserved<IVUsersWrapperPass>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  AU.addPreserved<MemorySSAWrapperPass>();
}
6364
6365namespace {
6366
/// Enables more convenient iteration over a DWARF expression vector.
/// Wraps the raw uint64_t buffer in DIExpression expr_op iterators.
ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
  llvm::DIExpression::expr_op_iterator Begin =
      llvm::DIExpression::expr_op_iterator(Expr.begin());
  llvm::DIExpression::expr_op_iterator End =
      llvm::DIExpression::expr_op_iterator(Expr.end());
  return {Begin, End};
}
6376
/// Incrementally builds a DWARF expression (plus its referenced location
/// operands) that recomputes a value from its SCEV, for dbg.value salvaging.
struct SCEVDbgValueBuilder {
  SCEVDbgValueBuilder() = default;
  SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }

  /// Copy another builder's state (expression and location ops) into this one.
  void clone(const SCEVDbgValueBuilder &Base) {
    LocationOps = Base.LocationOps;
    Expr = Base.Expr;
  }

  void clear() {
    LocationOps.clear();
    Expr.clear();
  }

  /// The DIExpression as we translate the SCEV.
  // NOTE(review): the declaration of the `Expr` member appears to be missing
  // here in this copy of the file — confirm against the upstream source.
  /// The location ops of the DIExpression.
  SmallVector<Value *, 2> LocationOps;

  void pushOperator(uint64_t Op) { Expr.push_back(Op); }
  void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }

  /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
  /// in the set of values referenced by the expression.
  void pushLocation(llvm::Value *V) {
    // Reuse an existing arg index if V is already a referenced location.
    auto *It = llvm::find(LocationOps, V);
    unsigned ArgIndex = 0;
    if (It != LocationOps.end()) {
      ArgIndex = std::distance(LocationOps.begin(), It);
    } else {
      ArgIndex = LocationOps.size();
      LocationOps.push_back(V);
    }
    Expr.push_back(ArgIndex);
  }

  void pushValue(const SCEVUnknown *U) {
    llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
    pushLocation(V);
  }

  /// Emit a signed constant; fails (returns false) if it does not fit in
  /// 64 bits.
  bool pushConst(const SCEVConstant *C) {
    if (C->getAPInt().getSignificantBits() > 64)
      return false;
    Expr.push_back(llvm::dwarf::DW_OP_consts);
    Expr.push_back(C->getAPInt().getSExtValue());
    return true;
  }

  // Iterating the expression as DWARF ops is convenient when updating
  // DWARF_OP_LLVM_args.
    return ToDwarfOpIter(Expr);
  }

  /// Several SCEV types are sequences of the same arithmetic operator applied
  /// to constants and values that may be extended or truncated.
  bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
                          uint64_t DwarfOp) {
    assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
           "Expected arithmetic SCEV type");
    bool Success = true;
    unsigned EmitOperator = 0;
    for (const auto &Op : CommExpr->operands()) {
      Success &= pushSCEV(Op);

      // Emit the operator after every operand except the first (postfix/DWARF
      // stack style).
      if (EmitOperator >= 1)
        pushOperator(DwarfOp);
      ++EmitOperator;
    }
    return Success;
  }

  // TODO: Identify and omit noop casts.
  bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
    const llvm::SCEV *Inner = C->getOperand(0);
    const llvm::Type *Type = C->getType();
    uint64_t ToWidth = Type->getIntegerBitWidth();
    bool Success = pushSCEV(Inner);
    uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
                          IsSigned ? llvm::dwarf::DW_ATE_signed
                                   : llvm::dwarf::DW_ATE_unsigned};
    for (const auto &Op : CastOps)
      pushOperator(Op);
    return Success;
  }

  // TODO: MinMax - although these haven't been encountered in the test suite.
  /// Translate a SCEV tree into DWARF ops; returns false for unsupported
  /// SCEV kinds (including nested addrecs).
  bool pushSCEV(const llvm::SCEV *S) {
    bool Success = true;
    if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
      Success &= pushConst(StartInt);

    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
      if (!U->getValue())
        return false;
      pushLocation(U->getValue());

    } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
      Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);

    } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
      Success &= pushSCEV(UDiv->getLHS());
      Success &= pushSCEV(UDiv->getRHS());
      pushOperator(llvm::dwarf::DW_OP_div);

    } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
      // Assert if a new and unknown SCEVCastExpr type is encountered.
             isa<SCEVSignExtendExpr>(Cast)) &&
             "Unexpected cast type in SCEV.");
      Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));

    } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
      Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);

    } else if (isa<SCEVAddRecExpr>(S)) {
      // Nested SCEVAddRecExpr are generated by nested loops and are currently
      // unsupported.
      return false;

    } else {
      return false;
    }
    return Success;
  }

  /// Return true if the combination of arithmetic operator and underlying
  /// SCEV constant value is an identity function.
  bool isIdentityFunction(uint64_t Op, const SCEV *S) {
    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
      if (C->getAPInt().getSignificantBits() > 64)
        return false;
      int64_t I = C->getAPInt().getSExtValue();
      switch (Op) {
      case llvm::dwarf::DW_OP_plus:
      case llvm::dwarf::DW_OP_minus:
        // x + 0 and x - 0 are identities.
        return I == 0;
      case llvm::dwarf::DW_OP_mul:
      case llvm::dwarf::DW_OP_div:
        // x * 1 and x / 1 are identities.
        return I == 1;
      }
    }
    return false;
  }

  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
  /// builder's expression stack. The stack should already contain an
  /// expression for the iteration count, so that it can be multiplied by
  /// the stride and added to the start.
  /// Components of the expression are omitted if they are an identity function.
  /// Chain (non-affine) SCEVs are not supported.
  bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
    assert(SAR.isAffine() && "Expected affine SCEV");
    const SCEV *Start = SAR.getStart();
    const SCEV *Stride = SAR.getStepRecurrence(SE);

    // Skip pushing arithmetic noops.
    if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
      if (!pushSCEV(Stride))
        return false;
      pushOperator(llvm::dwarf::DW_OP_mul);
    }
    if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
      if (!pushSCEV(Start))
        return false;
      pushOperator(llvm::dwarf::DW_OP_plus);
    }
    return true;
  }

  /// Create an expression that is an offset from a value (usually the IV).
  void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
    pushLocation(OffsetValue);
    LLVM_DEBUG(
        dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
               << std::to_string(Offset) << "\n");
  }

  /// Combine a translation of the SCEV and the IV to create an expression that
  /// recovers a location's value.
  /// returns true if an expression was created.
  bool createIterCountExpr(const SCEV *S,
                           const SCEVDbgValueBuilder &IterationCount,
                           ScalarEvolution &SE) {
    // SCEVs for SSA values are most frequently of the form
    // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
    // This is because %a is a PHI node that is not the IV. However, these
    // SCEVs have not been observed to result in debuginfo-lossy optimisations,
    // so its not expected this point will be reached.
    if (!isa<SCEVAddRecExpr>(S))
      return false;

    LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
                      << '\n');

    const auto *Rec = cast<SCEVAddRecExpr>(S);
    if (!Rec->isAffine())
      return false;

      return false;

    // Initialise a new builder with the iteration count expression. In
    // combination with the value's SCEV this enables recovery.
    clone(IterationCount);
    if (!SCEVToValueExpr(*Rec, SE))
      return false;

    return true;
  }

  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
  /// builder's expression stack. The stack should already contain an
  /// expression for the iteration count, so that it can be multiplied by
  /// the stride and added to the start.
  /// Components of the expression are omitted if they are an identity function.
  bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
                           ScalarEvolution &SE) {
    assert(SAR.isAffine() && "Expected affine SCEV");
    const SCEV *Start = SAR.getStart();
    const SCEV *Stride = SAR.getStepRecurrence(SE);

    // Skip pushing arithmetic noops.
    if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
      if (!pushSCEV(Start))
        return false;
      pushOperator(llvm::dwarf::DW_OP_minus);
    }
    if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
      if (!pushSCEV(Stride))
        return false;
      pushOperator(llvm::dwarf::DW_OP_div);
    }
    return true;
  }

  // Append the current expression and locations to a location list and an
  // expression list. Modify the DW_OP_LLVM_arg indexes to account for
  // the locations already present in the destination list.
  void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
                       SmallVectorImpl<Value *> &DestLocations) {
    assert(!DestLocations.empty() &&
           "Expected the locations vector to contain the IV");
    // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
    // modified to account for the locations already in the destination vector.
    // All builders contain the IV as the first location op.
    assert(!LocationOps.empty() &&
           "Expected the location ops to contain the IV.");
    // DestIndexMap[n] contains the index in DestLocations for the nth
    // location in this SCEVDbgValueBuilder.
    SmallVector<uint64_t, 2> DestIndexMap;
    for (const auto &Op : LocationOps) {
      auto It = find(DestLocations, Op);
      if (It != DestLocations.end()) {
        // Location already exists in DestLocations, reuse existing ArgIndex.
        DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
        continue;
      }
      // Location is not in DestLocations, add it.
      DestIndexMap.push_back(DestLocations.size());
      DestLocations.push_back(Op);
    }

    for (const auto &Op : expr_ops()) {
      if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
        Op.appendToVector(DestExpr);
        continue;
      }

      // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
      // DestIndexMap[n] contains its new index in DestLocations.
      uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
      DestExpr.push_back(NewIndex);
    }
  }
};
6658
/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
/// and DIExpression.
struct DVIRecoveryRec {
  DVIRecoveryRec(DbgVariableRecord *DVR)
      : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}

  /// The debug record being salvaged.
  DbgVariableRecord *DbgRef;
  /// The pre-LSR DIExpression, cached at construction time.
  DIExpression *Expr;
  /// Whether the record originally used a DIArgList for its locations.
  bool HadLocationArgList;
  /// Weak handles to the pre-LSR location operands.
  SmallVector<WeakVH, 2> LocationOps;
  // NOTE(review): the declarations of the `SCEVs` and `RecoveryExprs` members
  // appear to be missing here in this copy of the file — confirm upstream.

  void clear() {
    for (auto &RE : RecoveryExprs)
      RE.reset();
    RecoveryExprs.clear();
  }

  ~DVIRecoveryRec() { clear(); }
};
6680} // namespace
6681
/// Returns the total number of DW_OP_llvm_arg operands in the expression.
/// This helps in determining if a DIArglist is necessary or can be omitted from
/// the dbg.value.
  auto expr_ops = ToDwarfOpIter(Expr);
  unsigned Count = 0;
  for (auto Op : expr_ops)
    if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
      Count++;
  return Count;
}
6693
/// Overwrites DVI with the location and Ops as the DIExpression. This will
/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
/// because a DIArglist is not created for the first argument of the dbg.value.
template <typename T>
static void updateDVIWithLocation(T &DbgVal, Value *Location,
  assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
                                    "contain any DW_OP_llvm_arg operands.");
  DbgVal.setRawLocation(ValueAsMetadata::get(Location));
  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
  // NOTE(review): the call below repeats the setExpression call above with
  // identical arguments and looks redundant — confirm and remove.
  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
}
6706
/// Overwrite DVI with locations placed into a DIArglist.
/// The expression must reference its locations via DW_OP_llvm_arg operands.
template <typename T>
static void updateDVIWithLocations(T &DbgVal,
                                   SmallVectorImpl<Value *> &Locations,
  assert(numLLVMArgOps(Ops) != 0 &&
         "Expected expression that references DIArglist locations using "
         "DW_OP_llvm_arg operands.");
  // Wrap every location value as metadata before building the DIArgList.
  for (Value *V : Locations)
    MetadataLocs.push_back(ValueAsMetadata::get(V));
  auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
  DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
}
6722
/// Write the new expression and new location ops for the dbg.value. If possible
/// reduce the size of the dbg.value by omitting DIArglist. This
/// can be omitted if:
/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
                           SmallVectorImpl<Value *> &NewLocationOps,
  DbgVariableRecord *DbgVal = DVIRec.DbgRef;
  unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
  if (NumLLVMArgs == 0) {
    // Location assumed to be on the stack.
    updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
  } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
    // There is only a single DW_OP_llvm_arg at the start of the expression,
    // so it can be omitted along with DIArglist.
    assert(NewExpr[1] == 0 &&
           "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
    updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
  } else {
    // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
    updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
  }

  // If the DIExpression was previously empty then add the stack terminator.
  // Non-empty expressions have only had elements inserted into them and so
  // the terminator should already be present e.g. stack_value or fragment.
  DIExpression *SalvageExpr = DbgVal->getExpression();
  if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
    SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
    DbgVal->setExpression(SalvageExpr);
  }
}
6757
/// Cached location ops may be erased during LSR, in which case a poison is
/// required when restoring from the cache. The type of that location is no
/// longer available, so just use int8. The poison will be replaced by one or
/// more locations later when a SCEVDbgValueBuilder selects alternative
/// locations to use for the salvage.
  return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
}
6766
/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
  DbgVariableRecord *DbgVal = DVIRec.DbgRef;
  LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
                    << "scev-salvage: post-LSR: " << *DbgVal << '\n');
  assert(DVIRec.Expr && "Expected an expression");
  DbgVal->setExpression(DVIRec.Expr);

  // Even a single location-op may be inside a DIArgList and referenced with
  // DW_OP_LLVM_arg, which is valid only with a DIArgList.
  if (!DVIRec.HadLocationArgList) {
    assert(DVIRec.LocationOps.size() == 1 &&
           "Unexpected number of location ops.");
    // LSR's unsuccessful salvage attempt may have added DIArgList, which in
    // this case was not present before, so force the location back to a
    // single uncontained Value.
    Value *CachedValue =
        getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
    DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
  } else {
    // Multiple location ops: rebuild the DIArgList from the cached values,
    // substituting poison for any that were erased.
    for (WeakVH VH : DVIRec.LocationOps) {
      Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
      MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
    }
    auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
    DbgVal->setRawLocation(
        llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
  }
  LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
}
6798
    llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
    const SCEV *SCEVInductionVar,
    SCEVDbgValueBuilder IterCountExpr) {

  // Only attempt to salvage records whose locations were killed.
  if (!DVIRec.DbgRef->isKillLocation())
    return false;

  // LSR may have caused several changes to the dbg.value in the failed salvage
  // attempt. So restore the DIExpression, the location ops and also the
  // location ops format, which is always DIArglist for multiple ops, but only
  // sometimes for a single op.

  // LocationOpIndexMap[i] will store the post-LSR location index of
  // the non-optimised out location at pre-LSR index i.
  SmallVector<int64_t, 2> LocationOpIndexMap;
  LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
  SmallVector<Value *, 2> NewLocationOps;
  NewLocationOps.push_back(LSRInductionVar);

  for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
    WeakVH VH = DVIRec.LocationOps[i];
    // Place the locations not optimised out in the list first, avoiding
    // inserts later. The map is used to update the DIExpression's
    // DW_OP_LLVM_arg arguments as the expression is updated.
    if (VH && !isa<UndefValue>(VH)) {
      NewLocationOps.push_back(VH);
      LocationOpIndexMap[i] = NewLocationOps.size() - 1;
      LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
                        << " now at index " << LocationOpIndexMap[i] << "\n");
      continue;
    }

    // It's possible that a value referred to in the SCEV may have been
    // optimised out by LSR.
    if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
        SE.containsUndefs(DVIRec.SCEVs[i])) {
      LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
                        << " refers to a location that is now undef or erased. "
                           "Salvage abandoned.\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
                      << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");

    DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
    SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();

    // Create an offset-based salvage expression if possible, as it requires
    // less DWARF ops than an iteration count-based expression.
    if (std::optional<APInt> Offset =
            SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
      if (Offset->getSignificantBits() <= 64)
        SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
      else
        return false;
    } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
                                                 SE))
      return false;
  }

  // Merge the DbgValueBuilder generated expressions and the original
  // DIExpression, place the result into an new vector.
  if (DVIRec.Expr->getNumElements() == 0) {
    assert(DVIRec.RecoveryExprs.size() == 1 &&
           "Expected only a single recovery expression for an empty "
           "DIExpression.");
    assert(DVIRec.RecoveryExprs[0] &&
           "Expected a SCEVDbgSalvageBuilder for location 0");
    SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
    B->appendToVectors(NewExpr, NewLocationOps);
  }
  for (const auto &Op : DVIRec.Expr->expr_ops()) {
    // Most Ops needn't be updated.
    if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
      Op.appendToVector(NewExpr);
      continue;
    }

    uint64_t LocationArgIndex = Op.getArg(0);
    SCEVDbgValueBuilder *DbgBuilder =
        DVIRec.RecoveryExprs[LocationArgIndex].get();
    // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
    // optimise it away. So just translate the argument to the updated
    // location index.
    if (!DbgBuilder) {
      NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
      assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
             "Expected a positive index for the location-op position.");
      NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
      continue;
    }
    // The location has a recovery expression.
    DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
  }

  UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
  LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
  return true;
}
6902
/// Obtain an expression for the iteration count, then attempt to salvage the
/// dbg.value intrinsics.
    llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
    SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
  if (DVIToUpdate.empty())
    return;

  const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
  assert(SCEVInductionVar &&
         "Anticipated a SCEV for the post-LSR induction variable");

  // Salvaging is only attempted when the IV's SCEV is an affine addrec.
  if (const SCEVAddRecExpr *IVAddRec =
          dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
    if (!IVAddRec->isAffine())
      return;

    // Prevent translation using excessive resources.
    if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
      return;

    // The iteration count is required to recover location values.
    SCEVDbgValueBuilder IterCountExpr;
    IterCountExpr.pushLocation(LSRInductionVar);
    if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
      return;

    LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
                      << '\n');

    for (auto &DVIRec : DVIToUpdate) {
      SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
                 IterCountExpr);
    }
  }
}
6939
6940/// Identify and cache salvageable DVI locations and expressions along with the
6941/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6942/// cacheing and salvaging.
6944 Loop *L, ScalarEvolution &SE,
6945 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6946 for (const auto &B : L->getBlocks()) {
6947 for (auto &I : *B) {
6948 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6949 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6950 continue;
6951
6952 // Ensure that if any location op is undef that the dbg.vlue is not
6953 // cached.
6954 if (DbgVal.isKillLocation())
6955 continue;
6956
6957 // Check that the location op SCEVs are suitable for translation to
6958 // DIExpression.
6959 const auto &HasTranslatableLocationOps =
6960 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6961 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6962 if (!LocOp)
6963 return false;
6964
6965 if (!SE.isSCEVable(LocOp->getType()))
6966 return false;
6967
6968 const SCEV *S = SE.getSCEV(LocOp);
6969 if (SE.containsUndefs(S))
6970 return false;
6971 }
6972 return true;
6973 };
6974
6975 if (!HasTranslatableLocationOps(DbgVal))
6976 continue;
6977
6978 std::unique_ptr<DVIRecoveryRec> NewRec =
6979 std::make_unique<DVIRecoveryRec>(&DbgVal);
6980 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6981 // it. Pre-allocating a vector will enable quick lookups of the builder
6982 // later during the salvage.
6983 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6984 for (const auto LocOp : DbgVal.location_ops()) {
6985 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6986 NewRec->LocationOps.push_back(LocOp);
6987 NewRec->HadLocationArgList = DbgVal.hasArgList();
6988 }
6989 SalvageableDVISCEVs.push_back(std::move(NewRec));
6990 }
6991 }
6992 }
6993}
6994
6995/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6996/// any PHi from the loop header is usable, but may have less chance of
6997/// surviving subsequent transforms.
6999 const LSRInstance &LSR) {
7000
7001 auto IsSuitableIV = [&](PHINode *P) {
7002 if (!SE.isSCEVable(P->getType()))
7003 return false;
7004 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7005 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7006 return false;
7007 };
7008
7009 // For now, just pick the first IV that was generated and inserted by
7010 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7011 // by subsequent transforms.
7012 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7013 if (!IV)
7014 continue;
7015
7016 // There should only be PHI node IVs.
7017 PHINode *P = cast<PHINode>(&*IV);
7018
7019 if (IsSuitableIV(P))
7020 return P;
7021 }
7022
7023 for (PHINode &P : L.getHeader()->phis()) {
7024 if (IsSuitableIV(&P))
7025 return &P;
7026 }
7027 return nullptr;
7028}
7029
7031 DominatorTree &DT, LoopInfo &LI,
7032 const TargetTransformInfo &TTI,
7034 MemorySSA *MSSA) {
7035
7036 // Debug preservation - before we start removing anything identify which DVI
7037 // meet the salvageable criteria and store their DIExpression and SCEVs.
7038 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7039 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7040
7041 bool Changed = false;
7042 std::unique_ptr<MemorySSAUpdater> MSSAU;
7043 if (MSSA)
7044 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7045
7046 // Run the main LSR transformation.
7047 const LSRInstance &Reducer =
7048 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7049 Changed |= Reducer.getChanged();
7050
7051 // Remove any extra phis created by processing inner loops.
7052 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7053 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7055 SCEVExpander Rewriter(SE, "lsr", false);
7056#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7057 Rewriter.setDebugType(DEBUG_TYPE);
7058#endif
7059 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7060 Rewriter.clear();
7061 if (numFolded) {
7062 Changed = true;
7064 MSSAU.get());
7065 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7066 }
7067 }
7068 // LSR may at times remove all uses of an induction variable from a loop.
7069 // The only remaining use is the PHI in the exit block.
7070 // When this is the case, if the exit value of the IV can be calculated using
7071 // SCEV, we can replace the exit block PHI with the final value of the IV and
7072 // skip the updates in each loop iteration.
7073 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7075 SCEVExpander Rewriter(SE, "lsr", true);
7076 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7077 UnusedIndVarInLoop, DeadInsts);
7078 Rewriter.clear();
7079 if (Rewrites) {
7080 Changed = true;
7082 MSSAU.get());
7083 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7084 }
7085 }
7086
7087 if (SalvageableDVIRecords.empty())
7088 return Changed;
7089
7090 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7091 // expressions composed using the derived iteration count.
7092 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7093 for (const auto &L : LI) {
7094 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7095 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7096 else {
7097 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7098 "could not be identified.\n");
7099 }
7100 }
7101
7102 for (auto &Rec : SalvageableDVIRecords)
7103 Rec->clear();
7104 SalvageableDVIRecords.clear();
7105 return Changed;
7106}
7107
7108bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7109 if (skipLoop(L))
7110 return false;
7111
7112 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7113 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7114 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7115 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7116 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7117 *L->getHeader()->getParent());
7118 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7119 *L->getHeader()->getParent());
7120 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7121 *L->getHeader()->getParent());
7122 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7123 MemorySSA *MSSA = nullptr;
7124 if (MSSAAnalysis)
7125 MSSA = &MSSAAnalysis->getMSSA();
7126 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7127}
7128
7131 LPMUpdater &) {
7132 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7133 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7134 return PreservedAnalyses::all();
7135
7136 auto PA = getLoopPassPreservedAnalyses();
7137 if (AR.MSSA)
7138 PA.preserve<MemorySSAAnalysis>();
7139 return PA;
7140}
7141
7142char LoopStrengthReduce::ID = 0;
7143
7144INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7145 "Loop Strength Reduction", false, false)
7151INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7152INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7153 "Loop Strength Reduction", false, false)
7154
7155Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< SCEVUse > &Good, SmallVectorImpl< SCEVUse > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static GlobalValue * ExtractSymbol(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1655
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1546
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1747
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1577
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:518
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:376
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Value * getCondition() const
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:186
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyze scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< SCEVUse > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAddRecExpr(SCEVUse Start, SCEVUse Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:241
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Use * op_iterator
Definition User.h:254
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
iterator_range< use_iterator > uses()
Definition Value.h:381
A Value handle that is nullable (may hold null).
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1725
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2128
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:550
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
@ UnusedIndVarInLoop
Definition LoopUtils.h:569
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.