LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs strength reduction on array references inside loops that
14// have the loop induction variable as one or more of their components. It
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Clean up congruent phis after LSR phi expansion.
147static cl::opt<bool> EnablePhiElim(
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// This flag adds instruction count to the solution cost comparison.
152static cl::opt<bool> InsnsCost(
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow a complex LSR solution.
157static cl::opt<bool> LSRExpNarrow(
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
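// Editor's illustration (not part of the pass): a minimal sketch of how
// MemAccessTy values compare, assuming an LLVMContext &Ctx is in scope. Both
// the memory type and the address space must match; getUnknown() uses a void
// type plus the UnknownAddressSpace sentinel.
//
//   MemAccessTy A(Type::getInt32Ty(Ctx), /*AS=*/0);
//   MemAccessTy B(Type::getInt32Ty(Ctx), /*AS=*/3);
//   assert(A != B);                         // same type, different addr space
//   assert(A == MemAccessTy(Type::getInt32Ty(Ctx), 0));
//   MemAccessTy U = MemAccessTy::getUnknown(Ctx); // void type, unknown AS
//   assert(U.getType()->isVoidTy());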
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
244 void print(raw_ostream &OS) const;
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
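// Editor's illustration (not part of the pass): a small sketch of the
// fixed/scalable distinction. Immediates only combine when compatible, i.e.
// when either side is zero or both have the same scalability, and the result
// of addUnsigned is scalable if either operand is.
//
//   Immediate A = Immediate::getFixed(16);
//   Immediate B = Immediate::getScalable(8);  // represents 8 * vscale
//   assert(!A.isCompatibleImmediate(B));      // 16 and 8*vscale don't mix
//   Immediate Z = Immediate::getZero();
//   assert(Z.isCompatibleImmediate(B));       // zero combines with anything
//   Immediate C = Z.addUnsigned(B);           // 8 * vscale, still scalable
//   assert(C.isScalable() && C.getKnownMinValue() == 8);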
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
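// Editor's illustration (not part of the pass): a sketch of why these
// comparators exist. With KeyOrderTargetImmediate, all fixed offsets sort
// before all scalable ones, so a fixed 8 and a vscale-relative 8 remain
// distinct keys rather than collapsing into one map entry.
//
//   std::map<Immediate, unsigned, KeyOrderTargetImmediate> OffsetCount;
//   ++OffsetCount[Immediate::getFixed(8)];
//   ++OffsetCount[Immediate::getScalable(8)];  // distinct key: 8 * vscale
//   assert(OffsetCount.size() == 2);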
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
403 using iterator = SmallVectorImpl<const SCEV *>::iterator;
404 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
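// Editor's illustration (not part of the pass): a sketch of the tracker's
// bookkeeping, assuming some const SCEV *Reg is in scope. Dropping an LSRUse
// is done by swapping it with the last use and truncating, which
// swapAndDropUse mirrors in every register's bit vector.
//
//   RegUseTracker Tracker;
//   Tracker.countRegister(Reg, /*LUIdx=*/0);
//   Tracker.countRegister(Reg, /*LUIdx=*/2);
//   Tracker.countRegister(Reg, /*LUIdx=*/5);
//   assert(Tracker.isRegUsedByUsesOtherThan(Reg, 2));  // also used by 0 and 5
//   Tracker.swapAndDropUse(/*LUIdx=*/2, /*LastLUIdx=*/5);
//   // Reg is now used by indices {0, 2}; the use formerly at 5 lives at 2.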
470
471namespace {
472
473/// This class holds information that describes a formula for computing a value
474/// that satisfies a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty, the
489 /// canonical representation of a formula is:
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing the recurrent expr related to the current loop in
493 /// the formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures that invariant regs with respect to the current loop can be
498 /// combined together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
502 SmallVector<const SCEV *, 4> BaseRegs;
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which is added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
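// Editor's illustration (not part of the pass): a worked example of the
// canonical form described above. A use computed as
//   {0,+,4}<%L> + %base + 16
// is canonically represented with ScaledReg = {0,+,4}<%L>, Scale = 1,
// BaseRegs = [%base] and BaseOffset = 16: the addrec for the current loop
// lives in ScaledReg (#3), and with two registers one of them must be the
// scaled register (#1).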
541
542/// Recursion helper for initialMatch.
543static void DoInitialMatch(const SCEV *S, Loop *L,
544 SmallVectorImpl<const SCEV *> &Good,
545 SmallVectorImpl<const SCEV *> &Bad,
546 ScalarEvolution &SE) {
547 // Collect expressions which properly dominate the loop header.
548 if (SE.properlyDominates(S, L->getHeader())) {
549 Good.push_back(S);
550 return;
551 }
552
553 // Look at add operands.
554 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
555 for (const SCEV *S : Add->operands())
556 DoInitialMatch(S, L, Good, Bad, SE);
557 return;
558 }
559
560 // Look at addrec operands.
561 const SCEV *Start, *Step;
562 const Loop *ARLoop;
563 if (match(S,
564 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
565 !Start->isZero()) {
566 DoInitialMatch(Start, L, Good, Bad, SE);
567 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
568 // FIXME: AR->getNoWrapFlags()
569 ARLoop, SCEV::FlagAnyWrap),
570 L, Good, Bad, SE);
571 return;
572 }
573
574 // Handle a multiplication by -1 (negation) if it didn't fold.
575 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
576 if (Mul->getOperand(0)->isAllOnesValue()) {
577 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
578 const SCEV *NewMul = SE.getMulExpr(Ops);
579
580 SmallVector<const SCEV *, 4> MyGood;
581 SmallVector<const SCEV *, 4> MyBad;
582 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
583 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
584 SE.getEffectiveSCEVType(NewMul->getType())));
585 for (const SCEV *S : MyGood)
586 Good.push_back(SE.getMulExpr(NegOne, S));
587 for (const SCEV *S : MyBad)
588 Bad.push_back(SE.getMulExpr(NegOne, S));
589 return;
590 }
591
592 // Ok, we can't do anything interesting. Just stuff the whole thing into a
593 // register and hope for the best.
594 Bad.push_back(S);
595}
596
597/// Incorporate loop-variant parts of S into this Formula, attempting to keep
598/// all loop-invariant and loop-computable values in a single base register.
599void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
600 SmallVector<const SCEV *, 4> Good;
601 SmallVector<const SCEV *, 4> Bad;
602 DoInitialMatch(S, L, Good, Bad, SE);
603 if (!Good.empty()) {
604 const SCEV *Sum = SE.getAddExpr(Good);
605 if (!Sum->isZero())
606 BaseRegs.push_back(Sum);
607 HasBaseReg = true;
608 }
609 if (!Bad.empty()) {
610 const SCEV *Sum = SE.getAddExpr(Bad);
611 if (!Sum->isZero())
612 BaseRegs.push_back(Sum);
613 HasBaseReg = true;
614 }
615 canonicalize(*L);
616}
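// Editor's illustration (not part of the pass): for S = {%inv,+,4}<%L>, where
// %inv is loop-invariant, DoInitialMatch puts %inv in Good (it properly
// dominates the header) and the remaining {0,+,4}<%L> in Bad, so initialMatch
// first produces BaseRegs = [%inv, {0,+,4}<%L>] and canonicalize() then moves
// the loop-dependent addrec into ScaledReg with Scale = 1.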
617
618static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
619 return SCEVExprContains(S, [&L](const SCEV *S) {
620 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
621 });
622}
623
624/// Check whether or not this formula satisfies the canonical
625/// representation.
626/// \see Formula::BaseRegs.
627bool Formula::isCanonical(const Loop &L) const {
628 assert((Scale == 0 || ScaledReg) &&
629 "ScaledReg must be non-null if Scale is non-zero");
630
631 if (!ScaledReg)
632 return BaseRegs.size() <= 1;
633
634 if (Scale != 1)
635 return true;
636
637 if (Scale == 1 && BaseRegs.empty())
638 return false;
639
640 if (containsAddRecDependentOnLoop(ScaledReg, L))
641 return true;
642
643 // If ScaledReg is not a recurrent expr, or it is one whose loop is not the
644 // current loop, while BaseRegs contains a reg with a recurrent expr related
645 // to the current loop, we want to swap that reg in BaseRegs with ScaledReg.
646 return none_of(BaseRegs, [&L](const SCEV *S) {
647 return containsAddRecDependentOnLoop(S, L);
648 });
649}
650
651/// Helper method to morph a formula into its canonical representation.
652/// \see Formula::BaseRegs.
653/// Every formula having more than one base register must use the ScaledReg
654/// field. Otherwise, we would have to do special cases everywhere in LSR
655/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
656/// On the other hand, 1*reg should be canonicalized into reg.
657void Formula::canonicalize(const Loop &L) {
658 if (isCanonical(L))
659 return;
660
661 if (BaseRegs.empty()) {
662 // No base reg? Use scale reg with scale = 1 as such.
663 assert(ScaledReg && "Expected 1*reg => reg");
664 assert(Scale == 1 && "Expected 1*reg => reg");
665 BaseRegs.push_back(ScaledReg);
666 Scale = 0;
667 ScaledReg = nullptr;
668 return;
669 }
670
671 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
672 if (!ScaledReg) {
673 ScaledReg = BaseRegs.pop_back_val();
674 Scale = 1;
675 }
676
677 // If ScaledReg is an invariant with respect to L, find the reg from
678 // BaseRegs containing the recurrent expr related with Loop L. Swap the
679 // reg with ScaledReg.
680 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
681 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 return containsAddRecDependentOnLoop(S, L);
683 });
684 if (I != BaseRegs.end())
685 std::swap(ScaledReg, *I);
686 }
687 assert(isCanonical(L) && "Failed to canonicalize?");
688}
689
690/// Get rid of the scale in the formula.
691/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
692/// \return true if it was possible to get rid of the scale, false otherwise.
693/// \note After this operation the formula may not be in the canonical form.
694bool Formula::unscale() {
695 if (Scale != 1)
696 return false;
697 Scale = 0;
698 BaseRegs.push_back(ScaledReg);
699 ScaledReg = nullptr;
700 return true;
701}
702
703bool Formula::hasZeroEnd() const {
704 if (UnfoldedOffset || BaseOffset)
705 return false;
706 if (BaseRegs.size() != 1 || ScaledReg)
707 return false;
708 return true;
709}
710
711bool Formula::countsDownToZero() const {
712 if (!hasZeroEnd())
713 return false;
714 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
715 const APInt *StepInt;
716 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
717 return false;
718 return StepInt->isNegative();
719}
720
721/// Return the total number of register operands used by this formula. This does
722/// not include register uses implied by non-constant addrec strides.
723size_t Formula::getNumRegs() const {
724 return !!ScaledReg + BaseRegs.size();
725}
726
727/// Return the type of this formula, if it has one, or null otherwise. This type
728/// is meaningless except for the bit size.
729Type *Formula::getType() const {
730 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
731 ScaledReg ? ScaledReg->getType() :
732 BaseGV ? BaseGV->getType() :
733 nullptr;
734}
735
736/// Delete the given base reg from the BaseRegs list.
737void Formula::deleteBaseReg(const SCEV *&S) {
738 if (&S != &BaseRegs.back())
739 std::swap(S, BaseRegs.back());
740 BaseRegs.pop_back();
741}
742
743/// Test if this formula references the given register.
744bool Formula::referencesReg(const SCEV *S) const {
745 return S == ScaledReg || is_contained(BaseRegs, S);
746}
747
748/// Test whether this formula uses registers which are used by uses other than
749/// the use with the given index.
750bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
751 const RegUseTracker &RegUses) const {
752 if (ScaledReg)
753 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
754 return true;
755 for (const SCEV *BaseReg : BaseRegs)
756 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
757 return true;
758 return false;
759}
760
761#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
762void Formula::print(raw_ostream &OS) const {
763 bool First = true;
764 if (BaseGV) {
765 if (!First) OS << " + "; else First = false;
766 BaseGV->printAsOperand(OS, /*PrintType=*/false);
767 }
768 if (BaseOffset.isNonZero()) {
769 if (!First) OS << " + "; else First = false;
770 OS << BaseOffset;
771 }
772 for (const SCEV *BaseReg : BaseRegs) {
773 if (!First) OS << " + "; else First = false;
774 OS << "reg(" << *BaseReg << ')';
775 }
776 if (HasBaseReg && BaseRegs.empty()) {
777 if (!First) OS << " + "; else First = false;
778 OS << "**error: HasBaseReg**";
779 } else if (!HasBaseReg && !BaseRegs.empty()) {
780 if (!First) OS << " + "; else First = false;
781 OS << "**error: !HasBaseReg**";
782 }
783 if (Scale != 0) {
784 if (!First) OS << " + "; else First = false;
785 OS << Scale << "*reg(";
786 if (ScaledReg)
787 OS << *ScaledReg;
788 else
789 OS << "<unknown>";
790 OS << ')';
791 }
792 if (UnfoldedOffset.isNonZero()) {
793 if (!First) OS << " + ";
794 OS << "imm(" << UnfoldedOffset << ')';
795 }
796}
797
798LLVM_DUMP_METHOD void Formula::dump() const {
799 print(errs()); errs() << '\n';
800}
801#endif
802
803/// Return true if the given addrec can be sign-extended without changing its
804/// value.
805static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
808 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
809}
810
811/// Return true if the given add can be sign-extended without changing its
812/// value.
813static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
814 Type *WideTy =
815 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
816 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
817}
818
819/// Return true if the given mul can be sign-extended without changing its
820/// value.
821static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
822 Type *WideTy =
823 IntegerType::get(SE.getContext(),
824 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
825 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
826}
827
828/// Return an expression for LHS /s RHS, if it can be determined and if the
829/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
830/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
831/// the multiplication may overflow, which is useful when the result will be
832/// used in a context where the most significant bits are ignored.
833static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
834 ScalarEvolution &SE,
835 bool IgnoreSignificantBits = false) {
836 // Handle the trivial case, which works for any SCEV type.
837 if (LHS == RHS)
838 return SE.getConstant(LHS->getType(), 1);
839
840 // Handle a few RHS special cases.
841 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
842 if (RC) {
843 const APInt &RA = RC->getAPInt();
844 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
845 // some folding.
846 if (RA.isAllOnes()) {
847 if (LHS->getType()->isPointerTy())
848 return nullptr;
849 return SE.getMulExpr(LHS, RC);
850 }
851 // Handle x /s 1 as x.
852 if (RA == 1)
853 return LHS;
854 }
855
856 // Check for a division of a constant by a constant.
857 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
858 if (!RC)
859 return nullptr;
860 const APInt &LA = C->getAPInt();
861 const APInt &RA = RC->getAPInt();
862 if (LA.srem(RA) != 0)
863 return nullptr;
864 return SE.getConstant(LA.sdiv(RA));
865 }
866
867 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
868 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
869 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
870 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
871 IgnoreSignificantBits);
872 if (!Step) return nullptr;
873 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
874 IgnoreSignificantBits);
875 if (!Start) return nullptr;
876 // FlagNW is independent of the start value, step direction, and is
877 // preserved with smaller magnitude steps.
878 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
879 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
880 }
881 return nullptr;
882 }
883
884 // Distribute the sdiv over add operands, if the add doesn't overflow.
885 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
886 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
887 SmallVector<const SCEV *, 8> Ops;
888 for (const SCEV *S : Add->operands()) {
889 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
890 if (!Op) return nullptr;
891 Ops.push_back(Op);
892 }
893 return SE.getAddExpr(Ops);
894 }
895 return nullptr;
896 }
897
898 // Check for a multiply operand that we can pull RHS out of.
899 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
900 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
901 // Handle special case C1*X*Y /s C2*X*Y.
902 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
903 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
904 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
905 const SCEVConstant *RC =
906 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
907 if (LC && RC) {
908 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
909 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
910 if (LOps == ROps)
911 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
912 }
913 }
914 }
915
916 SmallVector<const SCEV *, 4> Ops;
917 bool Found = false;
918 for (const SCEV *S : Mul->operands()) {
919 if (!Found)
920 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
921 IgnoreSignificantBits)) {
922 S = Q;
923 Found = true;
924 }
925 Ops.push_back(S);
926 }
927 return Found ? SE.getMulExpr(Ops) : nullptr;
928 }
929 return nullptr;
930 }
931
932 // Otherwise we don't know.
933 return nullptr;
934}
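// Editor's illustration (not part of the pass): assuming the addrec is known
// not to overflow (or IgnoreSignificantBits is set), dividing
// LHS = {8,+,4}<%L> by RHS = 4 distributes over the start and step and yields
// {2,+,1}<%L>, whereas LHS = {7,+,4}<%L> yields null because 7 srem 4 != 0.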
935
936/// If S involves the addition of a constant integer value, return that integer
937/// value, and mutate S to point to a new SCEV with that value excluded.
938static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
939 const APInt *C;
940 if (match(S, m_scev_APInt(C))) {
941 if (C->getSignificantBits() <= 64) {
942 S = SE.getConstant(S->getType(), 0);
943 return Immediate::getFixed(C->getSExtValue());
944 }
945 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
946 SmallVector<const SCEV *, 8> NewOps(Add->operands());
947 Immediate Result = ExtractImmediate(NewOps.front(), SE);
948 if (Result.isNonZero())
949 S = SE.getAddExpr(NewOps);
950 return Result;
951 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
952 SmallVector<const SCEV *, 8> NewOps(AR->operands());
953 Immediate Result = ExtractImmediate(NewOps.front(), SE);
954 if (Result.isNonZero())
955 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
956 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
957 SCEV::FlagAnyWrap);
958 return Result;
959 } else if (EnableVScaleImmediates &&
961 S = SE.getConstant(S->getType(), 0);
962 return Immediate::getScalable(C->getSExtValue());
963 }
964 return Immediate::getZero();
965}
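// Editor's illustration (not part of the pass): for S = (16 + %p) this returns
// Immediate::getFixed(16) and rewrites S to %p; for S = {(16 + %p),+,1}<%L>
// the constant is peeled off the addrec start, leaving {%p,+,1}<%L>.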
966
967/// If S involves the addition of a GlobalValue address, return that symbol, and
968/// mutate S to point to a new SCEV with that value excluded.
969static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
970 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
971 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
972 S = SE.getConstant(GV->getType(), 0);
973 return GV;
974 }
975 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
976 SmallVector<const SCEV *, 8> NewOps(Add->operands());
977 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
978 if (Result)
979 S = SE.getAddExpr(NewOps);
980 return Result;
981 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
982 SmallVector<const SCEV *, 8> NewOps(AR->operands());
983 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
984 if (Result)
985 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
986 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
987 SCEV::FlagAnyWrap);
988 return Result;
989 }
990 return nullptr;
991}
992
993/// Returns true if the specified instruction is using the specified value as an
994/// address.
995static bool isAddressUse(const TargetTransformInfo &TTI,
996 Instruction *Inst, Value *OperandVal) {
997 bool isAddress = isa<LoadInst>(Inst);
998 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
999 if (SI->getPointerOperand() == OperandVal)
1000 isAddress = true;
1001 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1002 // Addressing modes can also be folded into prefetches and a variety
1003 // of intrinsics.
1004 switch (II->getIntrinsicID()) {
1005 case Intrinsic::memset:
1006 case Intrinsic::prefetch:
1007 case Intrinsic::masked_load:
1008 if (II->getArgOperand(0) == OperandVal)
1009 isAddress = true;
1010 break;
1011 case Intrinsic::masked_store:
1012 if (II->getArgOperand(1) == OperandVal)
1013 isAddress = true;
1014 break;
1015 case Intrinsic::memmove:
1016 case Intrinsic::memcpy:
1017 if (II->getArgOperand(0) == OperandVal ||
1018 II->getArgOperand(1) == OperandVal)
1019 isAddress = true;
1020 break;
1021 default: {
1022 MemIntrinsicInfo IntrInfo;
1023 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1024 if (IntrInfo.PtrVal == OperandVal)
1025 isAddress = true;
1026 }
1027 }
1028 }
1029 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1030 if (RMW->getPointerOperand() == OperandVal)
1031 isAddress = true;
1032 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1033 if (CmpX->getPointerOperand() == OperandVal)
1034 isAddress = true;
1035 }
1036 return isAddress;
1037}
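// Editor's illustration (not part of the pass): for
//   call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %n, i1 false)
// both %dst and %src are address uses, while for @llvm.masked.store only the
// second operand (the pointer) is.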
1038
1039/// Return the type of the memory being accessed.
1040static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1041 Instruction *Inst, Value *OperandVal) {
1042 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1043
1044 // First get the type of memory being accessed.
1045 if (Type *Ty = Inst->getAccessType())
1046 AccessTy.MemTy = Ty;
1047
1048 // Then get the pointer address space.
1049 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1050 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1051 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1052 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1053 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1054 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1055 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1056 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1057 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1058 switch (II->getIntrinsicID()) {
1059 case Intrinsic::prefetch:
1060 case Intrinsic::memset:
1061 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1062 AccessTy.MemTy = OperandVal->getType();
1063 break;
1064 case Intrinsic::memmove:
1065 case Intrinsic::memcpy:
1066 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1067 AccessTy.MemTy = OperandVal->getType();
1068 break;
1069 case Intrinsic::masked_load:
1070 AccessTy.AddrSpace =
1071 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1072 break;
1073 case Intrinsic::masked_store:
1074 AccessTy.AddrSpace =
1075 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1076 break;
1077 default: {
1078 MemIntrinsicInfo IntrInfo;
1079 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1080 AccessTy.AddrSpace
1081 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1082 }
1083
1084 break;
1085 }
1086 }
1087 }
1088
1089 return AccessTy;
1090}
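// Editor's illustration (not part of the pass): for
//   store i32 %v, ptr addrspace(3) %p
// the result has MemTy = i32 and AddrSpace = 3; for an unrecognized intrinsic
// with no target memory info, the access type stays "unknown" (void type with
// the UnknownAddressSpace sentinel).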
1091
1092/// Return true if this AddRec is already a phi in its loop.
1093static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1094 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1095 if (SE.isSCEVable(PN.getType()) &&
1096 (SE.getEffectiveSCEVType(PN.getType()) ==
1097 SE.getEffectiveSCEVType(AR->getType())) &&
1098 SE.getSCEV(&PN) == AR)
1099 return true;
1100 }
1101 return false;
1102}
1103
1104/// Check if expanding this expression is likely to incur significant cost. This
1105/// is tricky because SCEV doesn't track which expressions are actually computed
1106/// by the current IR.
1107///
1108/// We currently allow expansion of IV increments that involve adds,
1109/// multiplication by constants, and AddRecs from existing phis.
1110///
1111/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1112/// obvious multiple of the UDivExpr.
1113static bool isHighCostExpansion(const SCEV *S,
1114 SmallPtrSetImpl<const SCEV *> &Processed,
1115 ScalarEvolution &SE) {
1116 // Zero/One operand expressions
1117 switch (S->getSCEVType()) {
1118 case scUnknown:
1119 case scConstant:
1120 case scVScale:
1121 return false;
1122 case scTruncate:
1123 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1124 Processed, SE);
1125 case scZeroExtend:
1126 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1127 Processed, SE);
1128 case scSignExtend:
1129 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1130 Processed, SE);
1131 default:
1132 break;
1133 }
1134
1135 if (!Processed.insert(S).second)
1136 return false;
1137
1138 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1139 for (const SCEV *S : Add->operands()) {
1140 if (isHighCostExpansion(S, Processed, SE))
1141 return true;
1142 }
1143 return false;
1144 }
1145
1146 const SCEV *Op0, *Op1;
1147 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1148 // Multiplication by a constant is ok
1149 if (isa<SCEVConstant>(Op0))
1150 return isHighCostExpansion(Op1, Processed, SE);
1151
1152 // If we have the value of one operand, check if an existing
1153 // multiplication already generates this expression.
1154 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1155 Value *UVal = U->getValue();
1156 for (User *UR : UVal->users()) {
1157 // If U is a constant, it may be used by a ConstantExpr.
1158 Instruction *UI = dyn_cast<Instruction>(UR);
1159 if (UI && UI->getOpcode() == Instruction::Mul &&
1160 SE.isSCEVable(UI->getType())) {
1161 return SE.getSCEV(UI) == S;
1162 }
1163 }
1164 }
1165 }
1166
1167 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1168 if (isExistingPhi(AR, SE))
1169 return false;
1170 }
1171
1172 // For now, consider any other type of expression (div/mul/min/max) to be high cost.
1173 return true;
1174}
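// Editor's illustration (not part of the pass): truncates, extensions, adds
// and multiplications by a constant are walked through, so (4 * %n) and
// (%a + %b) are not considered high-cost, and {0,+,1}<%L> is cheap when it is
// already a phi in %L; anything else, e.g. a udiv, falls through to the final
// "return true".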
1175
1176namespace {
1177
1178class LSRUse;
1179
1180} // end anonymous namespace
1181
1182/// Check if the addressing mode defined by \p F is completely
1183/// folded in \p LU at isel time.
1184/// This includes address-mode folding and special icmp tricks.
1185/// This function returns true if \p LU can accommodate what \p F
1186/// defines and up to 1 base + 1 scaled + offset.
1187/// In other words, if \p F has several base registers, this function may
1188/// still return true. Therefore, users still need to account for
1189/// additional base registers and/or unfolded offsets to derive an
1190/// accurate cost model.
1191static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1192 const LSRUse &LU, const Formula &F);
1193
1194// Get the cost of the scaling factor used in F for LU.
1195static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1196 const LSRUse &LU, const Formula &F,
1197 const Loop &L);
1198
1199namespace {
1200
1201/// This class is used to measure and compare candidate formulae.
1202class Cost {
1203 const Loop *L = nullptr;
1204 ScalarEvolution *SE = nullptr;
1205 const TargetTransformInfo *TTI = nullptr;
1206 TargetTransformInfo::LSRCost C;
1207 TTI::AddressingModeKind AMK;
1208
1209public:
1210 Cost() = delete;
1211 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1212 TTI::AddressingModeKind AMK) :
1213 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1214 C.Insns = 0;
1215 C.NumRegs = 0;
1216 C.AddRecCost = 0;
1217 C.NumIVMuls = 0;
1218 C.NumBaseAdds = 0;
1219 C.ImmCost = 0;
1220 C.SetupCost = 0;
1221 C.ScaleCost = 0;
1222 }
1223
1224 bool isLess(const Cost &Other) const;
1225
1226 void Lose();
1227
1228#ifndef NDEBUG
1229 // Once any of the metrics loses, they must all remain losers.
1230 bool isValid() {
1231 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1232 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1233 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1234 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1235 }
1236#endif
1237
1238 bool isLoser() {
1239 assert(isValid() && "invalid cost");
1240 return C.NumRegs == ~0u;
1241 }
1242
1243 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1244 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1245 bool HardwareLoopProfitable,
1246 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1247
1248 void print(raw_ostream &OS) const;
1249 void dump() const;
1250
1251private:
1252 void RateRegister(const Formula &F, const SCEV *Reg,
1253 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1254 bool HardwareLoopProfitable);
1255 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1256 SmallPtrSetImpl<const SCEV *> &Regs,
1257 const LSRUse &LU, bool HardwareLoopProfitable,
1258 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1259};
1260
1261/// An operand value in an instruction which is to be replaced with some
1262/// equivalent, possibly strength-reduced, replacement.
1263struct LSRFixup {
1264 /// The instruction which will be updated.
1265 Instruction *UserInst = nullptr;
1266
1267 /// The operand of the instruction which will be replaced. The operand may be
1268 /// used more than once; every instance will be replaced.
1269 Value *OperandValToReplace = nullptr;
1270
1271 /// If this user is to use the post-incremented value of an induction
1272 /// variable, this set is non-empty and holds the loops associated with the
1273 /// induction variable.
1274 PostIncLoopSet PostIncLoops;
1275
1276 /// A constant offset to be added to the LSRUse expression. This allows
1277 /// multiple fixups to share the same LSRUse with different offsets, for
1278 /// example in an unrolled loop.
1279 Immediate Offset = Immediate::getZero();
1280
1281 LSRFixup() = default;
1282
1283 bool isUseFullyOutsideLoop(const Loop *L) const;
1284
1285 void print(raw_ostream &OS) const;
1286 void dump() const;
1287};
1288
1289/// This class holds the state that LSR keeps for each use in IVUsers, as well
1290/// as uses invented by LSR itself. It includes information about what kinds of
1291/// things can be folded into the user, information about the user itself, and
1292/// information about how the use may be satisfied. TODO: Represent multiple
1293/// users of the same expression in common?
1294class LSRUse {
1295 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1296
1297public:
1298 /// An enum for a kind of use, indicating what types of scaled and immediate
1299 /// operands it might support.
1300 enum KindType {
1301 Basic, ///< A normal use, with no folding.
1302 Special, ///< A special case of basic, allowing -1 scales.
1303 Address, ///< An address use; folding according to TargetLowering
1304 ICmpZero ///< An equality icmp with both operands folded into one.
1305 // TODO: Add a generic icmp too?
1306 };
1307
1308 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1309
1310 KindType Kind;
1311 MemAccessTy AccessTy;
1312
1313 /// The list of operands which are to be replaced.
1314 SmallVector<LSRFixup, 8> Fixups;
1315
1316 /// Keep track of the min and max offsets of the fixups.
1317 Immediate MinOffset = Immediate::getFixedMax();
1318 Immediate MaxOffset = Immediate::getFixedMin();
1319
1320 /// This records whether all of the fixups using this LSRUse are outside of
1321 /// the loop, in which case some special-case heuristics may be used.
1322 bool AllFixupsOutsideLoop = true;
1323
1324 /// This records whether all of the fixups using this LSRUse are unconditional
1325 /// within the loop, meaning they will be executed on every path to the loop
1326 /// latch. This includes fixups before early exits.
1327 bool AllFixupsUnconditional = true;
1328
1329 /// RigidFormula is set to true to guarantee that this use will be associated
1330 /// with a single formula--the one that initially matched. Some SCEV
1331 /// expressions cannot be expanded. This allows LSR to consider the registers
1332 /// used by those expressions without the need to expand them later after
1333 /// changing the formula.
1334 bool RigidFormula = false;
1335
1336 /// This records the widest use type for any fixup using this
1337 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1338 /// fixup widths to be equivalent, because the narrower one may be relying on
1339 /// the implicit truncation to truncate away bogus bits.
1340 Type *WidestFixupType = nullptr;
1341
1342 /// A list of ways to build a value that can satisfy this user. After the
1343 /// list is populated, one of these is selected heuristically and used to
1344 /// formulate a replacement for OperandValToReplace in UserInst.
1345 SmallVector<Formula, 12> Formulae;
1346
1347 /// The set of register candidates used by all formulae in this LSRUse.
1348 SmallPtrSet<const SCEV *, 4> Regs;
1349
1350 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1351
1352 LSRFixup &getNewFixup() {
1353 Fixups.push_back(LSRFixup());
1354 return Fixups.back();
1355 }
1356
1357 void pushFixup(LSRFixup &f) {
1358 Fixups.push_back(f);
1359 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1360 MaxOffset = f.Offset;
1361 if (Immediate::isKnownLT(f.Offset, MinOffset))
1362 MinOffset = f.Offset;
1363 }
1364
1365 bool HasFormulaWithSameRegs(const Formula &F) const;
1366 float getNotSelectedProbability(const SCEV *Reg) const;
1367 bool InsertFormula(const Formula &F, const Loop &L);
1368 void DeleteFormula(Formula &F);
1369 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1370
1371 void print(raw_ostream &OS) const;
1372 void dump() const;
1373};
1374
1375} // end anonymous namespace
1376
1377static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1378 LSRUse::KindType Kind, MemAccessTy AccessTy,
1379 GlobalValue *BaseGV, Immediate BaseOffset,
1380 bool HasBaseReg, int64_t Scale,
1381 Instruction *Fixup = nullptr);
1382
1383static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1384 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1385 return 1;
1386 if (Depth == 0)
1387 return 0;
1388 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1389 return getSetupCost(S->getStart(), Depth - 1);
1390 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1391 return getSetupCost(S->getOperand(), Depth - 1);
1392 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1393 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1394 [&](unsigned i, const SCEV *Reg) {
1395 return i + getSetupCost(Reg, Depth - 1);
1396 });
1397 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1398 return getSetupCost(S->getLHS(), Depth - 1) +
1399 getSetupCost(S->getRHS(), Depth - 1);
1400 return 0;
1401}
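// Editor's illustration (not part of the pass): leaf registers (SCEVUnknown or
// constants) cost 1, so for Reg = {(%a + %b),+,1}<%L> with Depth = 2 the
// result is getSetupCost(%a + %b, 1) = 1 + 1 = 2, while any non-leaf
// expression reached after the depth limit is exhausted contributes 0.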
1402
1403/// Tally up interesting quantities from the given register.
1404void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1405 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1406 bool HardwareLoopProfitable) {
1407 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1408 // If this is an addrec for another loop, it should be an invariant
1409 // with respect to L since L is the innermost loop (at least
1410 // for now LSR only handles innermost loops).
1411 if (AR->getLoop() != L) {
1412 // If the AddRec already exists as a phi, consider its register free and leave it alone.
1413 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1414 return;
1415
1416 // It is bad to allow LSR for the current loop to add induction variables
1417 // for its sibling loops.
1418 if (!AR->getLoop()->contains(L)) {
1419 Lose();
1420 return;
1421 }
1422
1423 // Otherwise, it will be an invariant with respect to Loop L.
1424 ++C.NumRegs;
1425 return;
1426 }
1427
1428 unsigned LoopCost = 1;
1429 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1430 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1431 const SCEV *Start;
1432 const APInt *Step;
1433 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1434 // If the step size matches the base offset, we could use pre-indexed
1435 // addressing.
1436 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1437 F.BaseOffset.isFixed() &&
1438 *Step == F.BaseOffset.getFixedValue();
1439 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1440 !isa<SCEVConstant>(Start) &&
1441 SE->isLoopInvariant(Start, L);
1442 // We can only pre or post index when the load/store is unconditional.
1443 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1444 LoopCost = 0;
1445 }
1446 }
1447
1448 // If the loop counts down to zero and we'll be using a hardware loop then
1449 // the addrec will be combined into the hardware loop instruction.
1450 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1451 HardwareLoopProfitable)
1452 LoopCost = 0;
1453 C.AddRecCost += LoopCost;
1454
1455 // Add the step value register, if it needs one.
1456 // TODO: The non-affine case isn't precisely modeled here.
1457 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1458 if (!Regs.count(AR->getOperand(1))) {
1459 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1460 if (isLoser())
1461 return;
1462 }
1463 }
1464 }
1465 ++C.NumRegs;
1466
1467 // Rough heuristic; favor registers which don't require extra setup
1468 // instructions in the preheader.
1469 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1470 // Ensure we don't, even with the recursion limit, produce invalid costs.
1471 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1472
1473 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1474 SE->hasComputableLoopEvolution(Reg, L);
1475}
1476
1477/// Record this register in the set. If we haven't seen it before, rate
1478/// it. Optional LoserRegs provides a way to declare any formula that refers to
1479/// one of those regs an instant loser.
1480void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1481 SmallPtrSetImpl<const SCEV *> &Regs,
1482 const LSRUse &LU, bool HardwareLoopProfitable,
1483 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1484 if (LoserRegs && LoserRegs->count(Reg)) {
1485 Lose();
1486 return;
1487 }
1488 if (Regs.insert(Reg).second) {
1489 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1490 if (LoserRegs && isLoser())
1491 LoserRegs->insert(Reg);
1492 }
1493}
1494
1495void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1496 const DenseSet<const SCEV *> &VisitedRegs,
1497 const LSRUse &LU, bool HardwareLoopProfitable,
1498 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1499 if (isLoser())
1500 return;
1501 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1502 // Tally up the registers.
1503 unsigned PrevAddRecCost = C.AddRecCost;
1504 unsigned PrevNumRegs = C.NumRegs;
1505 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1506 if (const SCEV *ScaledReg = F.ScaledReg) {
1507 if (VisitedRegs.count(ScaledReg)) {
1508 Lose();
1509 return;
1510 }
1511 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1512 LoserRegs);
1513 if (isLoser())
1514 return;
1515 }
1516 for (const SCEV *BaseReg : F.BaseRegs) {
1517 if (VisitedRegs.count(BaseReg)) {
1518 Lose();
1519 return;
1520 }
1521 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1522 LoserRegs);
1523 if (isLoser())
1524 return;
1525 }
1526
1527 // Determine how many (unfolded) adds we'll need inside the loop.
1528 size_t NumBaseParts = F.getNumRegs();
1529 if (NumBaseParts > 1)
1530 // Do not count the base and a possible second register if the target
1531 // allows folding 2 registers.
1532 C.NumBaseAdds +=
1533 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1534 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1535
1536 // Accumulate non-free scaling amounts.
1537 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1538
1539 // Tally up the non-zero immediates.
1540 for (const LSRFixup &Fixup : LU.Fixups) {
1541 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1542 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1543 if (F.BaseGV)
1544 C.ImmCost += 64; // Handle symbolic values conservatively.
1545 // TODO: This should probably be the pointer size.
1546 else if (Offset.isNonZero())
1547 C.ImmCost +=
1548 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1549
1550 // Check with target if this offset with this instruction is
1551 // specifically not supported.
1552 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1553 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1554 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1555 C.NumBaseAdds++;
1556 } else {
1557 // Incompatible immediate type; increase the cost to avoid using this formula.
1558 C.ImmCost += 2048;
1559 }
1560 }
1561
1562 // If we don't count instruction cost, exit here.
1563 if (!InsnsCost) {
1564 assert(isValid() && "invalid cost");
1565 return;
1566 }
1567
1568 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as an
1569 // additional instruction (at least a fill).
1570 // TODO: Do we need to distinguish register classes?
1571 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1572 TTI->getRegisterClassForType(false, F.getType())) - 1;
1573 if (C.NumRegs > TTIRegNum) {
1574 // The cost already exceeded TTIRegNum, so only newly added registers can
1575 // add new instructions.
1576 if (PrevNumRegs > TTIRegNum)
1577 C.Insns += (C.NumRegs - PrevNumRegs);
1578 else
1579 C.Insns += (C.NumRegs - TTIRegNum);
1580 }
1581
1582 // If an ICmpZero formula does not end in 0, it cannot be replaced by
1583 // just an add or sub. We'll need to compare the final result of the AddRec.
1584 // That means we'll need an additional instruction. But if the target can
1585 // macro-fuse a compare with a branch, don't count this extra instruction.
1586 // For -10 + {0, +, 1}:
1587 // i = i + 1;
1588 // cmp i, 10
1589 //
1590 // For {-10, +, 1}:
1591 // i = i + 1;
1592 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1593 !TTI->canMacroFuseCmp())
1594 C.Insns++;
1595 // Each new AddRec adds 1 instruction to the calculation.
1596 C.Insns += (C.AddRecCost - PrevAddRecCost);
1597
1598 // BaseAdds adds instructions for unfolded registers.
1599 if (LU.Kind != LSRUse::ICmpZero)
1600 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1601 assert(isValid() && "invalid cost");
1602}
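// Editor's illustration (not part of the pass) of the register-pressure term
// above: with TTIRegNum = 7, going from PrevNumRegs = 6 to C.NumRegs = 9 adds
// 9 - 7 = 2 instructions, while going from PrevNumRegs = 8 to 9 adds only
// 9 - 8 = 1, because the earlier spill/fill cost was already charged.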
1603
1604/// Set this cost to a losing value.
1605void Cost::Lose() {
1606 C.Insns = std::numeric_limits<unsigned>::max();
1607 C.NumRegs = std::numeric_limits<unsigned>::max();
1608 C.AddRecCost = std::numeric_limits<unsigned>::max();
1609 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1610 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1611 C.ImmCost = std::numeric_limits<unsigned>::max();
1612 C.SetupCost = std::numeric_limits<unsigned>::max();
1613 C.ScaleCost = std::numeric_limits<unsigned>::max();
1614}
1615
1616/// Choose the lower cost.
1617bool Cost::isLess(const Cost &Other) const {
1618 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1619 C.Insns != Other.C.Insns)
1620 return C.Insns < Other.C.Insns;
1621 return TTI->isLSRCostLess(C, Other.C);
1622}
1623
1624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1625void Cost::print(raw_ostream &OS) const {
1626 if (InsnsCost)
1627 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1628 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1629 if (C.AddRecCost != 0)
1630 OS << ", with addrec cost " << C.AddRecCost;
1631 if (C.NumIVMuls != 0)
1632 OS << ", plus " << C.NumIVMuls << " IV mul"
1633 << (C.NumIVMuls == 1 ? "" : "s");
1634 if (C.NumBaseAdds != 0)
1635 OS << ", plus " << C.NumBaseAdds << " base add"
1636 << (C.NumBaseAdds == 1 ? "" : "s");
1637 if (C.ScaleCost != 0)
1638 OS << ", plus " << C.ScaleCost << " scale cost";
1639 if (C.ImmCost != 0)
1640 OS << ", plus " << C.ImmCost << " imm cost";
1641 if (C.SetupCost != 0)
1642 OS << ", plus " << C.SetupCost << " setup cost";
1643}
1644
1645LLVM_DUMP_METHOD void Cost::dump() const {
1646 print(errs()); errs() << '\n';
1647}
1648#endif
1649
1650/// Test whether this fixup always uses its value outside of the given loop.
1651bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1652 // PHI nodes use their value in their incoming blocks.
1653 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1654 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1655 if (PN->getIncomingValue(i) == OperandValToReplace &&
1656 L->contains(PN->getIncomingBlock(i)))
1657 return false;
1658 return true;
1659 }
1660
1661 return !L->contains(UserInst);
1662}
1663
1664#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1665void LSRFixup::print(raw_ostream &OS) const {
1666 OS << "UserInst=";
1667 // Store is common and interesting enough to be worth special-casing.
1668 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1669 OS << "store ";
1670 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1671 } else if (UserInst->getType()->isVoidTy())
1672 OS << UserInst->getOpcodeName();
1673 else
1674 UserInst->printAsOperand(OS, /*PrintType=*/false);
1675
1676 OS << ", OperandValToReplace=";
1677 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1678
1679 for (const Loop *PIL : PostIncLoops) {
1680 OS << ", PostIncLoop=";
1681 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1682 }
1683
1684 if (Offset.isNonZero())
1685 OS << ", Offset=" << Offset;
1686}
1687
1688LLVM_DUMP_METHOD void LSRFixup::dump() const {
1689 print(errs()); errs() << '\n';
1690}
1691#endif
1692
1693/// Test whether this use has a formula with the same registers as the given
1694/// formula.
1695bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1696 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1697 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1698 // Unstable sort by host order ok, because this is only used for uniquifying.
1699 llvm::sort(Key);
1700 return Uniquifier.count(Key);
1701}
1702
1703/// Return the probability of selecting a formula that does not reference Reg.
1704float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1705 unsigned FNum = 0;
1706 for (const Formula &F : Formulae)
1707 if (F.referencesReg(Reg))
1708 FNum++;
1709 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1710}
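// Worked example (the counts are illustrative): with four formulae of which
// exactly one references Reg, FNum == 1 and the result is (4 - 1) / 4 = 0.75,
// i.e. the chance that a formula picked uniformly at random avoids Reg.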
1711
1712/// If the given formula has not yet been inserted, add it to the list, and
1713/// return true. Return false otherwise. The formula must be in canonical form.
1714bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1715 assert(F.isCanonical(L) && "Invalid canonical representation");
1716
1717 if (!Formulae.empty() && RigidFormula)
1718 return false;
1719
1720 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1721 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1722 // Unstable sort by host order ok, because this is only used for uniquifying.
1723 llvm::sort(Key);
1724
1725 if (!Uniquifier.insert(Key).second)
1726 return false;
1727
1728 // Using a register to hold the value of 0 is not profitable.
1729 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1730 "Zero allocated in a scaled register!");
1731#ifndef NDEBUG
1732 for (const SCEV *BaseReg : F.BaseRegs)
1733 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1734#endif
1735
1736 // Add the formula to the list.
1737 Formulae.push_back(F);
1738
1739 // Record registers now being used by this use.
1740 Regs.insert_range(F.BaseRegs);
1741 if (F.ScaledReg)
1742 Regs.insert(F.ScaledReg);
1743
1744 return true;
1745}
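// Sketch of the uniquifying key (register names are hypothetical): a formula
// with BaseRegs = {%a, {0,+,4}<%L>} and ScaledReg = %b yields
// Key = [%a, {0,+,4}<%L>, %b] (sorted by host pointer order), so a second
// formula that differs only in its immediate offset maps to the same key and
// is rejected as a duplicate.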
1746
1747/// Remove the given formula from this use's list.
1748void LSRUse::DeleteFormula(Formula &F) {
1749 if (&F != &Formulae.back())
1750 std::swap(F, Formulae.back());
1751 Formulae.pop_back();
1752}
1753
1754/// Recompute the Regs field, and update RegUses.
1755void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1756 // Now that we've filtered out some formulae, recompute the Regs set.
1757 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1758 Regs.clear();
1759 for (const Formula &F : Formulae) {
1760 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1761 Regs.insert_range(F.BaseRegs);
1762 }
1763
1764 // Update the RegTracker.
1765 for (const SCEV *S : OldRegs)
1766 if (!Regs.count(S))
1767 RegUses.dropRegister(S, LUIdx);
1768}
1769
1770#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1771void LSRUse::print(raw_ostream &OS) const {
1772 OS << "LSR Use: Kind=";
1773 switch (Kind) {
1774 case Basic: OS << "Basic"; break;
1775 case Special: OS << "Special"; break;
1776 case ICmpZero: OS << "ICmpZero"; break;
1777 case Address:
1778 OS << "Address of ";
1779 if (AccessTy.MemTy->isPointerTy())
1780 OS << "pointer"; // the full pointer type could be really verbose
1781 else {
1782 OS << *AccessTy.MemTy;
1783 }
1784
1785 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1786 }
1787
1788 OS << ", Offsets={";
1789 bool NeedComma = false;
1790 for (const LSRFixup &Fixup : Fixups) {
1791 if (NeedComma) OS << ',';
1792 OS << Fixup.Offset;
1793 NeedComma = true;
1794 }
1795 OS << '}';
1796
1797 if (AllFixupsOutsideLoop)
1798 OS << ", all-fixups-outside-loop";
1799
1800 if (AllFixupsUnconditional)
1801 OS << ", all-fixups-unconditional";
1802
1803 if (WidestFixupType)
1804 OS << ", widest fixup type: " << *WidestFixupType;
1805}
1806
1807LLVM_DUMP_METHOD void LSRUse::dump() const {
1808 print(errs()); errs() << '\n';
1809}
1810#endif
1811
1812static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1813 LSRUse::KindType Kind, MemAccessTy AccessTy,
1814 GlobalValue *BaseGV, Immediate BaseOffset,
1815 bool HasBaseReg, int64_t Scale,
1816 Instruction *Fixup /* = nullptr */) {
1817 switch (Kind) {
1818 case LSRUse::Address: {
1819 int64_t FixedOffset =
1820 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1821 int64_t ScalableOffset =
1822 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1823 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1824 HasBaseReg, Scale, AccessTy.AddrSpace,
1825 Fixup, ScalableOffset);
1826 }
1827 case LSRUse::ICmpZero:
1828 // There's not even a target hook for querying whether it would be legal to
1829 // fold a GV into an ICmp.
1830 if (BaseGV)
1831 return false;
1832
1833 // ICmp only has two operands; don't allow more than two non-trivial parts.
1834 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1835 return false;
1836
1837 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1838 // putting the scaled register in the other operand of the icmp.
1839 if (Scale != 0 && Scale != -1)
1840 return false;
1841
1842 // If we have low-level target information, ask the target if it can fold an
1843 // integer immediate on an icmp.
1844 if (BaseOffset.isNonZero()) {
1845 // We don't have an interface to query whether the target supports
1846 // icmpzero against scalable quantities yet.
1847 if (BaseOffset.isScalable())
1848 return false;
1849
1850 // We have one of:
1851 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1852 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1853 // Offs is the ICmp immediate.
1854 if (Scale == 0)
1855 // The cast does the right thing with
1856 // std::numeric_limits<int64_t>::min().
1857 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1858 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1859 }
1860
1861 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1862 return true;
1863
1864 case LSRUse::Basic:
1865 // Only handle single-register values.
1866 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1867
1868 case LSRUse::Special:
1869 // Special case Basic to handle -1 scales.
1870 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1871 }
1872
1873 llvm_unreachable("Invalid LSRUse Kind!");
1874}
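// Illustrative foldings (what a typical target might answer; the exact
// legality is always the target's call via TTI):
//   Address:  base + 4*index + 16 can become a single [reg + reg*4 + 16]
//             operand when isLegalAddressingMode says so.
//   ICmpZero: reg + (-8) with Scale == 0 becomes "icmp eq reg, 8", legal when
//             8 is a valid icmp immediate.
//   Basic:    anything beyond a lone register (a GV, a scale or an offset) is
//             not foldable.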
1875
1876static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1877 Immediate MinOffset, Immediate MaxOffset,
1878 LSRUse::KindType Kind, MemAccessTy AccessTy,
1879 GlobalValue *BaseGV, Immediate BaseOffset,
1880 bool HasBaseReg, int64_t Scale) {
1881 if (BaseOffset.isNonZero() &&
1882 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1883 BaseOffset.isScalable() != MaxOffset.isScalable()))
1884 return false;
1885 // Check for overflow.
1886 int64_t Base = BaseOffset.getKnownMinValue();
1887 int64_t Min = MinOffset.getKnownMinValue();
1888 int64_t Max = MaxOffset.getKnownMinValue();
1889 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1890 return false;
1891 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1892 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1893 return false;
1894 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1895
1896 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1897 HasBaseReg, Scale) &&
1898 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1899 HasBaseReg, Scale);
1900}
1901
1902static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1903 Immediate MinOffset, Immediate MaxOffset,
1904 LSRUse::KindType Kind, MemAccessTy AccessTy,
1905 const Formula &F, const Loop &L) {
1906 // For the purpose of isAMCompletelyFolded either having a canonical formula
1907 // or a scale not equal to zero is correct.
1908 // Problems may arise from non-canonical formulae having a scale == 0.
1909 // Strictly speaking it would be best to just rely on canonical formulae.
1910 // However, when we generate the scaled formulae, we first check that the
1911 // scaling factor is profitable before computing the actual ScaledReg for
1912 // compile time's sake.
1913 assert((F.isCanonical(L) || F.Scale != 0));
1914 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1915 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1916}
1917
1918/// Test whether we know how to expand the current formula.
1919static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1920 Immediate MaxOffset, LSRUse::KindType Kind,
1921 MemAccessTy AccessTy, GlobalValue *BaseGV,
1922 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1923 // We know how to expand completely foldable formulae.
1924 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1925 BaseOffset, HasBaseReg, Scale) ||
1926 // Or formulae that use a base register produced by a sum of base
1927 // registers.
1928 (Scale == 1 &&
1929 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1930 BaseGV, BaseOffset, true, 0));
1931}
1932
1933static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1934 Immediate MaxOffset, LSRUse::KindType Kind,
1935 MemAccessTy AccessTy, const Formula &F) {
1936 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1937 F.BaseOffset, F.HasBaseReg, F.Scale);
1938}
1939
1940static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1941 Immediate Offset) {
1942 if (Offset.isScalable())
1943 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1944
1945 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1946}
1947
1948static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1949 const LSRUse &LU, const Formula &F) {
1950 // Target may want to look at the user instructions.
1951 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1952 for (const LSRFixup &Fixup : LU.Fixups)
1953 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1954 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1955 F.Scale, Fixup.UserInst))
1956 return false;
1957 return true;
1958 }
1959
1960 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1961 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1962 F.Scale);
1963}
1964
1965static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1966 const LSRUse &LU, const Formula &F,
1967 const Loop &L) {
1968 if (!F.Scale)
1969 return 0;
1970
1971 // If the use is not completely folded in that instruction, we will have to
1972 // pay an extra cost only for scale != 1.
1973 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1974 LU.AccessTy, F, L))
1975 return F.Scale != 1;
1976
1977 switch (LU.Kind) {
1978 case LSRUse::Address: {
1979 // Check the scaling factor cost with both the min and max offsets.
1980 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1981 if (F.BaseOffset.isScalable()) {
1982 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1983 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1984 } else {
1985 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1986 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1987 }
1988 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1989 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1990 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1991 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1992 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1993 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1994
1995 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1996 "Legal addressing mode has an illegal cost!");
1997 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1998 }
1999 case LSRUse::ICmpZero:
2000 case LSRUse::Basic:
2001 case LSRUse::Special:
2002 // The use is completely folded, i.e., everything is folded into the
2003 // instruction.
2004 return 0;
2005 }
2006
2007 llvm_unreachable("Invalid LSRUse Kind!");
2008}
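// Illustrative example (assumed target behaviour): for an Address use with
// F.Scale == 2 the cost is queried at both BaseOffset + MinOffset and
// BaseOffset + MaxOffset; if the target folds [reg + reg*2 + imm] for free at
// both extremes the result is 0, otherwise the worse of the two answers is
// charged. A formula that is not completely folded costs 1 unless its scale
// is exactly 1.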
2009
2010static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2011 LSRUse::KindType Kind, MemAccessTy AccessTy,
2012 GlobalValue *BaseGV, Immediate BaseOffset,
2013 bool HasBaseReg) {
2014 // Fast-path: zero is always foldable.
2015 if (BaseOffset.isZero() && !BaseGV)
2016 return true;
2017
2018 // Conservatively, create an address with an immediate and a
2019 // base and a scale.
2020 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2021
2022 // Canonicalize a scale of 1 to a base register if the formula doesn't
2023 // already have a base register.
2024 if (!HasBaseReg && Scale == 1) {
2025 Scale = 0;
2026 HasBaseReg = true;
2027 }
2028
2029 // FIXME: Try with + without a scale? Maybe based on TTI?
2030 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2031 // default for many architectures, not just AArch64 SVE. More investigation
2032 // needed later to determine if this should be used more widely than just
2033 // on scalable types.
2034 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2035 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2036 Scale = 0;
2037
2038 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2039 HasBaseReg, Scale);
2040}
2041
2042static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2043 ScalarEvolution &SE, Immediate MinOffset,
2044 Immediate MaxOffset, LSRUse::KindType Kind,
2045 MemAccessTy AccessTy, const SCEV *S,
2046 bool HasBaseReg) {
2047 // Fast-path: zero is always foldable.
2048 if (S->isZero()) return true;
2049
2050 // Conservatively, create an address with an immediate and a
2051 // base and a scale.
2052 Immediate BaseOffset = ExtractImmediate(S, SE);
2053 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2054
2055 // If there's anything else involved, it's not foldable.
2056 if (!S->isZero()) return false;
2057
2058 // Fast-path: zero is always foldable.
2059 if (BaseOffset.isZero() && !BaseGV)
2060 return true;
2061
2062 if (BaseOffset.isScalable())
2063 return false;
2064
2065 // Conservatively, create an address with an immediate and a
2066 // base and a scale.
2067 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2068
2069 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2070 BaseOffset, HasBaseReg, Scale);
2071}
2072
2073namespace {
2074
2075/// An individual increment in a Chain of IV increments. Relate an IV user to
2076/// an expression that computes the IV it uses from the IV used by the previous
2077/// link in the Chain.
2078///
2079/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2080/// original IVOperand. The head of the chain's IVOperand is only valid during
2081/// chain collection, before LSR replaces IV users. During chain generation,
2082/// IncExpr can be used to find the new IVOperand that computes the same
2083/// expression.
2084struct IVInc {
2085 Instruction *UserInst;
2086 Value* IVOperand;
2087 const SCEV *IncExpr;
2088
2089 IVInc(Instruction *U, Value *O, const SCEV *E)
2090 : UserInst(U), IVOperand(O), IncExpr(E) {}
2091};
2092
2093// The list of IV increments in program order. We typically add the head of a
2094// chain without finding subsequent links.
2095struct IVChain {
2096 SmallVector<IVInc, 1> Incs;
2097 const SCEV *ExprBase = nullptr;
2098
2099 IVChain() = default;
2100 IVChain(const IVInc &Head, const SCEV *Base)
2101 : Incs(1, Head), ExprBase(Base) {}
2102
2103 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2104
2105 // Return the first increment in the chain.
2106 const_iterator begin() const {
2107 assert(!Incs.empty());
2108 return std::next(Incs.begin());
2109 }
2110 const_iterator end() const {
2111 return Incs.end();
2112 }
2113
2114 // Returns true if this chain contains any increments beyond the head.
2115 bool hasIncs() const { return Incs.size() >= 2; }
2116
2117 // Add an IVInc to the end of this chain.
2118 void add(const IVInc &X) { Incs.push_back(X); }
2119
2120 // Returns the last UserInst in the chain.
2121 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2122
2123 // Returns true if IncExpr can be profitably added to this chain.
2124 bool isProfitableIncrement(const SCEV *OperExpr,
2125 const SCEV *IncExpr,
2126 ScalarEvolution&);
2127};
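// Hypothetical chain, for illustration only:
//   %p1 = getelementptr i8, ptr %base, i64 %iv
//   %p2 = getelementptr i8, ptr %base, i64 %iv.4   ; %iv.4 = %iv + 4
//   %p3 = getelementptr i8, ptr %base, i64 %iv.8   ; %iv.8 = %iv + 8
// The head is (%p1, %iv) with an absolute IncExpr; the two links carry
// IncExprs of +4 each, so every address can be formed from the previous one
// by a small constant increment instead of keeping %iv live across all three
// users.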
2128
2129/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2130/// between FarUsers that definitely cross IV increments and NearUsers that may
2131/// be used between IV increments.
2132struct ChainUsers {
2133 SmallPtrSet<Instruction*, 4> FarUsers;
2134 SmallPtrSet<Instruction*, 4> NearUsers;
2135};
2136
2137/// This class holds state for the main loop strength reduction logic.
2138class LSRInstance {
2139 IVUsers &IU;
2140 ScalarEvolution &SE;
2141 DominatorTree &DT;
2142 LoopInfo &LI;
2143 AssumptionCache &AC;
2144 TargetLibraryInfo &TLI;
2145 const TargetTransformInfo &TTI;
2146 Loop *const L;
2147 MemorySSAUpdater *MSSAU;
2148 TTI::AddressingModeKind AMK;
2149 mutable SCEVExpander Rewriter;
2150 bool Changed = false;
2151 bool HardwareLoopProfitable = false;
2152
2153 /// This is the insert position that the current loop's induction variable
2154 /// increment should be placed. In simple loops, this is the latch block's
2155 /// terminator. But in more complicated cases, this is a position which will
2156 /// dominate all the in-loop post-increment users.
2157 Instruction *IVIncInsertPos = nullptr;
2158
2159 /// Interesting factors between use strides.
2160 ///
2161 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2162 /// default, a SmallDenseSet, because we need to use the full range of
2163 /// int64_ts, and there's currently no good way of doing that with
2164 /// SmallDenseSet.
2165 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2166
2167 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2168 /// the solution is not profitable.
2169 Cost BaselineCost;
2170
2171 /// Interesting use types, to facilitate truncation reuse.
2172 SmallSetVector<Type *, 4> Types;
2173
2174 /// The list of interesting uses.
2175 SmallVector<LSRUse, 16> Uses;
2176
2177 /// Track which uses use which register candidates.
2178 RegUseTracker RegUses;
2179
2180 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2181 // have more than a few IV increment chains in a loop. Missing a Chain falls
2182 // back to normal LSR behavior for those uses.
2183 static const unsigned MaxChains = 8;
2184
2185 /// IV users can form a chain of IV increments.
2186 SmallVector<IVChain, MaxChains> IVChainVec;
2187
2188 /// IV users that belong to profitable IVChains.
2189 SmallPtrSet<Use*, MaxChains> IVIncSet;
2190
2191 /// Induction variables that were generated and inserted by the SCEV Expander.
2192 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2193
2194 // Inserting instructions in the loop and using them as a PHI's input could
2195 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2196 // corresponding incoming block is not loop exiting). So collect all such
2197 // instructions to form LCSSA for them later.
2198 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2199
2200 void OptimizeShadowIV();
2201 bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2202 Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
2203 void OptimizeLoopTermCond();
2204
2205 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2206 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2207 void FinalizeChain(IVChain &Chain);
2208 void CollectChains();
2209 void GenerateIVChain(const IVChain &Chain,
2210 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2211
2212 void CollectInterestingTypesAndFactors();
2213 void CollectFixupsAndInitialFormulae();
2214
2215 // Support for sharing of LSRUses between LSRFixups.
2216 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2217 UseMapTy UseMap;
2218
2219 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2220 LSRUse::KindType Kind, MemAccessTy AccessTy);
2221
2222 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2223 MemAccessTy AccessTy);
2224
2225 void DeleteUse(LSRUse &LU, size_t LUIdx);
2226
2227 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2228
2229 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2230 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2231 void CountRegisters(const Formula &F, size_t LUIdx);
2232 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2233 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2234
2235 void CollectLoopInvariantFixupsAndFormulae();
2236
2237 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2238 unsigned Depth = 0);
2239
2240 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2241 const Formula &Base, unsigned Depth,
2242 size_t Idx, bool IsScaledReg = false);
2243 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2244 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2245 const Formula &Base, size_t Idx,
2246 bool IsScaledReg = false);
2247 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2248 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2249 const Formula &Base,
2250 const SmallVectorImpl<Immediate> &Worklist,
2251 size_t Idx, bool IsScaledReg = false);
2252 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2253 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2254 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2255 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2256 void GenerateCrossUseConstantOffsets();
2257 void GenerateAllReuseFormulae();
2258
2259 void FilterOutUndesirableDedicatedRegisters();
2260
2261 size_t EstimateSearchSpaceComplexity() const;
2262 void NarrowSearchSpaceByDetectingSupersets();
2263 void NarrowSearchSpaceByCollapsingUnrolledCode();
2264 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2265 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2266 void NarrowSearchSpaceByFilterPostInc();
2267 void NarrowSearchSpaceByDeletingCostlyFormulas();
2268 void NarrowSearchSpaceByPickingWinnerRegs();
2269 void NarrowSearchSpaceUsingHeuristics();
2270
2271 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2272 Cost &SolutionCost,
2273 SmallVectorImpl<const Formula *> &Workspace,
2274 const Cost &CurCost,
2275 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2276 DenseSet<const SCEV *> &VisitedRegs) const;
2277 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2278
2279 BasicBlock::iterator
2280 HoistInsertPosition(BasicBlock::iterator IP,
2281 const SmallVectorImpl<Instruction *> &Inputs) const;
2282 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2283 const LSRFixup &LF,
2284 const LSRUse &LU) const;
2285
2286 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2287 BasicBlock::iterator IP,
2288 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2289 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2290 const Formula &F,
2291 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2292 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2293 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2294 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2295
2296public:
2297 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2298 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2299 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2300
2301 bool getChanged() const { return Changed; }
2302 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2303 return ScalarEvolutionIVs;
2304 }
2305
2306 void print_factors_and_types(raw_ostream &OS) const;
2307 void print_fixups(raw_ostream &OS) const;
2308 void print_uses(raw_ostream &OS) const;
2309 void print(raw_ostream &OS) const;
2310 void dump() const;
2311};
2312
2313} // end anonymous namespace
2314
2315/// If IV is used in an int-to-float cast inside the loop then try to eliminate
2316/// the cast operation.
2317void LSRInstance::OptimizeShadowIV() {
2318 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2319 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2320 return;
2321
2322 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2323 UI != E; /* empty */) {
2324 IVUsers::const_iterator CandidateUI = UI;
2325 ++UI;
2326 Instruction *ShadowUse = CandidateUI->getUser();
2327 Type *DestTy = nullptr;
2328 bool IsSigned = false;
2329
2330 /* If shadow use is an int->float cast then insert a second IV
2331 to eliminate this cast.
2332
2333 for (unsigned i = 0; i < n; ++i)
2334 foo((double)i);
2335
2336 is transformed into
2337
2338 double d = 0.0;
2339 for (unsigned i = 0; i < n; ++i, ++d)
2340 foo(d);
2341 */
2342 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2343 IsSigned = false;
2344 DestTy = UCast->getDestTy();
2345 }
2346 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2347 IsSigned = true;
2348 DestTy = SCast->getDestTy();
2349 }
2350 if (!DestTy) continue;
2351
2352 // If target does not support DestTy natively then do not apply
2353 // this transformation.
2354 if (!TTI.isTypeLegal(DestTy)) continue;
2355
2356 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2357 if (!PH) continue;
2358 if (PH->getNumIncomingValues() != 2) continue;
2359
2360 // If the calculation in integers overflows, the result in FP type will
2361 // differ. So we can only do this transformation if we are guaranteed not to
2362 // deal with overflowing values.
2363 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2364 if (!AR) continue;
2365 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2366 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2367
2368 Type *SrcTy = PH->getType();
2369 int Mantissa = DestTy->getFPMantissaWidth();
2370 if (Mantissa == -1) continue;
2371 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2372 continue;
2373
2374 unsigned Entry, Latch;
2375 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2376 Entry = 0;
2377 Latch = 1;
2378 } else {
2379 Entry = 1;
2380 Latch = 0;
2381 }
2382
2383 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2384 if (!Init) continue;
2385 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2386 (double)Init->getSExtValue() :
2387 (double)Init->getZExtValue());
2388
2389 BinaryOperator *Incr =
2390 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2391 if (!Incr) continue;
2392 if (Incr->getOpcode() != Instruction::Add
2393 && Incr->getOpcode() != Instruction::Sub)
2394 continue;
2395
2396 /* Initialize new IV, double d = 0.0 in above example. */
2397 ConstantInt *C = nullptr;
2398 if (Incr->getOperand(0) == PH)
2399 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2400 else if (Incr->getOperand(1) == PH)
2401 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2402 else
2403 continue;
2404
2405 if (!C) continue;
2406
2407 // Ignore negative constants, as the code below doesn't handle them
2408 // correctly. TODO: Remove this restriction.
2409 if (!C->getValue().isStrictlyPositive())
2410 continue;
2411
2412 /* Add new PHINode. */
2413 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2414 NewPH->setDebugLoc(PH->getDebugLoc());
2415
2416 /* create new increment. '++d' in above example. */
2417 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2418 BinaryOperator *NewIncr = BinaryOperator::Create(
2419 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2420 : Instruction::FSub,
2421 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2422 NewIncr->setDebugLoc(Incr->getDebugLoc());
2423
2424 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2425 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2426
2427 /* Remove cast operation */
2428 ShadowUse->replaceAllUsesWith(NewPH);
2429 ShadowUse->eraseFromParent();
2430 Changed = true;
2431 break;
2432 }
2433}
2434
2435/// If Cond has an operand that is an expression of an IV, set the IV user and
2436/// stride information and return true, otherwise return false.
2437bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2438 for (IVStrideUse &U : IU)
2439 if (U.getUser() == Cond) {
2440 // NOTE: we could handle setcc instructions with multiple uses here, but
2441 // InstCombine does that as well for simple uses; it's not clear that this
2442 // occurs often enough in real life to be worth handling.
2443 CondUse = &U;
2444 return true;
2445 }
2446 return false;
2447}
2448
2449/// Rewrite the loop's terminating condition if it uses a max computation.
2450///
2451/// This is a narrow solution to a specific, but acute, problem. For loops
2452/// like this:
2453///
2454/// i = 0;
2455/// do {
2456/// p[i] = 0.0;
2457/// } while (++i < n);
2458///
2459/// the trip count isn't just 'n', because 'n' might not be positive. And
2460/// unfortunately this can come up even for loops where the user didn't use
2461/// a C do-while loop. For example, seemingly well-behaved top-test loops
2462/// will commonly be lowered like this:
2463///
2464/// if (n > 0) {
2465/// i = 0;
2466/// do {
2467/// p[i] = 0.0;
2468/// } while (++i < n);
2469/// }
2470///
2471/// and then it's possible for subsequent optimization to obscure the if
2472/// test in such a way that indvars can't find it.
2473///
2474/// When indvars can't find the if test in loops like this, it creates a
2475/// max expression, which allows it to give the loop a canonical
2476/// induction variable:
2477///
2478/// i = 0;
2479/// max = n < 1 ? 1 : n;
2480/// do {
2481/// p[i] = 0.0;
2482/// } while (++i != max);
2483///
2484/// Canonical induction variables are necessary because the loop passes
2485/// are designed around them. The most obvious example of this is the
2486/// LoopInfo analysis, which doesn't remember trip count values. It
2487/// expects to be able to rediscover the trip count each time it is
2488/// needed, and it does this using a simple analysis that only succeeds if
2489/// the loop has a canonical induction variable.
2490///
2491/// However, when it comes time to generate code, the maximum operation
2492/// can be quite costly, especially if it's inside of an outer loop.
2493///
2494/// This function solves this problem by detecting this type of loop and
2495/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2496/// the instructions for the maximum computation.
2497Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
2498 // Check that the loop matches the pattern we're looking for.
2499 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2500 Cond->getPredicate() != CmpInst::ICMP_NE)
2501 return Cond;
2502
2503 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2504 if (!Sel || !Sel->hasOneUse()) return Cond;
2505
2506 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2507 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2508 return Cond;
2509 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2510
2511 // Add one to the backedge-taken count to get the trip count.
2512 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2513 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2514
2515 // Check for a max calculation that matches the pattern. There's no check
2516 // for ICMP_ULE here because the comparison would be with zero, which
2517 // isn't interesting.
2518 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2519 const SCEVNAryExpr *Max = nullptr;
2520 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2521 Pred = ICmpInst::ICMP_SLE;
2522 Max = S;
2523 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2524 Pred = ICmpInst::ICMP_SLT;
2525 Max = S;
2526 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2527 Pred = ICmpInst::ICMP_ULT;
2528 Max = U;
2529 } else {
2530 // No match; bail.
2531 return Cond;
2532 }
2533
2534 // To handle a max with more than two operands, this optimization would
2535 // require additional checking and setup.
2536 if (Max->getNumOperands() != 2)
2537 return Cond;
2538
2539 const SCEV *MaxLHS = Max->getOperand(0);
2540 const SCEV *MaxRHS = Max->getOperand(1);
2541
2542 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2543 // for a comparison with 1. For <= and >=, a comparison with zero.
2544 if (!MaxLHS ||
2545 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2546 return Cond;
2547
2548 // Check the relevant induction variable for conformance to
2549 // the pattern.
2550 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2551 if (!match(IV,
2553 return Cond;
2554
2555 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2556 "Loop condition operand is an addrec in a different loop!");
2557
2558 // Check the right operand of the select, and remember it, as it will
2559 // be used in the new comparison instruction.
2560 Value *NewRHS = nullptr;
2561 if (ICmpInst::isTrueWhenEqual(Pred)) {
2562 // Look for n+1, and grab n.
2563 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2564 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2565 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2566 NewRHS = BO->getOperand(0);
2567 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2568 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2569 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2570 NewRHS = BO->getOperand(0);
2571 if (!NewRHS)
2572 return Cond;
2573 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2574 NewRHS = Sel->getOperand(1);
2575 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2576 NewRHS = Sel->getOperand(2);
2577 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2578 NewRHS = SU->getValue();
2579 else
2580 // Max doesn't match expected pattern.
2581 return Cond;
2582
2583 // Determine the new comparison opcode. It may be signed or unsigned,
2584 // and the original comparison may be either equality or inequality.
2585 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2586 Pred = CmpInst::getInversePredicate(Pred);
2587
2588 // Ok, everything looks ok to change the condition into an SLT or SGE and
2589 // delete the max calculation.
2590 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2591 Cond->getOperand(0), NewRHS, "scmp");
2592
2593 // Delete the max calculation instructions.
2594 NewCond->setDebugLoc(Cond->getDebugLoc());
2595 Cond->replaceAllUsesWith(NewCond);
2596 CondUse->setUser(NewCond);
2597 Instruction *Cmp = cast<Instruction>(Sel->getCondition());
2598 Cond->eraseFromParent();
2599 Sel->eraseFromParent();
2600 if (Cmp->use_empty()) {
2601 salvageDebugInfo(*Cmp);
2602 Cmp->eraseFromParent();
2603 }
2604 return NewCond;
2605}
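// Sketch of the rewrite (hypothetical IR, smax-based trip count):
//   before: %max = select i1 (icmp slt i64 %n, 1), i64 1, i64 %n
//           %c   = icmp ne i64 %i.next, %max
//   after:  %scmp = icmp slt i64 %i.next, %n
// The old condition and the select are erased, along with the select's
// feeding compare if it is now dead.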
2606
2607/// Change loop terminating condition to use the postinc iv when possible.
2608void
2609LSRInstance::OptimizeLoopTermCond() {
2610 SmallPtrSet<Instruction *, 4> PostIncs;
2611
2612 // We need a different set of heuristics for rotated and non-rotated loops.
2613 // If a loop is rotated then the latch is also the backedge, so inserting
2614 // post-inc expressions just before the latch is ideal. To reduce live ranges
2615 // it also makes sense to rewrite terminating conditions to use post-inc
2616 // expressions.
2617 //
2618 // If the loop is not rotated then the latch is not an exiting block; the exit
2619 // check is done in the loop head. Adding post-inc expressions before the
2620 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2621 // in the loop body. In this case we do *not* want to use post-inc expressions
2622 // in the latch check, and we want to insert post-inc expressions before
2623 // the backedge.
2624 BasicBlock *LatchBlock = L->getLoopLatch();
2625 SmallVector<BasicBlock*, 8> ExitingBlocks;
2626 L->getExitingBlocks(ExitingBlocks);
2627 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2628 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2629 IVIncInsertPos = LatchBlock->getTerminator();
2630 return;
2631 }
2632
2633 // Otherwise treat this as a rotated loop.
2634 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2635 // Get the terminating condition for the loop if possible. If we
2636 // can, we want to change it to use a post-incremented version of its
2637 // induction variable, to allow coalescing the live ranges for the IV into
2638 // one register value.
2639
2640 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2641 if (!TermBr || TermBr->isUnconditional())
2642 continue;
2643
2644 Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
2645 // If the argument to TermBr is an extractelement, then the source of that
2646 // instruction is what generated the condition.
2647 auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
2648 if (Extract)
2649 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2650 // FIXME: We could do more here, like handling logical operations where one
2651 // side is a cmp that uses an induction variable.
2652 if (!Cond)
2653 continue;
2654
2655 // Search IVUsesByStride to find Cond's IVUse if there is one.
2656 IVStrideUse *CondUse = nullptr;
2657 if (!FindIVUserForCond(Cond, CondUse))
2658 continue;
2659
2660 // If the trip count is computed in terms of a max (due to ScalarEvolution
2661 // being unable to find a sufficient guard, for example), change the loop
2662 // comparison to use SLT or ULT instead of NE.
2663 // One consequence of doing this now is that it disrupts the count-down
2664 // optimization. That's not always a bad thing though, because in such
2665 // cases it may still be worthwhile to avoid a max.
2666 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2667 Cond = OptimizeMax(Cmp, CondUse);
2668
2669 // If this exiting block dominates the latch block, it may also use
2670 // the post-inc value if it won't be shared with other uses.
2671 // Check for dominance.
2672 if (!DT.dominates(ExitingBlock, LatchBlock))
2673 continue;
2674
2675 // Conservatively avoid trying to use the post-inc value in non-latch
2676 // exits if there may be pre-inc users in intervening blocks.
2677 if (LatchBlock != ExitingBlock)
2678 for (const IVStrideUse &UI : IU)
2679 // Test if the use is reachable from the exiting block. This dominator
2680 // query is a conservative approximation of reachability.
2681 if (&UI != CondUse &&
2682 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2683 // Conservatively assume there may be reuse if the quotient of their
2684 // strides could be a legal scale.
2685 const SCEV *A = IU.getStride(*CondUse, L);
2686 const SCEV *B = IU.getStride(UI, L);
2687 if (!A || !B) continue;
2688 if (SE.getTypeSizeInBits(A->getType()) !=
2689 SE.getTypeSizeInBits(B->getType())) {
2690 if (SE.getTypeSizeInBits(A->getType()) >
2691 SE.getTypeSizeInBits(B->getType()))
2692 B = SE.getSignExtendExpr(B, A->getType());
2693 else
2694 A = SE.getSignExtendExpr(A, B->getType());
2695 }
2696 if (const SCEVConstant *D =
2697 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2698 const ConstantInt *C = D->getValue();
2699 // Stride of one or negative one can have reuse with non-addresses.
2700 if (C->isOne() || C->isMinusOne())
2701 goto decline_post_inc;
2702 // Avoid weird situations.
2703 if (C->getValue().getSignificantBits() >= 64 ||
2704 C->getValue().isMinSignedValue())
2705 goto decline_post_inc;
2706 // Check for possible scaled-address reuse.
2707 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2708 MemAccessTy AccessTy =
2709 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2710 int64_t Scale = C->getSExtValue();
2711 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2712 /*BaseOffset=*/0,
2713 /*HasBaseReg=*/true, Scale,
2714 AccessTy.AddrSpace))
2715 goto decline_post_inc;
2716 Scale = -Scale;
2717 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2718 /*BaseOffset=*/0,
2719 /*HasBaseReg=*/true, Scale,
2720 AccessTy.AddrSpace))
2721 goto decline_post_inc;
2722 }
2723 }
2724 }
2725
2726 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2727 << *Cond << '\n');
2728
2729 // It's possible for the setcc instruction to be anywhere in the loop, and
2730 // possible for it to have multiple users. If it is not immediately before
2731 // the exiting block branch, move it.
2732 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2733 !Extract) {
2734 if (Cond->hasOneUse()) {
2735 Cond->moveBefore(TermBr->getIterator());
2736 } else {
2737 // Clone the terminating condition and insert into the loopend.
2738 Instruction *OldCond = Cond;
2739 Cond = Cond->clone();
2740 Cond->setName(L->getHeader()->getName() + ".termcond");
2741 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2742
2743 // Clone the IVUse, as the old use still exists!
2744 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2745 TermBr->replaceUsesOfWith(OldCond, Cond);
2746 }
2747 }
2748
2749 // If we get to here, we know that we can transform the setcc instruction to
2750 // use the post-incremented version of the IV, allowing us to coalesce the
2751 // live ranges for the IV correctly.
2752 CondUse->transformToPostInc(L);
2753 Changed = true;
2754
2755 PostIncs.insert(Cond);
2756 decline_post_inc:;
2757 }
2758
2759 // Determine an insertion point for the loop induction variable increment. It
2760 // must dominate all the post-inc comparisons we just set up, and it must
2761 // dominate the loop latch edge.
2762 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2763 for (Instruction *Inst : PostIncs)
2764 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2765}
2766
2767/// Determine if the given use can accommodate a fixup at the given offset and
2768/// other details. If so, update the use and return true.
2769bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2770 bool HasBaseReg, LSRUse::KindType Kind,
2771 MemAccessTy AccessTy) {
2772 Immediate NewMinOffset = LU.MinOffset;
2773 Immediate NewMaxOffset = LU.MaxOffset;
2774 MemAccessTy NewAccessTy = AccessTy;
2775
2776 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2777 // something conservative, however this can pessimize in the case that one of
2778 // the uses will have all its uses outside the loop, for example.
2779 if (LU.Kind != Kind)
2780 return false;
2781
2782 // Check for a mismatched access type, and fall back conservatively as needed.
2783 // TODO: Be less conservative when the type is similar and can use the same
2784 // addressing modes.
2785 if (Kind == LSRUse::Address) {
2786 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2787 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2788 AccessTy.AddrSpace);
2789 }
2790 }
2791
2792 // Conservatively assume HasBaseReg is true for now.
2793 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2794 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2795 LU.MaxOffset - NewOffset, HasBaseReg))
2796 return false;
2797 NewMinOffset = NewOffset;
2798 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2799 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2800 NewOffset - LU.MinOffset, HasBaseReg))
2801 return false;
2802 NewMaxOffset = NewOffset;
2803 }
2804
2805 // FIXME: We should be able to handle some level of scalable offset support
2806 // for 'void', but in order to get basic support up and running this is
2807 // being left out.
2808 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2809 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2810 return false;
2811
2812 // Update the use.
2813 LU.MinOffset = NewMinOffset;
2814 LU.MaxOffset = NewMaxOffset;
2815 LU.AccessTy = NewAccessTy;
2816 return true;
2817}
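// Worked example (offsets are illustrative): suppose an Address use currently
// spans [MinOffset, MaxOffset] = [0, 32] and a new fixup arrives at offset -8.
// It is accepted only if an access at distance MaxOffset - NewOffset = 40 from
// a base register is still foldable on the target; if so the use's range
// widens to [-8, 32].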
2818
2819/// Return an LSRUse index and an offset value for a fixup which needs the given
2820/// expression, with the given kind and optional access type. Either reuse an
2821/// existing use or create a new one, as needed.
2822std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2823 LSRUse::KindType Kind,
2824 MemAccessTy AccessTy) {
2825 const SCEV *Copy = Expr;
2826 Immediate Offset = ExtractImmediate(Expr, SE);
2827
2828 // Basic uses can't accept any offset, for example.
2829 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2830 Offset, /*HasBaseReg=*/ true)) {
2831 Expr = Copy;
2832 Offset = Immediate::getFixed(0);
2833 }
2834
2835 std::pair<UseMapTy::iterator, bool> P =
2836 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2837 if (!P.second) {
2838 // A use already existed with this base.
2839 size_t LUIdx = P.first->second;
2840 LSRUse &LU = Uses[LUIdx];
2841 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2842 // Reuse this use.
2843 return std::make_pair(LUIdx, Offset);
2844 }
2845
2846 // Create a new use.
2847 size_t LUIdx = Uses.size();
2848 P.first->second = LUIdx;
2849 Uses.push_back(LSRUse(Kind, AccessTy));
2850 LSRUse &LU = Uses[LUIdx];
2851
2852 LU.MinOffset = Offset;
2853 LU.MaxOffset = Offset;
2854 return std::make_pair(LUIdx, Offset);
2855}
2856
2857/// Delete the given use from the Uses list.
2858void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2859 if (&LU != &Uses.back())
2860 std::swap(LU, Uses.back());
2861 Uses.pop_back();
2862
2863 // Update RegUses.
2864 RegUses.swapAndDropUse(LUIdx, Uses.size());
2865}
2866
2867/// Look for a use distinct from OrigLU which has a formula with the same
2868/// registers as the given formula.
2869LSRUse *
2870LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2871 const LSRUse &OrigLU) {
2872 // Search all uses for the formula. This could be more clever.
2873 for (LSRUse &LU : Uses) {
2874 // Check whether this use is close enough to OrigLU, to see whether it's
2875 // worthwhile looking through its formulae.
2876 // Ignore ICmpZero uses because they may contain formulae generated by
2877 // GenerateICmpZeroScales, in which case adding fixup offsets may
2878 // be invalid.
2879 if (&LU != &OrigLU &&
2880 LU.Kind != LSRUse::ICmpZero &&
2881 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2882 LU.WidestFixupType == OrigLU.WidestFixupType &&
2883 LU.HasFormulaWithSameRegs(OrigF)) {
2884 // Scan through this use's formulae.
2885 for (const Formula &F : LU.Formulae) {
2886 // Check to see if this formula has the same registers and symbols
2887 // as OrigF.
2888 if (F.BaseRegs == OrigF.BaseRegs &&
2889 F.ScaledReg == OrigF.ScaledReg &&
2890 F.BaseGV == OrigF.BaseGV &&
2891 F.Scale == OrigF.Scale &&
2892 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2893 if (F.BaseOffset.isZero())
2894 return &LU;
2895 // This is the formula where all the registers and symbols matched;
2896 // there aren't going to be any others. Since we declined it, we
2897 // can skip the rest of the formulae and proceed to the next LSRUse.
2898 break;
2899 }
2900 }
2901 }
2902 }
2903
2904 // Nothing looked good.
2905 return nullptr;
2906}
2907
2908void LSRInstance::CollectInterestingTypesAndFactors() {
2909 SmallSetVector<const SCEV *, 4> Strides;
2910
2911 // Collect interesting types and strides.
2912 SmallVector<const SCEV *, 4> Worklist;
2913 for (const IVStrideUse &U : IU) {
2914 const SCEV *Expr = IU.getExpr(U);
2915 if (!Expr)
2916 continue;
2917
2918 // Collect interesting types.
2919 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2920
2921 // Add strides for mentioned loops.
2922 Worklist.push_back(Expr);
2923 do {
2924 const SCEV *S = Worklist.pop_back_val();
2925 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2926 if (AR->getLoop() == L)
2927 Strides.insert(AR->getStepRecurrence(SE));
2928 Worklist.push_back(AR->getStart());
2929 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2930 append_range(Worklist, Add->operands());
2931 }
2932 } while (!Worklist.empty());
2933 }
2934
2935 // Compute interesting factors from the set of interesting strides.
2936 for (SmallSetVector<const SCEV *, 4>::const_iterator
2937 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2938 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2939 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2940 const SCEV *OldStride = *I;
2941 const SCEV *NewStride = *NewStrideIter;
2942
2943 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2944 SE.getTypeSizeInBits(NewStride->getType())) {
2945 if (SE.getTypeSizeInBits(OldStride->getType()) >
2946 SE.getTypeSizeInBits(NewStride->getType()))
2947 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2948 else
2949 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2950 }
2951 if (const SCEVConstant *Factor =
2952 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2953 SE, true))) {
2954 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2955 Factors.insert(Factor->getAPInt().getSExtValue());
2956 } else if (const SCEVConstant *Factor =
2957 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2958 NewStride,
2959 SE, true))) {
2960 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2961 Factors.insert(Factor->getAPInt().getSExtValue());
2962 }
2963 }
2964
2965 // If all uses use the same type, don't bother looking for truncation-based
2966 // reuse.
2967 if (Types.size() == 1)
2968 Types.clear();
2969
2970 LLVM_DEBUG(print_factors_and_types(dbgs()));
2971}
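// Worked example (strides are illustrative): if the loop contains addrecs with
// strides 4, 8 and -8, the exact quotients 2 (8/4), -2 (-8/4) and -1 (-8/8)
// are recorded as interesting factors; they later seed the generation of
// scaled formulae.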
2972
2973/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2974/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2975/// IVStrideUses, we could partially skip this.
2976static User::op_iterator
2977findIVOperand(User::op_iterator OI, User::op_iterator OE,
2978 Loop *L, ScalarEvolution &SE) {
2979 for(; OI != OE; ++OI) {
2980 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2981 if (!SE.isSCEVable(Oper->getType()))
2982 continue;
2983
2984 if (const SCEVAddRecExpr *AR =
2985 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2986 if (AR->getLoop() == L)
2987 break;
2988 }
2989 }
2990 }
2991 return OI;
2992}
2993
2994/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2995/// a convenient helper.
2996static Value *getWideOperand(Value *Oper) {
2997 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2998 return Trunc->getOperand(0);
2999 return Oper;
3000}
3001
3002/// Return an approximation of this SCEV expression's "base", or NULL for any
3003/// constant. Returning the expression itself is conservative. Returning a
3004/// deeper subexpression is more precise and valid as long as it isn't less
3005/// complex than another subexpression. For expressions involving multiple
3006/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3007/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3008/// IVInc==b-a.
3009///
3010/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3011/// SCEVUnknown, we simply return the rightmost SCEV operand.
3012static const SCEV *getExprBase(const SCEV *S) {
3013 switch (S->getSCEVType()) {
3014 default: // including scUnknown.
3015 return S;
3016 case scConstant:
3017 case scVScale:
3018 return nullptr;
3019 case scTruncate:
3020 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3021 case scZeroExtend:
3022 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3023 case scSignExtend:
3024 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3025 case scAddExpr: {
3026 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3027 // there's nothing more complex.
3028 // FIXME: not sure if we want to recognize negation.
3029 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3030 for (const SCEV *SubExpr : reverse(Add->operands())) {
3031 if (SubExpr->getSCEVType() == scAddExpr)
3032 return getExprBase(SubExpr);
3033
3034 if (SubExpr->getSCEVType() != scMulExpr)
3035 return SubExpr;
3036 }
3037 return S; // all operands are scaled, be conservative.
3038 }
3039 case scAddRecExpr:
3040 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3041 }
3042 llvm_unreachable("Unknown SCEV kind!");
3043}
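// Illustrative example: for S = {(16 + %a),+,8}<%L> the addrec defers to its
// start (16 + %a); scanning that add's operands right to left returns %a, the
// rightmost operand that is neither an add nor a mul. Two accesses into the
// same object therefore share a base even when their constant offsets differ.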
3044
3045/// Return true if the chain increment is profitable to expand into a loop
3046/// invariant value, which may require its own register. A profitable chain
3047/// increment will be an offset relative to the same base. We allow such offsets
3048/// to be used as chain increments as long as it's not obviously
3049/// expensive to expand using real instructions.
3050bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3051 const SCEV *IncExpr,
3052 ScalarEvolution &SE) {
3053 // Aggressively form chains when -stress-ivchain.
3054 if (StressIVChain)
3055 return true;
3056
3057 // Do not replace a constant offset from IV head with a nonconstant IV
3058 // increment.
3059 if (!isa<SCEVConstant>(IncExpr)) {
3060 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3061 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3062 return false;
3063 }
3064
3065 SmallPtrSet<const SCEV*, 8> Processed;
3066 return !isHighCostExpansion(IncExpr, Processed, SE);
3067}
3068
3069/// Return true if the number of registers needed for the chain is estimated to
3070/// be less than the number required for the individual IV users. First prohibit
3071/// any IV users that keep the IV live across increments (the Users set should
3072/// be empty). Next count the number and type of increments in the chain.
3073///
3074/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3075/// effectively use postinc addressing modes. Only consider it profitable if the
3076/// increments can be computed in fewer registers when chained.
3077///
3078/// TODO: Consider IVInc free if it's already used in another chains.
3079static bool isProfitableChain(IVChain &Chain,
3080 SmallPtrSetImpl<Instruction *> &Users,
3081 ScalarEvolution &SE,
3082 const TargetTransformInfo &TTI) {
3083 if (StressIVChain)
3084 return true;
3085
3086 if (!Chain.hasIncs())
3087 return false;
3088
3089 if (!Users.empty()) {
3090 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3091 for (Instruction *Inst
3092 : Users) { dbgs() << " " << *Inst << "\n"; });
3093 return false;
3094 }
3095 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3096
3097 // The chain itself may require a register, so initialize cost to 1.
3098 int cost = 1;
3099
3100 // A complete chain likely eliminates the need for keeping the original IV in
3101 // a register. LSR does not currently know how to form a complete chain unless
3102 // the header phi already exists.
3103 if (isa<PHINode>(Chain.tailUserInst())
3104 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3105 --cost;
3106 }
3107 const SCEV *LastIncExpr = nullptr;
3108 unsigned NumConstIncrements = 0;
3109 unsigned NumVarIncrements = 0;
3110 unsigned NumReusedIncrements = 0;
3111
3112 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3113 return true;
3114
3115 for (const IVInc &Inc : Chain) {
3116 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3117 return true;
3118 if (Inc.IncExpr->isZero())
3119 continue;
3120
3121 // Incrementing by zero or some constant is neutral. We assume constants can
3122 // be folded into an addressing mode or an add's immediate operand.
3123 if (isa<SCEVConstant>(Inc.IncExpr)) {
3124 ++NumConstIncrements;
3125 continue;
3126 }
3127
3128 if (Inc.IncExpr == LastIncExpr)
3129 ++NumReusedIncrements;
3130 else
3131 ++NumVarIncrements;
3132
3133 LastIncExpr = Inc.IncExpr;
3134 }
3135 // An IV chain with a single increment is handled by LSR's postinc
3136 // uses. However, a chain with multiple increments requires keeping the IV's
3137 // value live longer than it needs to be if chained.
3138 if (NumConstIncrements > 1)
3139 --cost;
3140
3141 // Materializing increment expressions in the preheader that didn't exist in
3142 // the original code may cost a register. For example, sign-extended array
3143 // indices can produce ridiculous increments like this:
3144 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3145 cost += NumVarIncrements;
3146
3147 // Reusing variable increments likely saves a register to hold the multiple of
3148 // the stride.
3149 cost -= NumReusedIncrements;
3150
3151 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3152 << "\n");
3153
3154 return cost < 0;
3155}
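// Worked cost example (illustrative): a chain whose tail is the header phi
// (with SCEV equal to the head's IncExpr) starts at cost 1 and drops to 0;
// having more than one constant increment drops it to -1, and with no
// variable or reused increments the final cost is -1 < 0, so the chain is
// considered profitable.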
3156
3157/// Add this IV user to an existing chain or make it the head of a new chain.
3158void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3159 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3160  // When IVs are used at varying type widths, they are generally converted to
3161  // a wider type, with some uses remaining narrow under a (free) trunc.
3162 Value *const NextIV = getWideOperand(IVOper);
3163 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3164 const SCEV *const OperExprBase = getExprBase(OperExpr);
3165
3166  // Visit all existing chains. Check whether this IVOper can be computed as a
3167  // profitable loop-invariant increment from the last link in the chain.
3168 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3169 const SCEV *LastIncExpr = nullptr;
3170 for (; ChainIdx < NChains; ++ChainIdx) {
3171 IVChain &Chain = IVChainVec[ChainIdx];
3172
3173 // Prune the solution space aggressively by checking that both IV operands
3174 // are expressions that operate on the same unscaled SCEVUnknown. This
3175 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3176 // first avoids creating extra SCEV expressions.
3177 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3178 continue;
3179
3180 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3181 if (PrevIV->getType() != NextIV->getType())
3182 continue;
3183
3184 // A phi node terminates a chain.
3185 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3186 continue;
3187
3188 // The increment must be loop-invariant so it can be kept in a register.
3189 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3190 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3191 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3192 continue;
3193
3194 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3195 LastIncExpr = IncExpr;
3196 break;
3197 }
3198 }
3199 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3200 // bother for phi nodes, because they must be last in the chain.
3201 if (ChainIdx == NChains) {
3202 if (isa<PHINode>(UserInst))
3203 return;
3204 if (NChains >= MaxChains && !StressIVChain) {
3205 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3206 return;
3207 }
3208 LastIncExpr = OperExpr;
3209 // IVUsers may have skipped over sign/zero extensions. We don't currently
3210 // attempt to form chains involving extensions unless they can be hoisted
3211 // into this loop's AddRec.
3212 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3213 return;
3214 ++NChains;
3215 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3216 OperExprBase));
3217 ChainUsersVec.resize(NChains);
3218 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3219 << ") IV=" << *LastIncExpr << "\n");
3220 } else {
3221 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3222 << ") IV+" << *LastIncExpr << "\n");
3223 // Add this IV user to the end of the chain.
3224 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3225 }
3226 IVChain &Chain = IVChainVec[ChainIdx];
3227
3228 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3229 // This chain's NearUsers become FarUsers.
3230 if (!LastIncExpr->isZero()) {
3231 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3232 NearUsers.clear();
3233 }
3234
3235 // All other uses of IVOperand become near uses of the chain.
3236 // We currently ignore intermediate values within SCEV expressions, assuming
3237  // they will eventually be used by the current chain, or can be computed
3238  // from one of the chain increments. To be more precise we could
3239  // transitively follow its users and only add leaf IV users to the set.
3240 for (User *U : IVOper->users()) {
3241 Instruction *OtherUse = dyn_cast<Instruction>(U);
3242 if (!OtherUse)
3243 continue;
3244 // Uses in the chain will no longer be uses if the chain is formed.
3245 // Include the head of the chain in this iteration (not Chain.begin()).
3246 IVChain::const_iterator IncIter = Chain.Incs.begin();
3247 IVChain::const_iterator IncEnd = Chain.Incs.end();
3248    for (; IncIter != IncEnd; ++IncIter) {
3249 if (IncIter->UserInst == OtherUse)
3250 break;
3251 }
3252 if (IncIter != IncEnd)
3253 continue;
3254
3255 if (SE.isSCEVable(OtherUse->getType())
3256 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3257 && IU.isIVUserOrOperand(OtherUse)) {
3258 continue;
3259 }
3260 NearUsers.insert(OtherUse);
3261 }
3262
3263 // Since this user is part of the chain, it's no longer considered a use
3264 // of the chain.
3265 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3266}
3267
3268/// Populate the vector of Chains.
3269///
3270/// This decreases ILP at the architecture level. Targets with ample registers,
3271/// multiple memory ports, and no register renaming probably don't want
3272/// this. However, such targets should probably disable LSR altogether.
3273///
3274/// The job of LSR is to make a reasonable choice of induction variables across
3275/// the loop. Subsequent passes can easily "unchain" computation exposing more
3276/// ILP *within the loop* if the target wants it.
3277///
3278/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3279/// will not reorder memory operations, it will recognize this as a chain, but
3280/// will generate redundant IV increments. Ideally this would be corrected later
3281/// by a smart scheduler:
3282/// = A[i]
3283/// = A[i+x]
3284/// A[i] =
3285/// A[i+x] =
3286///
3287/// TODO: Walk the entire domtree within this loop, not just the path to the
3288/// loop latch. This will discover chains on side paths, but requires
3289/// maintaining multiple copies of the Chains state.
3290void LSRInstance::CollectChains() {
3291 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3292 SmallVector<ChainUsers, 8> ChainUsersVec;
3293
3294 SmallVector<BasicBlock *,8> LatchPath;
3295 BasicBlock *LoopHeader = L->getHeader();
3296 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3297 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3298 LatchPath.push_back(Rung->getBlock());
3299 }
3300 LatchPath.push_back(LoopHeader);
3301
3302 // Walk the instruction stream from the loop header to the loop latch.
3303 for (BasicBlock *BB : reverse(LatchPath)) {
3304 for (Instruction &I : *BB) {
3305 // Skip instructions that weren't seen by IVUsers analysis.
3306 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3307 continue;
3308
3309 // Ignore users that are part of a SCEV expression. This way we only
3310 // consider leaf IV Users. This effectively rediscovers a portion of
3311 // IVUsers analysis but in program order this time.
3312 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3313 continue;
3314
3315 // Remove this instruction from any NearUsers set it may be in.
3316 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3317 ChainIdx < NChains; ++ChainIdx) {
3318 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3319 }
3320 // Search for operands that can be chained.
3321 SmallPtrSet<Instruction*, 4> UniqueOperands;
3322 User::op_iterator IVOpEnd = I.op_end();
3323 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3324 while (IVOpIter != IVOpEnd) {
3325 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3326 if (UniqueOperands.insert(IVOpInst).second)
3327 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3328 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3329 }
3330 } // Continue walking down the instructions.
3331 } // Continue walking down the domtree.
3332 // Visit phi backedges to determine if the chain can generate the IV postinc.
3333 for (PHINode &PN : L->getHeader()->phis()) {
3334 if (!SE.isSCEVable(PN.getType()))
3335 continue;
3336
3337 Instruction *IncV =
3338 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3339 if (IncV)
3340 ChainInstruction(&PN, IncV, ChainUsersVec);
3341 }
3342 // Remove any unprofitable chains.
3343 unsigned ChainIdx = 0;
3344 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3345 UsersIdx < NChains; ++UsersIdx) {
3346 if (!isProfitableChain(IVChainVec[UsersIdx],
3347 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3348 continue;
3349    // Preserve the chain at UsersIdx.
3350 if (ChainIdx != UsersIdx)
3351 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3352 FinalizeChain(IVChainVec[ChainIdx]);
3353 ++ChainIdx;
3354 }
3355 IVChainVec.resize(ChainIdx);
3356}
3357
3358void LSRInstance::FinalizeChain(IVChain &Chain) {
3359 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3360 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3361
3362 for (const IVInc &Inc : Chain) {
3363 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3364 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3365 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3366 IVIncSet.insert(UseI);
3367 }
3368}
3369
3370/// Return true if the IVInc can be folded into an addressing mode.
3371static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3372 Value *Operand, const TargetTransformInfo &TTI) {
3373 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3374 Immediate IncOffset = Immediate::getZero();
3375 if (IncConst) {
3376 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3377 return false;
3378 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3379 } else {
3380 // Look for mul(vscale, constant), to detect a scalable offset.
3381 const APInt *C;
3382 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3383 C->getSignificantBits() > 64)
3384 return false;
3385 IncOffset = Immediate::getScalable(C->getSExtValue());
3386 }
3387
3388 if (!isAddressUse(TTI, UserInst, Operand))
3389 return false;
3390
3391 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3392 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3393 IncOffset, /*HasBaseReg=*/false))
3394 return false;
3395
3396 return true;
3397}
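// For illustration: an increment whose SCEV is the constant 16 yields
// Immediate::getFixed(16), while an increment of the form (4 * vscale) is
// matched by the mul(vscale, constant) pattern above and yields
// Immediate::getScalable(4); either is then accepted only if the target can
// fold that offset into the user's addressing mode (isAlwaysFoldable).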
3398
3399/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3400/// user's operand from the previous IV user's operand.
3401void LSRInstance::GenerateIVChain(const IVChain &Chain,
3402 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3403 // Find the new IVOperand for the head of the chain. It may have been replaced
3404 // by LSR.
3405 const IVInc &Head = Chain.Incs[0];
3406 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3407 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3408 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3409 IVOpEnd, L, SE);
3410 Value *IVSrc = nullptr;
3411 while (IVOpIter != IVOpEnd) {
3412 IVSrc = getWideOperand(*IVOpIter);
3413
3414 // If this operand computes the expression that the chain needs, we may use
3415 // it. (Check this after setting IVSrc which is used below.)
3416 //
3417 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3418 // narrow for the chain, so we can no longer use it. We do allow using a
3419 // wider phi, assuming the LSR checked for free truncation. In that case we
3420 // should already have a truncate on this operand such that
3421 // getSCEV(IVSrc) == IncExpr.
3422 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3423 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3424 break;
3425 }
3426 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3427 }
3428 if (IVOpIter == IVOpEnd) {
3429 // Gracefully give up on this chain.
3430 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3431 return;
3432 }
3433 assert(IVSrc && "Failed to find IV chain source");
3434
3435 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3436 Type *IVTy = IVSrc->getType();
3437 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3438 const SCEV *LeftOverExpr = nullptr;
3439  const SCEV *Accum = SE.getZero(IntTy);
3440  SmallVector<std::pair<const SCEV *, Value *>> Bases;
3441  Bases.emplace_back(Accum, IVSrc);
3442
3443 for (const IVInc &Inc : Chain) {
3444 Instruction *InsertPt = Inc.UserInst;
3445 if (isa<PHINode>(InsertPt))
3446 InsertPt = L->getLoopLatch()->getTerminator();
3447
3448 // IVOper will replace the current IV User's operand. IVSrc is the IV
3449 // value currently held in a register.
3450 Value *IVOper = IVSrc;
3451 if (!Inc.IncExpr->isZero()) {
3452 // IncExpr was the result of subtraction of two narrow values, so must
3453 // be signed.
3454 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3455 Accum = SE.getAddExpr(Accum, IncExpr);
3456 LeftOverExpr = LeftOverExpr ?
3457 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3458 }
3459
3460 // Look through each base to see if any can produce a nice addressing mode.
3461 bool FoundBase = false;
3462 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3463 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3464 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3465 if (!Remainder->isZero()) {
3466 Rewriter.clearPostInc();
3467 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3468 const SCEV *IVOperExpr =
3469 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3470 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3471 } else {
3472 IVOper = MapIVOper;
3473 }
3474
3475 FoundBase = true;
3476 break;
3477 }
3478 }
3479 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3480 // Expand the IV increment.
3481 Rewriter.clearPostInc();
3482 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3483 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3484 SE.getUnknown(IncV));
3485 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3486
3487 // If an IV increment can't be folded, use it as the next IV value.
3488 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3489 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3490 Bases.emplace_back(Accum, IVOper);
3491 IVSrc = IVOper;
3492 LeftOverExpr = nullptr;
3493 }
3494 }
3495 Type *OperTy = Inc.IVOperand->getType();
3496 if (IVTy != OperTy) {
3497 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3498 "cannot extend a chained IV");
3499 IRBuilder<> Builder(InsertPt);
3500 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3501 }
3502 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3503 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3504 DeadInsts.emplace_back(OperandIsInstr);
3505 }
3506 // If LSR created a new, wider phi, we may also replace its postinc. We only
3507 // do this if we also found a wide value for the head of the chain.
3508 if (isa<PHINode>(Chain.tailUserInst())) {
3509 for (PHINode &Phi : L->getHeader()->phis()) {
3510 if (Phi.getType() != IVSrc->getType())
3511 continue;
3512      Instruction *PostIncV = dyn_cast<Instruction>(
3513          Phi.getIncomingValueForBlock(L->getLoopLatch()));
3514 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3515 continue;
3516 Value *IVOper = IVSrc;
3517 Type *PostIncTy = PostIncV->getType();
3518 if (IVTy != PostIncTy) {
3519 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3520 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3521 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3522 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3523 }
3524 Phi.replaceUsesOfWith(PostIncV, IVOper);
3525 DeadInsts.emplace_back(PostIncV);
3526 }
3527 }
3528}
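// Illustrative sketch of the net effect (value names are made up): for a chain
// of users at IV, IV+4 and IV+8 whose increments cannot be folded into an
// addressing mode, the rewrite produces
//   %iv.chain1 = add %iv,        4
//   %iv.chain2 = add %iv.chain1, 4
// and each user's operand is replaced with the previous user's value plus the
// increment, rather than with an independent computation from the IV.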
3529
3530void LSRInstance::CollectFixupsAndInitialFormulae() {
3531 BranchInst *ExitBranch = nullptr;
3532 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3533
3534 // For calculating baseline cost
3535 SmallPtrSet<const SCEV *, 16> Regs;
3536 DenseSet<const SCEV *> VisitedRegs;
3537 DenseSet<size_t> VisitedLSRUse;
3538
3539 for (const IVStrideUse &U : IU) {
3540 Instruction *UserInst = U.getUser();
3541 // Skip IV users that are part of profitable IV Chains.
3542 User::op_iterator UseI =
3543 find(UserInst->operands(), U.getOperandValToReplace());
3544 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3545 if (IVIncSet.count(UseI)) {
3546 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3547 continue;
3548 }
3549
3550 LSRUse::KindType Kind = LSRUse::Basic;
3551 MemAccessTy AccessTy;
3552 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3553 Kind = LSRUse::Address;
3554 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3555 }
3556
3557 const SCEV *S = IU.getExpr(U);
3558 if (!S)
3559 continue;
3560 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3561
3562 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3563 // (N - i == 0), and this allows (N - i) to be the expression that we work
3564 // with rather than just N or i, so we can consider the register
3565 // requirements for both N and i at the same time. Limiting this code to
3566 // equality icmps is not a problem because all interesting loops use
3567 // equality icmps, thanks to IndVarSimplify.
3568 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3569      // If CI can be saved on some targets, e.g. replaced by a hardware loop
3570      // on PowerPC, there is no need to generate initial formulae for it.
3571 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3572 continue;
3573 if (CI->isEquality()) {
3574 // Swap the operands if needed to put the OperandValToReplace on the
3575 // left, for consistency.
3576 Value *NV = CI->getOperand(1);
3577 if (NV == U.getOperandValToReplace()) {
3578 CI->setOperand(1, CI->getOperand(0));
3579 CI->setOperand(0, NV);
3580 NV = CI->getOperand(1);
3581 Changed = true;
3582 }
3583
3584 // x == y --> x - y == 0
3585 const SCEV *N = SE.getSCEV(NV);
3586 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3587 (!NV->getType()->isPointerTy() ||
3588 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3589 // S is normalized, so normalize N before folding it into S
3590 // to keep the result normalized.
3591 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3592 if (!N)
3593 continue;
3594 Kind = LSRUse::ICmpZero;
3595 S = SE.getMinusSCEV(N, S);
3596 } else if (L->isLoopInvariant(NV) &&
3597 (!isa<Instruction>(NV) ||
3598 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3599 !NV->getType()->isPointerTy()) {
3600 // If we can't generally expand the expression (e.g. it contains
3601 // a divide), but it is already at a loop invariant point before the
3602 // loop, wrap it in an unknown (to prevent the expander from trying
3603 // to re-expand in a potentially unsafe way.) The restriction to
3604 // integer types is required because the unknown hides the base, and
3605 // SCEV can't compute the difference of two unknown pointers.
3606 N = SE.getUnknown(NV);
3607 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3608 if (!N)
3609 continue;
3610 Kind = LSRUse::ICmpZero;
3611 S = SE.getMinusSCEV(N, S);
3613 }
3614
3615 // -1 and the negations of all interesting strides (except the negation
3616 // of -1) are now also interesting.
3617 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3618 if (Factors[i] != -1)
3619 Factors.insert(-(uint64_t)Factors[i]);
3620 Factors.insert(-1);
3621 }
3622 }
3623
3624 // Get or create an LSRUse.
3625 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3626 size_t LUIdx = P.first;
3627 Immediate Offset = P.second;
3628 LSRUse &LU = Uses[LUIdx];
3629
3630 // Record the fixup.
3631 LSRFixup &LF = LU.getNewFixup();
3632 LF.UserInst = UserInst;
3633 LF.OperandValToReplace = U.getOperandValToReplace();
3634 LF.PostIncLoops = TmpPostIncLoops;
3635 LF.Offset = Offset;
3636 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3637 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3638
3639    // Create a SCEV-based Formula for calculating the baseline cost.
3640 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3641 Formula F;
3642 F.initialMatch(S, L, SE);
3643 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3644 HardwareLoopProfitable);
3645 VisitedLSRUse.insert(LUIdx);
3646 }
3647
3648 if (!LU.WidestFixupType ||
3649 SE.getTypeSizeInBits(LU.WidestFixupType) <
3650 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3651 LU.WidestFixupType = LF.OperandValToReplace->getType();
3652
3653 // If this is the first use of this LSRUse, give it a formula.
3654 if (LU.Formulae.empty()) {
3655 InsertInitialFormula(S, LU, LUIdx);
3656 CountRegisters(LU.Formulae.back(), LUIdx);
3657 }
3658 }
3659
3660 LLVM_DEBUG(print_fixups(dbgs()));
3661}
3662
3663/// Insert a formula for the given expression into the given use, separating out
3664/// loop-variant portions from loop-invariant and loop-computable portions.
3665void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3666 size_t LUIdx) {
3667 // Mark uses whose expressions cannot be expanded.
3668 if (!Rewriter.isSafeToExpand(S))
3669 LU.RigidFormula = true;
3670
3671 Formula F;
3672 F.initialMatch(S, L, SE);
3673 bool Inserted = InsertFormula(LU, LUIdx, F);
3674 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3675}
3676
3677/// Insert a simple single-register formula for the given expression into the
3678/// given use.
3679void
3680LSRInstance::InsertSupplementalFormula(const SCEV *S,
3681 LSRUse &LU, size_t LUIdx) {
3682 Formula F;
3683 F.BaseRegs.push_back(S);
3684 F.HasBaseReg = true;
3685 bool Inserted = InsertFormula(LU, LUIdx, F);
3686 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3687}
3688
3689/// Note which registers are used by the given formula, updating RegUses.
3690void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3691 if (F.ScaledReg)
3692 RegUses.countRegister(F.ScaledReg, LUIdx);
3693 for (const SCEV *BaseReg : F.BaseRegs)
3694 RegUses.countRegister(BaseReg, LUIdx);
3695}
3696
3697/// If the given formula has not yet been inserted, add it to the list, and
3698/// return true. Return false otherwise.
3699bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3700  // Do not insert a formula that we will not be able to expand.
3701 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3702 "Formula is illegal");
3703
3704 if (!LU.InsertFormula(F, *L))
3705 return false;
3706
3707 CountRegisters(F, LUIdx);
3708 return true;
3709}
3710
3711/// Test whether this fixup will be executed each time the corresponding IV
3712/// increment instruction is executed.
3713bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3714 // If the fixup block dominates the IV increment block then there is no path
3715 // through the loop to the increment that doesn't pass through the fixup.
3716 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3717}
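// For example (illustrative): with the IV increment inserted in the latch, a
// fixup in the loop header dominates it and therefore executes on every
// increment, whereas a fixup inside a conditionally-executed block on one side
// of a diamond does not dominate the latch and may be skipped on some
// iterations.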
3718
3719/// Check for other uses of loop-invariant values which we're tracking. These
3720/// other uses will pin these values in registers, making them less profitable
3721/// for elimination.
3722/// TODO: This currently misses non-constant addrec step registers.
3723/// TODO: Should this give more weight to users inside the loop?
3724void
3725LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3726 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3727 SmallPtrSet<const SCEV *, 32> Visited;
3728
3729 // Don't collect outside uses if we are favoring postinc - the instructions in
3730 // the loop are more important than the ones outside of it.
3731 if (AMK == TTI::AMK_PostIndexed)
3732 return;
3733
3734 while (!Worklist.empty()) {
3735 const SCEV *S = Worklist.pop_back_val();
3736
3737 // Don't process the same SCEV twice
3738 if (!Visited.insert(S).second)
3739 continue;
3740
3741 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3742 append_range(Worklist, N->operands());
3743 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3744 Worklist.push_back(C->getOperand());
3745 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3746 Worklist.push_back(D->getLHS());
3747 Worklist.push_back(D->getRHS());
3748 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3749 const Value *V = US->getValue();
3750 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3751 // Look for instructions defined outside the loop.
3752 if (L->contains(Inst)) continue;
3753 } else if (isa<Constant>(V))
3754 // Constants can be re-materialized.
3755 continue;
3756 for (const Use &U : V->uses()) {
3757 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3758 // Ignore non-instructions.
3759 if (!UserInst)
3760 continue;
3761 // Don't bother if the instruction is an EHPad.
3762 if (UserInst->isEHPad())
3763 continue;
3764 // Ignore instructions in other functions (as can happen with
3765 // Constants).
3766 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3767 continue;
3768 // Ignore instructions not dominated by the loop.
3769 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3770 UserInst->getParent() :
3771        cast<PHINode>(UserInst)->getIncomingBlock(
3772            PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3773      if (!DT.dominates(L->getHeader(), UseBB))
3774 continue;
3775 // Don't bother if the instruction is in a BB which ends in an EHPad.
3776 if (UseBB->getTerminator()->isEHPad())
3777 continue;
3778
3779 // Ignore cases in which the currently-examined value could come from
3780 // a basic block terminated with an EHPad. This checks all incoming
3781 // blocks of the phi node since it is possible that the same incoming
3782 // value comes from multiple basic blocks, only some of which may end
3783 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3784 // pass would try to insert instructions into an EHPad, hitting an
3785 // assertion.
3786 if (isa<PHINode>(UserInst)) {
3787 const auto *PhiNode = cast<PHINode>(UserInst);
3788 bool HasIncompatibleEHPTerminatedBlock = false;
3789 llvm::Value *ExpectedValue = U;
3790 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3791 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3792 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3793 HasIncompatibleEHPTerminatedBlock = true;
3794 break;
3795 }
3796 }
3797 }
3798 if (HasIncompatibleEHPTerminatedBlock) {
3799 continue;
3800 }
3801 }
3802
3803 // Don't bother rewriting PHIs in catchswitch blocks.
3804 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3805 continue;
3806 // Ignore uses which are part of other SCEV expressions, to avoid
3807 // analyzing them multiple times.
3808 if (SE.isSCEVable(UserInst->getType())) {
3809 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3810 // If the user is a no-op, look through to its uses.
3811 if (!isa<SCEVUnknown>(UserS))
3812 continue;
3813 if (UserS == US) {
3814 Worklist.push_back(
3815 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3816 continue;
3817 }
3818 }
3819 // Ignore icmp instructions which are already being analyzed.
3820 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3821 unsigned OtherIdx = !U.getOperandNo();
3822 Value *OtherOp = ICI->getOperand(OtherIdx);
3823 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3824 continue;
3825 }
3826
3827 // Do not consider uses inside lifetime intrinsics. These are not
3828 // actually materialized.
3829 if (UserInst->isLifetimeStartOrEnd())
3830 continue;
3831
3832 std::pair<size_t, Immediate> P =
3833 getUse(S, LSRUse::Basic, MemAccessTy());
3834 size_t LUIdx = P.first;
3835 Immediate Offset = P.second;
3836 LSRUse &LU = Uses[LUIdx];
3837 LSRFixup &LF = LU.getNewFixup();
3838 LF.UserInst = const_cast<Instruction *>(UserInst);
3839 LF.OperandValToReplace = U;
3840 LF.Offset = Offset;
3841 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3842 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3843 if (!LU.WidestFixupType ||
3844 SE.getTypeSizeInBits(LU.WidestFixupType) <
3845 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3846 LU.WidestFixupType = LF.OperandValToReplace->getType();
3847 InsertSupplementalFormula(US, LU, LUIdx);
3848 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3849 break;
3850 }
3851 }
3852 }
3853}
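// Illustrative example for the walk above: if a loop-invariant value feeds the
// loop's computation and is also read by an instruction after the loop, that
// outside user pins the value in a register; recording it here as a Basic use
// with a single-register supplemental formula lets the cost model account for
// that register when weighing formulae that would otherwise eliminate it.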
3854
3855/// Split S into subexpressions which can be pulled out into separate
3856/// registers. If C is non-null, multiply each subexpression by C.
3857///
3858/// Return remainder expression after factoring the subexpressions captured by
3859/// Ops. If Ops is complete, return NULL.
3860static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3861                                   SmallVectorImpl<const SCEV *> &Ops,
3862                                   const Loop *L,
3863 ScalarEvolution &SE,
3864 unsigned Depth = 0) {
3865 // Arbitrarily cap recursion to protect compile time.
3866 if (Depth >= 3)
3867 return S;
3868
3869 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3870 // Break out add operands.
3871 for (const SCEV *S : Add->operands()) {
3872 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3873 if (Remainder)
3874 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3875 }
3876 return nullptr;
3877 }
3878 const SCEV *Start, *Step;
3879 const SCEVConstant *Op0;
3880 const SCEV *Op1;
3881 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3882 // Split a non-zero base out of an addrec.
3883 if (Start->isZero())
3884 return S;
3885
3886 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3887 // Split the non-zero AddRec unless it is part of a nested recurrence that
3888 // does not pertain to this loop.
3889 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3890 !isa<SCEVAddRecExpr>(Remainder))) {
3891 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3892 Remainder = nullptr;
3893 }
3894 if (Remainder != Start) {
3895 if (!Remainder)
3896 Remainder = SE.getConstant(S->getType(), 0);
3897 return SE.getAddRecExpr(Remainder, Step,
3898 cast<SCEVAddRecExpr>(S)->getLoop(),
3899                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3900                            SCEV::FlagAnyWrap);
3901  }
3902 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3903 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3904 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3905 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3906 if (Remainder)
3907 Ops.push_back(SE.getMulExpr(C, Remainder));
3908 return nullptr;
3909 }
3910 return S;
3911}
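// A worked example (illustrative): for S = {(%a + 4),+,8}<%L>, the non-zero
// start (%a + 4) is an add, so %a and 4 are pushed onto Ops and the function
// returns the rebased recurrence {0,+,8}<%L> as the remainder. If C had been
// the constant 2, Ops would instead receive (2 * %a) and 8.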
3912
3913/// Return true if the SCEV represents a value that may end up as a
3914/// post-increment operation.
3915static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3916                              LSRUse &LU, const SCEV *S, const Loop *L,
3917 ScalarEvolution &SE) {
3918 if (LU.Kind != LSRUse::Address ||
3919 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3920 return false;
3921 const SCEV *Start;
3922 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3923 return false;
3924 // Check if a post-indexed load/store can be used.
3925 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3926 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3927 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3928 return true;
3929 }
3930 return false;
3931}
3932
3933/// Helper function for LSRInstance::GenerateReassociations.
3934void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3935 const Formula &Base,
3936 unsigned Depth, size_t Idx,
3937 bool IsScaledReg) {
3938 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3939 // Don't generate reassociations for the base register of a value that
3940 // may generate a post-increment operator. The reason is that the
3941  // reassociations cause extra base+register formulae to be created,
3942 // and possibly chosen, but the post-increment is more efficient.
3943 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3944 return;
3945  SmallVector<const SCEV *, 8> AddOps;
3946  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3947 if (Remainder)
3948 AddOps.push_back(Remainder);
3949
3950 if (AddOps.size() == 1)
3951 return;
3952
3953  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3954                                                      JE = AddOps.end();
3955 J != JE; ++J) {
3956 // Loop-variant "unknown" values are uninteresting; we won't be able to
3957 // do anything meaningful with them.
3958 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3959 continue;
3960
3961 // Don't pull a constant into a register if the constant could be folded
3962 // into an immediate field.
3963 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3964 LU.AccessTy, *J, Base.getNumRegs() > 1))
3965 continue;
3966
3967 // Collect all operands except *J.
3968 SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3969 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3970
3971 // Don't leave just a constant behind in a register if the constant could
3972 // be folded into an immediate field.
3973 if (InnerAddOps.size() == 1 &&
3974 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3975 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3976 continue;
3977
3978 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3979 if (InnerSum->isZero())
3980 continue;
3981 Formula F = Base;
3982
3983 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3984 continue;
3985
3986 // Add the remaining pieces of the add back into the new formula.
3987 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3988 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3989 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3990 InnerSumSC->getValue()->getZExtValue())) {
3991 F.UnfoldedOffset =
3992 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3993 InnerSumSC->getValue()->getZExtValue());
3994 if (IsScaledReg) {
3995 F.ScaledReg = nullptr;
3996 F.Scale = 0;
3997 } else
3998 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3999 } else if (IsScaledReg)
4000 F.ScaledReg = InnerSum;
4001 else
4002 F.BaseRegs[Idx] = InnerSum;
4003
4004 // Add J as its own register, or an unfolded immediate.
4005 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
4006 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
4007 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4008 SC->getValue()->getZExtValue()))
4009 F.UnfoldedOffset =
4010 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4011 SC->getValue()->getZExtValue());
4012 else
4013 F.BaseRegs.push_back(*J);
4014    // We may have changed the number of registers in base regs; adjust the
4015    // formula accordingly.
4016 F.canonicalize(*L);
4017
4018 if (InsertFormula(LU, LUIdx, F))
4019 // If that formula hadn't been seen before, recurse to find more like
4020 // it.
4021      // Add Log16(AddOps.size()) (i.e. Log2_32(AddOps.size()) >> 2) to the
4022      // depth, because Depth alone is not enough to bound compile time.
4023      // This means that every time AddOps.size() exceeds 16^x we add x to
4024      // Depth.
4025 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4026 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4027 }
4028}
4029
4030/// Split out subexpressions from adds and the bases of addrecs.
4031void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4032 Formula Base, unsigned Depth) {
4033 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4034 // Arbitrarily cap recursion to protect compile time.
4035 if (Depth >= 3)
4036 return;
4037
4038 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4039 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4040
4041 if (Base.Scale == 1)
4042 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4043 /* Idx */ -1, /* IsScaledReg */ true);
4044}
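// Illustrative example: given a formula whose only base register is
// ((%a + %b) + {0,+,4}<%L>) with %b loop-invariant, the reassociation above
// can produce a variant with base registers { (%a + {0,+,4}<%L>), %b }, so
// %b is exposed as its own register that other uses might share.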
4045
4046/// Generate a formula consisting of all of the loop-dominating registers added
4047/// into a single register.
4048void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4049 Formula Base) {
4050 // This method is only interesting on a plurality of registers.
4051 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4052 (Base.UnfoldedOffset.isNonZero()) <=
4053 1)
4054 return;
4055
4056 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4057 // processing the formula.
4058 Base.unscale();
4059  SmallVector<const SCEV *, 4> Ops;
4060  Formula NewBase = Base;
4061 NewBase.BaseRegs.clear();
4062 Type *CombinedIntegerType = nullptr;
4063 for (const SCEV *BaseReg : Base.BaseRegs) {
4064 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4065 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4066 if (!CombinedIntegerType)
4067 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4068 Ops.push_back(BaseReg);
4069 }
4070 else
4071 NewBase.BaseRegs.push_back(BaseReg);
4072 }
4073
4074 // If no register is relevant, we're done.
4075 if (Ops.size() == 0)
4076 return;
4077
4078 // Utility function for generating the required variants of the combined
4079 // registers.
4080 auto GenerateFormula = [&](const SCEV *Sum) {
4081 Formula F = NewBase;
4082
4083 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4084 // opportunity to fold something. For now, just ignore such cases
4085 // rather than proceed with zero in a register.
4086 if (Sum->isZero())
4087 return;
4088
4089 F.BaseRegs.push_back(Sum);
4090 F.canonicalize(*L);
4091 (void)InsertFormula(LU, LUIdx, F);
4092 };
4093
4094 // If we collected at least two registers, generate a formula combining them.
4095 if (Ops.size() > 1) {
4096 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4097 GenerateFormula(SE.getAddExpr(OpsCopy));
4098 }
4099
4100 // If we have an unfolded offset, generate a formula combining it with the
4101 // registers collected.
4102 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4103 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4104 Ops.push_back(SE.getConstant(CombinedIntegerType,
4105 NewBase.UnfoldedOffset.getFixedValue(), true));
4106 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4107 GenerateFormula(SE.getAddExpr(Ops));
4108 }
4109}
4110
4111/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4112void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4113 const Formula &Base, size_t Idx,
4114 bool IsScaledReg) {
4115 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4116 GlobalValue *GV = ExtractSymbol(G, SE);
4117 if (G->isZero() || !GV)
4118 return;
4119 Formula F = Base;
4120 F.BaseGV = GV;
4121 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4122 return;
4123 if (IsScaledReg)
4124 F.ScaledReg = G;
4125 else
4126 F.BaseRegs[Idx] = G;
4127 (void)InsertFormula(LU, LUIdx, F);
4128}
4129
4130/// Generate reuse formulae using symbolic offsets.
4131void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4132 Formula Base) {
4133 // We can't add a symbolic offset if the address already contains one.
4134 if (Base.BaseGV) return;
4135
4136 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4137 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4138 if (Base.Scale == 1)
4139 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4140 /* IsScaledReg */ true);
4141}
4142
4143/// Helper function for LSRInstance::GenerateConstantOffsets.
4144void LSRInstance::GenerateConstantOffsetsImpl(
4145 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4146 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4147
4148 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4149 Formula F = Base;
4150 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4151 return;
4152 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4153
4154 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4155 // Add the offset to the base register.
4156 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4157 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4158 // If it cancelled out, drop the base register, otherwise update it.
4159 if (NewG->isZero()) {
4160 if (IsScaledReg) {
4161 F.Scale = 0;
4162 F.ScaledReg = nullptr;
4163 } else
4164 F.deleteBaseReg(F.BaseRegs[Idx]);
4165 F.canonicalize(*L);
4166 } else if (IsScaledReg)
4167 F.ScaledReg = NewG;
4168 else
4169 F.BaseRegs[Idx] = NewG;
4170
4171 (void)InsertFormula(LU, LUIdx, F);
4172 }
4173 };
4174
4175 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4176
4177 // With constant offsets and constant steps, we can generate pre-inc
4178 // accesses by having the offset equal the step. So, for access #0 with a
4179 // step of 8, we generate a G - 8 base which would require the first access
4180 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4181  // for itself and hopefully becomes the base for other accesses. This means
4182  // that a single pre-indexed access can be generated to become the new base
4183 // base pointer for each iteration of the loop, resulting in no extra add/sub
4184 // instructions for pointer updating.
4185 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4186 const APInt *StepInt;
4187 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4188 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4189 : StepInt->getZExtValue();
4190
4191 for (Immediate Offset : Worklist) {
4192 if (Offset.isFixed()) {
4193 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4194 GenerateOffset(G, Offset);
4195 }
4196 }
4197 }
4198 }
4199 for (Immediate Offset : Worklist)
4200 GenerateOffset(G, Offset);
4201
4202 Immediate Imm = ExtractImmediate(G, SE);
4203 if (G->isZero() || Imm.isZero() ||
4204 !Base.BaseOffset.isCompatibleImmediate(Imm))
4205 return;
4206 Formula F = Base;
4207 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4208 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4209 return;
4210 if (IsScaledReg) {
4211 F.ScaledReg = G;
4212 } else {
4213 F.BaseRegs[Idx] = G;
4214    // We may generate a non-canonical Formula if G is a recurrent expression
4215    // register related to the current loop while F.ScaledReg is not.
4216 F.canonicalize(*L);
4217 }
4218 (void)InsertFormula(LU, LUIdx, F);
4219}
4220
4221/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4222void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4223 Formula Base) {
4224 // TODO: For now, just add the min and max offset, because it usually isn't
4225  // worthwhile looking at everything in between.
4226  SmallVector<Immediate, 2> Worklist;
4227  Worklist.push_back(LU.MinOffset);
4228 if (LU.MaxOffset != LU.MinOffset)
4229 Worklist.push_back(LU.MaxOffset);
4230
4231 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4232 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4233 if (Base.Scale == 1)
4234 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4235 /* IsScaledReg */ true);
4236}
4237
4238/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4239/// == y -> x*c == y*c.
4240void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4241 Formula Base) {
4242 if (LU.Kind != LSRUse::ICmpZero) return;
4243
4244 // Determine the integer type for the base formula.
4245 Type *IntTy = Base.getType();
4246 if (!IntTy) return;
4247 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4248
4249 // Don't do this if there is more than one offset.
4250 if (LU.MinOffset != LU.MaxOffset) return;
4251
4252  // Check if the transformation is valid. It is illegal to multiply a pointer.
4253 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4254 return;
4255 for (const SCEV *BaseReg : Base.BaseRegs)
4256 if (BaseReg->getType()->isPointerTy())
4257 return;
4258 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4259
4260 // Check each interesting stride.
4261 for (int64_t Factor : Factors) {
4262 // Check that Factor can be represented by IntTy
4263 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4264 continue;
4265 // Check that the multiplication doesn't overflow.
4266 if (Base.BaseOffset.isMin() && Factor == -1)
4267 continue;
4268 // Not supporting scalable immediates.
4269 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4270 continue;
4271 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4272 assert(Factor != 0 && "Zero factor not expected!");
4273 if (NewBaseOffset.getFixedValue() / Factor !=
4274 Base.BaseOffset.getFixedValue())
4275 continue;
4276 // If the offset will be truncated at this use, check that it is in bounds.
4277 if (!IntTy->isPointerTy() &&
4278 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4279 continue;
4280
4281 // Check that multiplying with the use offset doesn't overflow.
4282 Immediate Offset = LU.MinOffset;
4283 if (Offset.isMin() && Factor == -1)
4284 continue;
4285 Offset = Offset.mulUnsigned(Factor);
4286 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4287 continue;
4288 // If the offset will be truncated at this use, check that it is in bounds.
4289 if (!IntTy->isPointerTy() &&
4290 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4291 continue;
4292
4293 Formula F = Base;
4294 F.BaseOffset = NewBaseOffset;
4295
4296 // Check that this scale is legal.
4297 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4298 continue;
4299
4300 // Compensate for the use having MinOffset built into it.
4301 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4302
4303 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4304
4305 // Check that multiplying with each base register doesn't overflow.
4306 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4307 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4308 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4309 goto next;
4310 }
4311
4312 // Check that multiplying with the scaled register doesn't overflow.
4313 if (F.ScaledReg) {
4314 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4315 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4316 continue;
4317 }
4318
4319 // Check that multiplying with the unfolded offset doesn't overflow.
4320 if (F.UnfoldedOffset.isNonZero()) {
4321 if (F.UnfoldedOffset.isMin() && Factor == -1)
4322 continue;
4323 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4324 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4325 Base.UnfoldedOffset.getFixedValue())
4326 continue;
4327 // If the offset will be truncated, check that it is in bounds.
4328      if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
4329              IntTy, F.UnfoldedOffset.getFixedValue()))
4330 continue;
4331 }
4332
4333 // If we make it here and it's legal, add it.
4334 (void)InsertFormula(LU, LUIdx, F);
4335 next:;
4336 }
4337}
4338
4339/// Generate stride factor reuse formulae by making use of scaled-offset address
4340/// modes, for example.
4341void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4342 // Determine the integer type for the base formula.
4343 Type *IntTy = Base.getType();
4344 if (!IntTy) return;
4345
4346 // If this Formula already has a scaled register, we can't add another one.
4347 // Try to unscale the formula to generate a better scale.
4348 if (Base.Scale != 0 && !Base.unscale())
4349 return;
4350
4351  assert(Base.Scale == 0 && "unscale did not do its job!");
4352
4353 // Check each interesting stride.
4354 for (int64_t Factor : Factors) {
4355 Base.Scale = Factor;
4356 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4357 // Check whether this scale is going to be legal.
4358 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4359 Base)) {
4360      // As a special case, handle out-of-loop Basic users specially.
4361 // TODO: Reconsider this special case.
4362 if (LU.Kind == LSRUse::Basic &&
4363 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4364 LU.AccessTy, Base) &&
4365 LU.AllFixupsOutsideLoop)
4366 LU.Kind = LSRUse::Special;
4367 else
4368 continue;
4369 }
4370 // For an ICmpZero, negating a solitary base register won't lead to
4371 // new solutions.
4372 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4373 Base.BaseOffset.isZero() && !Base.BaseGV)
4374 continue;
4375 // For each addrec base reg, if its loop is current loop, apply the scale.
4376 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4377 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4378 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4379 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4380 if (FactorS->isZero())
4381 continue;
4382 // Divide out the factor, ignoring high bits, since we'll be
4383 // scaling the value back up in the end.
4384 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4385 if (!Quotient->isZero()) {
4386 // TODO: This could be optimized to avoid all the copying.
4387 Formula F = Base;
4388 F.ScaledReg = Quotient;
4389 F.deleteBaseReg(F.BaseRegs[i]);
4390 // The canonical representation of 1*reg is reg, which is already in
4391 // Base. In that case, do not try to insert the formula, it will be
4392 // rejected anyway.
4393 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4394 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4395 continue;
4396            // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
4397            // non-canonical Formula whose ScaledReg's loop is not L.
4398 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4399 F.canonicalize(*L);
4400 (void)InsertFormula(LU, LUIdx, F);
4401 }
4402 }
4403 }
4404 }
4405}
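// Illustrative example: with an interesting factor of 4 and a base register
// {0,+,4}<%L>, the exact division above yields {0,+,1}<%L>, which becomes the
// ScaledReg of a formula with Scale = 4 (a good fit for targets with
// reg + 4*reg addressing modes, subject to the isLegalUse check above).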
4406
4407/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4408/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4409/// perform the extension/truncate and normalize again, as the normalized form
4410/// can result in folds that are not valid in the post-inc use contexts. The
4411/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4412static const SCEV *
4414 const SCEV *Expr, Type *ToTy,
4415 ScalarEvolution &SE) {
4416 const SCEV *Result = nullptr;
4417 for (auto &L : Loops) {
4418 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4419 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4420 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4421 if (!New || (Result && New != Result))
4422 return nullptr;
4423 Result = New;
4424 }
4425
4426 assert(Result && "failed to create expression");
4427 return Result;
4428}
4429
4430/// Generate reuse formulae from different IV types.
4431void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4432 // Don't bother truncating symbolic values.
4433 if (Base.BaseGV) return;
4434
4435 // Determine the integer type for the base formula.
4436 Type *DstTy = Base.getType();
4437 if (!DstTy) return;
4438 if (DstTy->isPointerTy())
4439 return;
4440
4441 // It is invalid to extend a pointer type so exit early if ScaledReg or
4442 // any of the BaseRegs are pointers.
4443 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4444 return;
4445 if (any_of(Base.BaseRegs,
4446 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4447 return;
4448
4449  SmallVector<PostIncLoopSet> Loops;
4450  for (auto &LF : LU.Fixups)
4451 Loops.push_back(LF.PostIncLoops);
4452
4453 for (Type *SrcTy : Types) {
4454 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4455 Formula F = Base;
4456
4457      // Sometimes SCEV is able to prove zero during the ext transform. This
4458      // may happen if SCEV did not do all possible transforms while creating
4459      // the initial node (maybe due to depth limitations), but it can do them
4460      // while taking the ext.
4461 if (F.ScaledReg) {
4462 const SCEV *NewScaledReg =
4463 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4464 if (!NewScaledReg || NewScaledReg->isZero())
4465 continue;
4466 F.ScaledReg = NewScaledReg;
4467 }
4468 bool HasZeroBaseReg = false;
4469 for (const SCEV *&BaseReg : F.BaseRegs) {
4470 const SCEV *NewBaseReg =
4471 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4472 if (!NewBaseReg || NewBaseReg->isZero()) {
4473 HasZeroBaseReg = true;
4474 break;
4475 }
4476 BaseReg = NewBaseReg;
4477 }
4478 if (HasZeroBaseReg)
4479 continue;
4480
4481 // TODO: This assumes we've done basic processing on all uses and
4482 // have an idea what the register usage is.
4483 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4484 continue;
4485
4486 F.canonicalize(*L);
4487 (void)InsertFormula(LU, LUIdx, F);
4488 }
4489 }
4490}
4491
4492namespace {
4493
4494/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4495/// modifications so that the search phase doesn't have to worry about the data
4496/// structures moving underneath it.
4497struct WorkItem {
4498 size_t LUIdx;
4499 Immediate Imm;
4500 const SCEV *OrigReg;
4501
4502 WorkItem(size_t LI, Immediate I, const SCEV *R)
4503 : LUIdx(LI), Imm(I), OrigReg(R) {}
4504
4505 void print(raw_ostream &OS) const;
4506 void dump() const;
4507};
4508
4509} // end anonymous namespace
4510
4511#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4512void WorkItem::print(raw_ostream &OS) const {
4513 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4514 << " , add offset " << Imm;
4515}
4516
4517LLVM_DUMP_METHOD void WorkItem::dump() const {
4518 print(errs()); errs() << '\n';
4519}
4520#endif
4521
4522/// Look for registers which are a constant distance apart and try to form reuse
4523/// opportunities between them.
4524void LSRInstance::GenerateCrossUseConstantOffsets() {
4525 // Group the registers by their value without any added constant offset.
4526 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4527
4528 DenseMap<const SCEV *, ImmMapTy> Map;
4529 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4530  SmallVector<const SCEV *, 8> Sequence;
4531  for (const SCEV *Use : RegUses) {
4532 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4533 Immediate Imm = ExtractImmediate(Reg, SE);
4534 auto Pair = Map.try_emplace(Reg);
4535 if (Pair.second)
4536 Sequence.push_back(Reg);
4537 Pair.first->second.insert(std::make_pair(Imm, Use));
4538 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4539 }
4540
4541 // Now examine each set of registers with the same base value. Build up
4542 // a list of work to do and do the work in a separate step so that we're
4543 // not adding formulae and register counts while we're searching.
4544 SmallVector<WorkItem, 32> WorkItems;
4545 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4546 UniqueItems;
4547 for (const SCEV *Reg : Sequence) {
4548 const ImmMapTy &Imms = Map.find(Reg)->second;
4549
4550 // It's not worthwhile looking for reuse if there's only one offset.
4551 if (Imms.size() == 1)
4552 continue;
4553
4554 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4555 for (const auto &Entry
4556 : Imms) dbgs()
4557 << ' ' << Entry.first;
4558 dbgs() << '\n');
4559
4560 // Examine each offset.
4561 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4562 J != JE; ++J) {
4563 const SCEV *OrigReg = J->second;
4564
4565 Immediate JImm = J->first;
4566 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4567
4568 if (!isa<SCEVConstant>(OrigReg) &&
4569 UsedByIndicesMap[Reg].count() == 1) {
4570 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4571 << '\n');
4572 continue;
4573 }
4574
4575      // Conservatively examine offsets between this orig reg and a few selected
4576      // other orig regs.
4577 Immediate First = Imms.begin()->first;
4578 Immediate Last = std::prev(Imms.end())->first;
4579 if (!First.isCompatibleImmediate(Last)) {
4580 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4581 << "\n");
4582 continue;
4583 }
4584 // Only scalable if both terms are scalable, or if one is scalable and
4585 // the other is 0.
4586 bool Scalable = First.isScalable() || Last.isScalable();
4587 int64_t FI = First.getKnownMinValue();
4588 int64_t LI = Last.getKnownMinValue();
4589 // Compute (First + Last) / 2 without overflow using the fact that
4590      // First + Last = 2 * (First & Last) + (First ^ Last).
4591 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4592 // If the result is negative and FI is odd and LI even (or vice versa),
4593 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4594 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
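      // For example (illustrative): FI = -5, LI = -8 gives (FI & LI) = -8 and
      // (FI ^ LI) = 3, so Avg = -8 + 1 = -7; since Avg is negative and the
      // parities differ, the line above adds 1, giving -6 == trunc(-13 / 2).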
4595 ImmMapTy::const_iterator OtherImms[] = {
4596 Imms.begin(), std::prev(Imms.end()),
4597 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4598 for (const auto &M : OtherImms) {
4599 if (M == J || M == JE) continue;
4600 if (!JImm.isCompatibleImmediate(M->first))
4601 continue;
4602
4603 // Compute the difference between the two.
4604 Immediate Imm = JImm.subUnsigned(M->first);
4605 for (unsigned LUIdx : UsedByIndices.set_bits())
4606 // Make a memo of this use, offset, and register tuple.
4607 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4608 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4609 }
4610 }
4611 }
4612
4613 Map.clear();
4614 Sequence.clear();
4615 UsedByIndicesMap.clear();
4616 UniqueItems.clear();
4617
4618 // Now iterate through the worklist and add new formulae.
4619 for (const WorkItem &WI : WorkItems) {
4620 size_t LUIdx = WI.LUIdx;
4621 LSRUse &LU = Uses[LUIdx];
4622 Immediate Imm = WI.Imm;
4623 const SCEV *OrigReg = WI.OrigReg;
4624
4625 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4626 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4627 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4628
4629 // TODO: Use a more targeted data structure.
4630 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4631 Formula F = LU.Formulae[L];
4632 // FIXME: The code for the scaled and unscaled registers looks
4633 // very similar but slightly different. Investigate if they
4634 // could be merged. That way, we would not have to unscale the
4635 // Formula.
4636 F.unscale();
4637 // Use the immediate in the scaled register.
4638 if (F.ScaledReg == OrigReg) {
4639 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4640 continue;
4641 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4642 // Don't create 50 + reg(-50).
4643 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4644 if (F.referencesReg(S))
4645 continue;
4646 Formula NewF = F;
4647 NewF.BaseOffset = Offset;
4648 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4649 NewF))
4650 continue;
4651 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4652
4653 // If the new scale is a constant in a register, and adding the constant
4654 // value to the immediate would produce a value closer to zero than the
4655 // immediate itself, then the formula isn't worthwhile.
4656 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4657 // FIXME: Do we need to do something for scalable immediates here?
4658 // A scalable SCEV won't be constant, but we might still have
4659 // something in the offset? Bail out for now to be safe.
4660 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4661 continue;
4662 if (C->getValue()->isNegative() !=
4663 (NewF.BaseOffset.isLessThanZero()) &&
4664 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4665 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4666 continue;
4667 }
4668
4669 // OK, looks good.
4670 NewF.canonicalize(*this->L);
4671 (void)InsertFormula(LU, LUIdx, NewF);
4672 } else {
4673 // Use the immediate in a base register.
4674 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4675 const SCEV *BaseReg = F.BaseRegs[N];
4676 if (BaseReg != OrigReg)
4677 continue;
4678 Formula NewF = F;
4679 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4680 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4681 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4682 continue;
4683 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4684 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4685 LU.Kind, LU.AccessTy, NewF)) {
4686 if (AMK == TTI::AMK_PostIndexed &&
4687 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4688 continue;
4689 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4690 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4691 continue;
4692 NewF = F;
4693 NewF.UnfoldedOffset = NewUnfoldedOffset;
4694 }
4695 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4696
4697 // If the new formula has a constant in a register, and adding the
4698 // constant value to the immediate would produce a value closer to
4699 // zero than the immediate itself, then the formula isn't worthwhile.
4700 for (const SCEV *NewReg : NewF.BaseRegs)
4701 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4702 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4703 goto skip_formula;
4704 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4705 .abs()
4706 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4707 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4708 .countr_zero() >=
4709 (unsigned)llvm::countr_zero<uint64_t>(
4710 NewF.BaseOffset.getFixedValue()))
4711 goto skip_formula;
4712 }
4713
4714 // Ok, looks good.
4715 NewF.canonicalize(*this->L);
4716 (void)InsertFormula(LU, LUIdx, NewF);
4717 break;
4718 skip_formula:;
4719 }
4720 }
4721 }
4722 }
4723}
4724
4725/// Generate formulae for each use.
4726void
4727LSRInstance::GenerateAllReuseFormulae() {
4728 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4729 // queries are more precise.
4730 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4731 LSRUse &LU = Uses[LUIdx];
4732 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4733 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4734 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4735 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4736 }
4737 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4738 LSRUse &LU = Uses[LUIdx];
4739 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4740 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4741 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4742 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4743 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4744 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4745 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4746 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4747 }
4748 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4749 LSRUse &LU = Uses[LUIdx];
4750 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4751 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4752 }
4753
4754 GenerateCrossUseConstantOffsets();
4755
4756 LLVM_DEBUG(dbgs() << "\n"
4757 "After generating reuse formulae:\n";
4758 print_uses(dbgs()));
4759}
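// As a rough illustration of what the generators above produce (schematic,
// not verbatim output): starting from a single address use with the initial
// formula
//
//   reg({A,+,4}<%loop>)
//
// the reuse generators may add variants such as
//
//   reg(A) + reg({0,+,4}<%loop>)     ; reassociated base and recurrence
//   reg(A) + 4*reg({0,+,1}<%loop>)   ; scaled-register form
//
// and GenerateConstantOffsets/GenerateCrossUseConstantOffsets add formulae
// whose base registers are shifted by constants so that several uses can
// share one register and keep their differences as immediates.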
4760
4761/// If there are multiple formulae with the same set of registers used
4762/// by other uses, pick the best one and delete the others.
4763void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4764 DenseSet<const SCEV *> VisitedRegs;
4765 SmallPtrSet<const SCEV *, 16> Regs;
4766 SmallPtrSet<const SCEV *, 16> LoserRegs;
4767#ifndef NDEBUG
4768 bool ChangedFormulae = false;
4769#endif
4770
4771 // Collect the best formula for each unique set of shared registers. This
4772 // is reset for each use.
4773 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4774
4775 BestFormulaeTy BestFormulae;
4776
4777 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4778 LSRUse &LU = Uses[LUIdx];
4779 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4780 dbgs() << '\n');
4781
4782 bool Any = false;
4783 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4784 FIdx != NumForms; ++FIdx) {
4785 Formula &F = LU.Formulae[FIdx];
4786
4787 // Some formulas are instant losers. For example, they may depend on
4788 // nonexistent AddRecs from other loops. These need to be filtered
4789 // immediately, otherwise heuristics could choose them over others leading
4790 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4791 // avoids the need to recompute this information across formulae using the
4792 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4793 // the corresponding bad register from the Regs set.
4794 Cost CostF(L, SE, TTI, AMK);
4795 Regs.clear();
4796 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4797 &LoserRegs);
4798 if (CostF.isLoser()) {
4799 // During initial formula generation, undesirable formulae are generated
4800 // by uses within other loops that have some non-trivial address mode or
4801 // use the postinc form of the IV. LSR needs to provide these formulae
4802 // as the basis of rediscovering the desired formula that uses an AddRec
4803 // corresponding to the existing phi. Once all formulae have been
4804 // generated, these initial losers may be pruned.
4805 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4806 dbgs() << "\n");
4807 }
4808 else {
4809 SmallVector<const SCEV *, 4> Key;
4810 for (const SCEV *Reg : F.BaseRegs) {
4811 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4812 Key.push_back(Reg);
4813 }
4814 if (F.ScaledReg &&
4815 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4816 Key.push_back(F.ScaledReg);
4817 // Unstable sort by host order ok, because this is only used for
4818 // uniquifying.
4819 llvm::sort(Key);
4820
4821 std::pair<BestFormulaeTy::const_iterator, bool> P =
4822 BestFormulae.insert(std::make_pair(Key, FIdx));
4823 if (P.second)
4824 continue;
4825
4826 Formula &Best = LU.Formulae[P.first->second];
4827
4828 Cost CostBest(L, SE, TTI, AMK);
4829 Regs.clear();
4830 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4831 HardwareLoopProfitable);
4832 if (CostF.isLess(CostBest))
4833 std::swap(F, Best);
4834 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4835 dbgs() << "\n"
4836 " in favor of formula ";
4837 Best.print(dbgs()); dbgs() << '\n');
4838 }
4839#ifndef NDEBUG
4840 ChangedFormulae = true;
4841#endif
4842 LU.DeleteFormula(F);
4843 --FIdx;
4844 --NumForms;
4845 Any = true;
4846 }
4847
4848 // Now that we've filtered out some formulae, recompute the Regs set.
4849 if (Any)
4850 LU.RecomputeRegs(LUIdx, RegUses);
4851
4852 // Reset this to prepare for the next use.
4853 BestFormulae.clear();
4854 }
4855
4856 LLVM_DEBUG(if (ChangedFormulae) {
4857 dbgs() << "\n"
4858 "After filtering out undesirable candidates:\n";
4859 print_uses(dbgs());
4860 });
4861}
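// For example (illustrative only): if two formulae of one use are
//
//   reg(A) + reg({0,+,1}<%loop>)
//   reg(A) + reg({0,+,1}<%loop>) + 4
//
// and both registers are also referenced by other uses, the formulae map to
// the same key {A, {0,+,1}}, so only the cheaper of the two is kept; choosing
// one over the other cannot change which registers the rest of the solution
// needs.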
4862
4863/// Estimate the worst-case number of solutions the solver might have to
4864 /// consider. It almost never considers this many solutions because it prunes the
4865/// search space, but the pruning isn't always sufficient.
4866size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4867 size_t Power = 1;
4868 for (const LSRUse &LU : Uses) {
4869 size_t FSize = LU.Formulae.size();
4870 if (FSize >= ComplexityLimit) {
4871 Power = ComplexityLimit;
4872 break;
4873 }
4874 Power *= FSize;
4875 if (Power >= ComplexityLimit)
4876 break;
4877 }
4878 return Power;
4879}
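// For example (illustrative numbers only): with three uses holding 4, 6 and 10
// formulae, the estimate is 4 * 6 * 10 = 240 candidate assignments; the running
// product saturates once it reaches ComplexityLimit. A minimal sketch of the
// same saturating product (not used by the pass):
//
//   size_t estimate(ArrayRef<size_t> Counts, size_t Limit) {
//     size_t Power = 1;
//     for (size_t C : Counts) {
//       if (C >= Limit) return Limit;
//       Power *= C;
//       if (Power >= Limit) break;
//     }
//     return Power;
//   }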
4880
4881/// When one formula uses a superset of the registers of another formula, it
4882/// won't help reduce register pressure (though it may not necessarily hurt
4883/// register pressure); remove it to simplify the system.
4884void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4885 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4886 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4887
4888 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4889 "which use a superset of registers used by other "
4890 "formulae.\n");
4891
4892 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4893 LSRUse &LU = Uses[LUIdx];
4894 bool Any = false;
4895 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4896 Formula &F = LU.Formulae[i];
4897 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4898 continue;
4899 // Look for a formula with a constant or GV in a register. If the use
4900 // also has a formula with that same value in an immediate field,
4901 // delete the one that uses a register.
4902 for (SmallVectorImpl<const SCEV *>::const_iterator
4903 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4904 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4905 Formula NewF = F;
4906 //FIXME: Formulas should store bitwidth to do wrapping properly.
4907 // See PR41034.
4908 NewF.BaseOffset =
4909 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4910 (uint64_t)C->getValue()->getSExtValue());
4911 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4912 (I - F.BaseRegs.begin()));
4913 if (LU.HasFormulaWithSameRegs(NewF)) {
4914 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4915 dbgs() << '\n');
4916 LU.DeleteFormula(F);
4917 --i;
4918 --e;
4919 Any = true;
4920 break;
4921 }
4922 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4923 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4924 if (!F.BaseGV) {
4925 Formula NewF = F;
4926 NewF.BaseGV = GV;
4927 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4928 (I - F.BaseRegs.begin()));
4929 if (LU.HasFormulaWithSameRegs(NewF)) {
4930 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4931 dbgs() << '\n');
4932 LU.DeleteFormula(F);
4933 --i;
4934 --e;
4935 Any = true;
4936 break;
4937 }
4938 }
4939 }
4940 }
4941 }
4942 if (Any)
4943 LU.RecomputeRegs(LUIdx, RegUses);
4944 }
4945
4946 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4947 }
4948}
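// For example (illustrative only): if a use has both
//
//   reg(A) + reg(8)    ; the constant 8 occupies its own register
//   reg(A) + 8         ; the same constant folded into the immediate
//
// the first formula uses a superset of the registers of the second without
// helping register pressure, so the loop above deletes it. The same applies
// when the extra base register is a global value that can be folded into
// BaseGV instead.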
4949
4950/// When there are many registers for expressions like A, A+1, A+2, etc.,
4951/// allocate a single register for them.
4952void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4953 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4954 return;
4955
4956 LLVM_DEBUG(
4957 dbgs() << "The search space is too complex.\n"
4958 "Narrowing the search space by assuming that uses separated "
4959 "by a constant offset will use the same registers.\n");
4960
4961 // This is especially useful for unrolled loops.
4962
4963 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4964 LSRUse &LU = Uses[LUIdx];
4965 for (const Formula &F : LU.Formulae) {
4966 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4967 continue;
4968
4969 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4970 if (!LUThatHas)
4971 continue;
4972
4973 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4974 LU.Kind, LU.AccessTy))
4975 continue;
4976
4977 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4978
4979 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4980 LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
4981
4982 // Transfer the fixups of LU to LUThatHas.
4983 for (LSRFixup &Fixup : LU.Fixups) {
4984 Fixup.Offset += F.BaseOffset;
4985 LUThatHas->pushFixup(Fixup);
4986 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4987 }
4988
4989 // Delete formulae from the new use which are no longer legal.
4990 bool Any = false;
4991 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4992 Formula &F = LUThatHas->Formulae[i];
4993 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4994 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4995 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4996 LUThatHas->DeleteFormula(F);
4997 --i;
4998 --e;
4999 Any = true;
5000 }
5001 }
5002
5003 if (Any)
5004 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
5005
5006 // Delete the old use.
5007 DeleteUse(LU, LUIdx);
5008 --LUIdx;
5009 --NumUses;
5010 break;
5011 }
5012 }
5013
5014 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5015}
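// For example (illustrative only): after unrolling, two address uses may look
// like
//
//   Use1: reg({A,+,8}<%loop>)
//   Use2: reg({A,+,8}<%loop>) + 8
//
// Rather than keeping both uses, Use2's fixups are transferred to Use1 with
// their offsets bumped by 8, so a single register {A,+,8} serves both
// addresses and the +8 is left to the addressing mode.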
5016
5017/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5018/// we've done more filtering, as it may be able to find more formulae to
5019/// eliminate.
5020void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5021 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5022 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5023
5024 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5025 "undesirable dedicated registers.\n");
5026
5027 FilterOutUndesirableDedicatedRegisters();
5028
5029 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5030 }
5031}
5032
5033 /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5034 /// pick the best one and delete the others.
5035 /// The goal of this narrowing heuristic is to keep as many formulae with
5036 /// different Scale and ScaledReg pairs as possible while narrowing the search
5037 /// space. The benefit is that a better solution is more likely to be found in
5038 /// a formulae set with more Scale and ScaledReg variations than in one where
5039 /// they are all the same. The winner-reg-picking heuristic tends to keep the
5040 /// formulae with the same Scale and ScaledReg and filter out the others, and
5041 /// we want to avoid that if possible.
5042void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5043 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5044 return;
5045
5046 LLVM_DEBUG(
5047 dbgs() << "The search space is too complex.\n"
5048 "Narrowing the search space by choosing the best Formula "
5049 "from the Formulae with the same Scale and ScaledReg.\n");
5050
5051 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5052 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5053
5054 BestFormulaeTy BestFormulae;
5055#ifndef NDEBUG
5056 bool ChangedFormulae = false;
5057#endif
5058 DenseSet<const SCEV *> VisitedRegs;
5059 SmallPtrSet<const SCEV *, 16> Regs;
5060
5061 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5062 LSRUse &LU = Uses[LUIdx];
5063 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5064 dbgs() << '\n');
5065
5066 // Return true if Formula FA is better than Formula FB.
5067 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5068 // First we will try to choose the Formula with fewer new registers.
5069 // For a register used by current Formula, the more the register is
5070 // shared among LSRUses, the less we increase the register number
5071 // counter of the formula.
5072 size_t FARegNum = 0;
5073 for (const SCEV *Reg : FA.BaseRegs) {
5074 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5075 FARegNum += (NumUses - UsedByIndices.count() + 1);
5076 }
5077 size_t FBRegNum = 0;
5078 for (const SCEV *Reg : FB.BaseRegs) {
5079 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5080 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5081 }
5082 if (FARegNum != FBRegNum)
5083 return FARegNum < FBRegNum;
5084
5085 // If the new register numbers are the same, choose the Formula with
5086 // less Cost.
5087 Cost CostFA(L, SE, TTI, AMK);
5088 Cost CostFB(L, SE, TTI, AMK);
5089 Regs.clear();
5090 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5091 Regs.clear();
5092 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5093 return CostFA.isLess(CostFB);
5094 };
5095
5096 bool Any = false;
5097 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5098 ++FIdx) {
5099 Formula &F = LU.Formulae[FIdx];
5100 if (!F.ScaledReg)
5101 continue;
5102 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5103 if (P.second)
5104 continue;
5105
5106 Formula &Best = LU.Formulae[P.first->second];
5107 if (IsBetterThan(F, Best))
5108 std::swap(F, Best);
5109 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5110 dbgs() << "\n"
5111 " in favor of formula ";
5112 Best.print(dbgs()); dbgs() << '\n');
5113#ifndef NDEBUG
5114 ChangedFormulae = true;
5115#endif
5116 LU.DeleteFormula(F);
5117 --FIdx;
5118 --NumForms;
5119 Any = true;
5120 }
5121 if (Any)
5122 LU.RecomputeRegs(LUIdx, RegUses);
5123
5124 // Reset this to prepare for the next use.
5125 BestFormulae.clear();
5126 }
5127
5128 LLVM_DEBUG(if (ChangedFormulae) {
5129 dbgs() << "\n"
5130 "After filtering out undesirable candidates:\n";
5131 print_uses(dbgs());
5132 });
5133}
5134
5135 /// If we are over the complexity limit, filter out any post-inc preferring
5136/// variables to only post-inc values.
5137void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5138 if (AMK != TTI::AMK_PostIndexed)
5139 return;
5140 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5141 return;
5142
5143 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5144 "Narrowing the search space by choosing the lowest "
5145 "register Formula for PostInc Uses.\n");
5146
5147 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5148 LSRUse &LU = Uses[LUIdx];
5149
5150 if (LU.Kind != LSRUse::Address)
5151 continue;
5152 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5153 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5154 continue;
5155
5156 size_t MinRegs = std::numeric_limits<size_t>::max();
5157 for (const Formula &F : LU.Formulae)
5158 MinRegs = std::min(F.getNumRegs(), MinRegs);
5159
5160 bool Any = false;
5161 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5162 ++FIdx) {
5163 Formula &F = LU.Formulae[FIdx];
5164 if (F.getNumRegs() > MinRegs) {
5165 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5166 dbgs() << "\n");
5167 LU.DeleteFormula(F);
5168 --FIdx;
5169 --NumForms;
5170 Any = true;
5171 }
5172 }
5173 if (Any)
5174 LU.RecomputeRegs(LUIdx, RegUses);
5175
5176 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5177 break;
5178 }
5179
5180 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5181}
5182
5183 /// This function deletes formulas with a high expected register count.
5184 /// Assuming we don't know the value of each formula (we have already deleted
5185 /// all the inefficient ones), compute the probability of not selecting each
5186 /// register.
5187/// For example,
5188/// Use1:
5189/// reg(a) + reg({0,+,1})
5190/// reg(a) + reg({-1,+,1}) + 1
5191/// reg({a,+,1})
5192/// Use2:
5193/// reg(b) + reg({0,+,1})
5194/// reg(b) + reg({-1,+,1}) + 1
5195/// reg({b,+,1})
5196/// Use3:
5197/// reg(c) + reg(b) + reg({0,+,1})
5198/// reg(c) + reg({b,+,1})
5199///
5200/// Probability of not selecting
5201/// Use1 Use2 Use3
5202/// reg(a) (1/3) * 1 * 1
5203/// reg(b) 1 * (1/3) * (1/2)
5204/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5205/// reg({-1,+,1}) (2/3) * (2/3) * 1
5206/// reg({a,+,1}) (2/3) * 1 * 1
5207/// reg({b,+,1}) 1 * (2/3) * (2/3)
5208/// reg(c) 1 * 1 * 0
5209///
5210 /// Now compute the expected number of registers for each formula:
5211 /// Note that for each use we exclude the probability of not selecting that use.
5212/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
5213 /// probability 1/3 of not selecting for Use1).
5214/// Use1:
5215/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5216/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5217/// reg({a,+,1}) 1
5218/// Use2:
5219/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5220/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5221/// reg({b,+,1}) 2/3
5222/// Use3:
5223/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5224/// reg(c) + reg({b,+,1}) 1 + 2/3
5225void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5226 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5227 return;
5228 // Ok, we have too many formulae on our hands to conveniently handle.
5229 // Use a rough heuristic to thin out the list.
5230
5231 // Set of Regs which will be 100% used in the final solution.
5232 // Used in each formula of a solution (in example above this is reg(c)).
5233 // We can skip them in calculations.
5234 SmallPtrSet<const SCEV *, 4> UniqRegs;
5235 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5236
5237 // Map each register to the probability of it not being selected.
5238 DenseMap <const SCEV *, float> RegNumMap;
5239 for (const SCEV *Reg : RegUses) {
5240 if (UniqRegs.count(Reg))
5241 continue;
5242 float PNotSel = 1;
5243 for (const LSRUse &LU : Uses) {
5244 if (!LU.Regs.count(Reg))
5245 continue;
5246 float P = LU.getNotSelectedProbability(Reg);
5247 if (P != 0.0)
5248 PNotSel *= P;
5249 else
5250 UniqRegs.insert(Reg);
5251 }
5252 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5253 }
5254
5255 LLVM_DEBUG(
5256 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5257
5258 // Delete formulas where the expected number of registers is high.
5259 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5260 LSRUse &LU = Uses[LUIdx];
5261 // If there is nothing to delete, continue.
5262 if (LU.Formulae.size() < 2)
5263 continue;
5264 // This is a temporary solution to test performance. Float should be
5265 // replaced with a rounding-independent type (based on integers) to avoid
5266 // different results for different target builds.
5267 float FMinRegNum = LU.Formulae[0].getNumRegs();
5268 float FMinARegNum = LU.Formulae[0].getNumRegs();
5269 size_t MinIdx = 0;
5270 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5271 Formula &F = LU.Formulae[i];
5272 float FRegNum = 0;
5273 float FARegNum = 0;
5274 for (const SCEV *BaseReg : F.BaseRegs) {
5275 if (UniqRegs.count(BaseReg))
5276 continue;
5277 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5278 if (isa<SCEVAddRecExpr>(BaseReg))
5279 FARegNum +=
5280 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5281 }
5282 if (const SCEV *ScaledReg = F.ScaledReg) {
5283 if (!UniqRegs.count(ScaledReg)) {
5284 FRegNum +=
5285 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5286 if (isa<SCEVAddRecExpr>(ScaledReg))
5287 FARegNum +=
5288 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5289 }
5290 }
5291 if (FMinRegNum > FRegNum ||
5292 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5293 FMinRegNum = FRegNum;
5294 FMinARegNum = FARegNum;
5295 MinIdx = i;
5296 }
5297 }
5298 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5299 dbgs() << " with min reg num " << FMinRegNum << '\n');
5300 if (MinIdx != 0)
5301 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5302 while (LU.Formulae.size() != 1) {
5303 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5304 dbgs() << '\n');
5305 LU.Formulae.pop_back();
5306 }
5307 LU.RecomputeRegs(LUIdx, RegUses);
5308 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5309 Formula &F = LU.Formulae[0];
5310 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5311 // When we choose the formula, the regs become unique.
5312 UniqRegs.insert_range(F.BaseRegs);
5313 if (F.ScaledReg)
5314 UniqRegs.insert(F.ScaledReg);
5315 }
5316 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5317}
5318
5319 // Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5320 // whether the addressing offset +C would be legal where the negative offset -C
5321 // is not.
5322 static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5323 ScalarEvolution &SE, const SCEV *Best,
5324 const SCEV *Reg,
5325 MemAccessTy AccessType) {
5326 if (Best->getType() != Reg->getType() ||
5327 (isa<SCEVAddRecExpr>(Best) &&
5328 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5329 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5330 return false;
5331 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5332 if (!Diff)
5333 return false;
5334
5335 return TTI.isLegalAddressingMode(
5336 AccessType.MemTy, /*BaseGV=*/nullptr,
5337 /*BaseOffset=*/Diff->getSExtValue(),
5338 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5339 !TTI.isLegalAddressingMode(
5340 AccessType.MemTy, /*BaseGV=*/nullptr,
5341 /*BaseOffset=*/-Diff->getSExtValue(),
5342 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5343}
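// For example (hypothetical target): if the addressing mode can encode
// [reg + 16] but not [reg - 16], then of the otherwise equally ranked
// registers {X,+,1} and {X+16,+,1} the former is considered simpler:
// addresses expressed relative to it need the legal +16 offset, while basing
// them on the latter would require the illegal -16.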
5344
5345/// Pick a register which seems likely to be profitable, and then in any use
5346/// which has any reference to that register, delete all formulae which do not
5347/// reference that register.
5348void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5349 // With all other options exhausted, loop until the system is simple
5350 // enough to handle.
5351 SmallPtrSet<const SCEV *, 4> Taken;
5352 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5353 // Ok, we have too many formulae on our hands to conveniently handle.
5354 // Use a rough heuristic to thin out the list.
5355 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5356
5357 // Pick the register which is used by the most LSRUses, which is likely
5358 // to be a good reuse register candidate.
5359 const SCEV *Best = nullptr;
5360 unsigned BestNum = 0;
5361 for (const SCEV *Reg : RegUses) {
5362 if (Taken.count(Reg))
5363 continue;
5364 if (!Best) {
5365 Best = Reg;
5366 BestNum = RegUses.getUsedByIndices(Reg).count();
5367 } else {
5368 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5369 if (Count > BestNum) {
5370 Best = Reg;
5371 BestNum = Count;
5372 }
5373
5374 // If the scores are the same, but the Reg is simpler for the target
5375 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5376 // handle +C but not -C), opt for the simpler formula.
5377 if (Count == BestNum) {
5378 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5379 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5380 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5381 Uses[LUIdx].AccessTy)) {
5382 Best = Reg;
5383 BestNum = Count;
5384 }
5385 }
5386 }
5387 }
5388 assert(Best && "Failed to find best LSRUse candidate");
5389
5390 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5391 << " will yield profitable reuse.\n");
5392 Taken.insert(Best);
5393
5394 // In any use with formulae which references this register, delete formulae
5395 // which don't reference it.
5396 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5397 LSRUse &LU = Uses[LUIdx];
5398 if (!LU.Regs.count(Best)) continue;
5399
5400 bool Any = false;
5401 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5402 Formula &F = LU.Formulae[i];
5403 if (!F.referencesReg(Best)) {
5404 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5405 LU.DeleteFormula(F);
5406 --e;
5407 --i;
5408 Any = true;
5409 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5410 continue;
5411 }
5412 }
5413
5414 if (Any)
5415 LU.RecomputeRegs(LUIdx, RegUses);
5416 }
5417
5418 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5419 }
5420}
5421
5422/// If there are an extraordinary number of formulae to choose from, use some
5423/// rough heuristics to prune down the number of formulae. This keeps the main
5424/// solver from taking an extraordinary amount of time in some worst-case
5425/// scenarios.
5426void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5427 NarrowSearchSpaceByDetectingSupersets();
5428 NarrowSearchSpaceByCollapsingUnrolledCode();
5429 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5430 if (FilterSameScaledReg)
5431 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5432 NarrowSearchSpaceByFilterPostInc();
5433 if (LSRExpNarrow)
5434 NarrowSearchSpaceByDeletingCostlyFormulas();
5435 else
5436 NarrowSearchSpaceByPickingWinnerRegs();
5437}
5438
5439/// This is the recursive solver.
5440void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5441 Cost &SolutionCost,
5442 SmallVectorImpl<const Formula *> &Workspace,
5443 const Cost &CurCost,
5444 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5445 DenseSet<const SCEV *> &VisitedRegs) const {
5446 // Some ideas:
5447 // - prune more:
5448 // - use more aggressive filtering
5449 // - sort the formula so that the most profitable solutions are found first
5450 // - sort the uses too
5451 // - search faster:
5452 // - don't compute a cost, and then compare. compare while computing a cost
5453 // and bail early.
5454 // - track register sets with SmallBitVector
5455
5456 const LSRUse &LU = Uses[Workspace.size()];
5457
5458 // If this use references any register that's already a part of the
5459 // in-progress solution, consider it a requirement that a formula must
5460 // reference that register in order to be considered. This prunes out
5461 // unprofitable searching.
5462 SmallSetVector<const SCEV *, 4> ReqRegs;
5463 for (const SCEV *S : CurRegs)
5464 if (LU.Regs.count(S))
5465 ReqRegs.insert(S);
5466
5467 SmallPtrSet<const SCEV *, 16> NewRegs;
5468 Cost NewCost(L, SE, TTI, AMK);
5469 for (const Formula &F : LU.Formulae) {
5470 // Ignore formulae which may not be ideal in terms of register reuse of
5471 // ReqRegs. The formula should use all required registers before
5472 // introducing new ones.
5473 // This can sometimes (notably when trying to favour postinc) lead to
5474 // sub-optimal decisions. In those cases it is best left to the cost
5475 // modelling to get right.
5476 if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
5477 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5478 for (const SCEV *Reg : ReqRegs) {
5479 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5480 is_contained(F.BaseRegs, Reg)) {
5481 --NumReqRegsToFind;
5482 if (NumReqRegsToFind == 0)
5483 break;
5484 }
5485 }
5486 if (NumReqRegsToFind != 0) {
5487 // If none of the formulae satisfied the required registers, then we could
5488 // clear ReqRegs and try again. Currently, we simply give up in this case.
5489 continue;
5490 }
5491 }
5492
5493 // Evaluate the cost of the current formula. If it's already worse than
5494 // the current best, prune the search at that point.
5495 NewCost = CurCost;
5496 NewRegs = CurRegs;
5497 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5498 if (NewCost.isLess(SolutionCost)) {
5499 Workspace.push_back(&F);
5500 if (Workspace.size() != Uses.size()) {
5501 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5502 NewRegs, VisitedRegs);
5503 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5504 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5505 } else {
5506 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5507 dbgs() << ".\nRegs:\n";
5508 for (const SCEV *S : NewRegs) dbgs()
5509 << "- " << *S << "\n";
5510 dbgs() << '\n');
5511
5512 SolutionCost = NewCost;
5513 Solution = Workspace;
5514 }
5515 Workspace.pop_back();
5516 }
5517 }
5518}
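// A small illustration of the ReqRegs pruning above (schematic, not a trace of
// real output): suppose the partial solution already uses reg({0,+,1}<%loop>)
// and the next use has the formulae
//
//   reg(A) + reg({0,+,1}<%loop>)
//   reg({A,+,1}<%loop>)
//
// Only the first formula reuses the required register, so only that branch is
// explored; the second is skipped, unless post-increment addressing is being
// favoured for an address use, in which case the pruning is bypassed and the
// cost model decides.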
5519
5520/// Choose one formula from each use. Return the results in the given Solution
5521/// vector.
5522void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5523 SmallVector<const Formula *, 8> Workspace;
5524 Cost SolutionCost(L, SE, TTI, AMK);
5525 SolutionCost.Lose();
5526 Cost CurCost(L, SE, TTI, AMK);
5527 SmallPtrSet<const SCEV *, 16> CurRegs;
5528 DenseSet<const SCEV *> VisitedRegs;
5529 Workspace.reserve(Uses.size());
5530
5531 // SolveRecurse does all the work.
5532 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5533 CurRegs, VisitedRegs);
5534 if (Solution.empty()) {
5535 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5536 return;
5537 }
5538
5539 // Ok, we've now made all our decisions.
5540 LLVM_DEBUG(dbgs() << "\n"
5541 "The chosen solution requires ";
5542 SolutionCost.print(dbgs()); dbgs() << ":\n";
5543 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5544 dbgs() << " ";
5545 Uses[i].print(dbgs());
5546 dbgs() << "\n"
5547 " ";
5548 Solution[i]->print(dbgs());
5549 dbgs() << '\n';
5550 });
5551
5552 assert(Solution.size() == Uses.size() && "Malformed solution!");
5553
5554 const bool EnableDropUnprofitableSolution = [&] {
5555 switch (AllowDropSolutionIfLessProfitable) {
5556 case cl::BOU_TRUE:
5557 return true;
5558 case cl::BOU_FALSE:
5559 return false;
5560 case cl::BOU_UNSET:
5561 return TTI.shouldDropLSRSolutionIfLessProfitable();
5562 }
5563 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5564 }();
5565
5566 if (BaselineCost.isLess(SolutionCost)) {
5567 if (!EnableDropUnprofitableSolution)
5568 LLVM_DEBUG(
5569 dbgs() << "Baseline is more profitable than chosen solution, "
5570 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5571 else {
5572 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5573 "solution, dropping LSR solution.\n";);
5574 Solution.clear();
5575 }
5576 }
5577}
5578
5579 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5580 /// as we can go while still being dominated by the input positions. This helps
5581/// canonicalize the insert position, which encourages sharing.
5582 BasicBlock::iterator
5583 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5584 const SmallVectorImpl<Instruction *> &Inputs)
5585 const {
5586 Instruction *Tentative = &*IP;
5587 while (true) {
5588 bool AllDominate = true;
5589 Instruction *BetterPos = nullptr;
5590 // Don't bother attempting to insert before a catchswitch; its basic block
5591 // cannot have other non-PHI instructions.
5592 if (isa<CatchSwitchInst>(Tentative))
5593 return IP;
5594
5595 for (Instruction *Inst : Inputs) {
5596 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5597 AllDominate = false;
5598 break;
5599 }
5600 // Attempt to find an insert position in the middle of the block,
5601 // instead of at the end, so that it can be used for other expansions.
5602 if (Tentative->getParent() == Inst->getParent() &&
5603 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5604 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5605 }
5606 if (!AllDominate)
5607 break;
5608 if (BetterPos)
5609 IP = BetterPos->getIterator();
5610 else
5611 IP = Tentative->getIterator();
5612
5613 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5614 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5615
5616 BasicBlock *IDom;
5617 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5618 if (!Rung) return IP;
5619 Rung = Rung->getIDom();
5620 if (!Rung) return IP;
5621 IDom = Rung->getBlock();
5622
5623 // Don't climb into a loop though.
5624 const Loop *IDomLoop = LI.getLoopFor(IDom);
5625 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5626 if (IDomDepth <= IPLoopDepth &&
5627 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5628 break;
5629 }
5630
5631 Tentative = IDom->getTerminator();
5632 }
5633
5634 return IP;
5635}
5636
5637/// Determine an input position which will be dominated by the operands and
5638/// which will dominate the result.
5639BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5640 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5641 // Collect some instructions which must be dominated by the
5642 // expanding replacement. These must be dominated by any operands that
5643 // will be required in the expansion.
5644 SmallVector<Instruction *, 4> Inputs;
5645 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5646 Inputs.push_back(I);
5647 if (LU.Kind == LSRUse::ICmpZero)
5648 if (Instruction *I =
5649 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5650 Inputs.push_back(I);
5651 if (LF.PostIncLoops.count(L)) {
5652 if (LF.isUseFullyOutsideLoop(L))
5653 Inputs.push_back(L->getLoopLatch()->getTerminator());
5654 else
5655 Inputs.push_back(IVIncInsertPos);
5656 }
5657 // The expansion must also be dominated by the increment positions of any
5658 // loops for which it is using post-inc mode.
5659 for (const Loop *PIL : LF.PostIncLoops) {
5660 if (PIL == L) continue;
5661
5662 // Be dominated by the loop exit.
5663 SmallVector<BasicBlock *, 4> ExitingBlocks;
5664 PIL->getExitingBlocks(ExitingBlocks);
5665 if (!ExitingBlocks.empty()) {
5666 BasicBlock *BB = ExitingBlocks[0];
5667 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5668 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5669 Inputs.push_back(BB->getTerminator());
5670 }
5671 }
5672
5673 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5674 "Insertion point must be a normal instruction");
5675
5676 // Then, climb up the immediate dominator tree as far as we can go while
5677 // still being dominated by the input positions.
5678 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5679
5680 // Don't insert instructions before PHI nodes.
5681 while (isa<PHINode>(IP)) ++IP;
5682
5683 // Ignore landingpad instructions.
5684 while (IP->isEHPad()) ++IP;
5685
5686 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5687 // IP consistent across expansions and allows the previously inserted
5688 // instructions to be reused by subsequent expansion.
5689 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5690 ++IP;
5691
5692 return IP;
5693}
5694
5695/// Emit instructions for the leading candidate expression for this LSRUse (this
5696/// is called "expanding").
5697Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5698 const Formula &F, BasicBlock::iterator IP,
5699 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5700 if (LU.RigidFormula)
5701 return LF.OperandValToReplace;
5702
5703 // Determine an input position which will be dominated by the operands and
5704 // which will dominate the result.
5705 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5706 Rewriter.setInsertPoint(&*IP);
5707
5708 // Inform the Rewriter if we have a post-increment use, so that it can
5709 // perform an advantageous expansion.
5710 Rewriter.setPostInc(LF.PostIncLoops);
5711
5712 // This is the type that the user actually needs.
5713 Type *OpTy = LF.OperandValToReplace->getType();
5714 // This will be the type that we'll initially expand to.
5715 Type *Ty = F.getType();
5716 if (!Ty)
5717 // No type known; just expand directly to the ultimate type.
5718 Ty = OpTy;
5719 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5720 // Expand directly to the ultimate type if it's the right size.
5721 Ty = OpTy;
5722 // This is the type to do integer arithmetic in.
5723 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5724
5725 // Build up a list of operands to add together to form the full base.
5726 SmallVector<const SCEV *, 8> Ops;
5727
5728 // Expand the BaseRegs portion.
5729 for (const SCEV *Reg : F.BaseRegs) {
5730 assert(!Reg->isZero() && "Zero allocated in a base register!");
5731
5732 // If we're expanding for a post-inc user, make the post-inc adjustment.
5733 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5734 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5735 }
5736
5737 // Expand the ScaledReg portion.
5738 Value *ICmpScaledV = nullptr;
5739 if (F.Scale != 0) {
5740 const SCEV *ScaledS = F.ScaledReg;
5741
5742 // If we're expanding for a post-inc user, make the post-inc adjustment.
5743 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5744 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5745
5746 if (LU.Kind == LSRUse::ICmpZero) {
5747 // Expand ScaleReg as if it was part of the base regs.
5748 if (F.Scale == 1)
5749 Ops.push_back(
5750 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5751 else {
5752 // An interesting way of "folding" with an icmp is to use a negated
5753 // scale, which we'll implement by inserting it into the other operand
5754 // of the icmp.
5755 assert(F.Scale == -1 &&
5756 "The only scale supported by ICmpZero uses is -1!");
5757 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5758 }
5759 } else {
5760 // Otherwise just expand the scaled register and an explicit scale,
5761 // which is expected to be matched as part of the address.
5762
5763 // Flush the operand list to suppress SCEVExpander hoisting address modes,
5764 // unless the addressing mode will not be folded.
5765 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5766 isAMCompletelyFolded(TTI, LU, F)) {
5767 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5768 Ops.clear();
5769 Ops.push_back(SE.getUnknown(FullV));
5770 }
5771 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5772 if (F.Scale != 1)
5773 ScaledS =
5774 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5775 Ops.push_back(ScaledS);
5776 }
5777 }
5778
5779 // Expand the GV portion.
5780 if (F.BaseGV) {
5781 // Flush the operand list to suppress SCEVExpander hoisting.
5782 if (!Ops.empty()) {
5783 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5784 Ops.clear();
5785 Ops.push_back(SE.getUnknown(FullV));
5786 }
5787 Ops.push_back(SE.getUnknown(F.BaseGV));
5788 }
5789
5790 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5791 // unfolded offsets. LSR assumes they both live next to their uses.
5792 if (!Ops.empty()) {
5793 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5794 Ops.clear();
5795 Ops.push_back(SE.getUnknown(FullV));
5796 }
5797
5798 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5799 // out at this point, or should we generate a SCEV adding together mixed
5800 // offsets?
5801 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5802 "Expanding mismatched offsets\n");
5803 // Expand the immediate portion.
5804 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5805 if (Offset.isNonZero()) {
5806 if (LU.Kind == LSRUse::ICmpZero) {
5807 // The other interesting way of "folding" with an ICmpZero is to use a
5808 // negated immediate.
5809 if (!ICmpScaledV) {
5810 // TODO: Avoid implicit trunc?
5811 // See https://github.com/llvm/llvm-project/issues/112510.
5812 ICmpScaledV = ConstantInt::getSigned(
5813 IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
5814 } else {
5815 Ops.push_back(SE.getUnknown(ICmpScaledV));
5816 ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
5817 /*ImplicitTrunc=*/true);
5818 }
5819 } else {
5820 // Just add the immediate values. These again are expected to be matched
5821 // as part of the address.
5822 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5823 }
5824 }
5825
5826 // Expand the unfolded offset portion.
5827 Immediate UnfoldedOffset = F.UnfoldedOffset;
5828 if (UnfoldedOffset.isNonZero()) {
5829 // Just add the immediate values.
5830 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5831 }
5832
5833 // Emit instructions summing all the operands.
5834 const SCEV *FullS = Ops.empty() ?
5835 SE.getConstant(IntTy, 0) :
5836 SE.getAddExpr(Ops);
5837 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5838
5839 // We're done expanding now, so reset the rewriter.
5840 Rewriter.clearPostInc();
5841
5842 // An ICmpZero Formula represents an ICmp which we're handling as a
5843 // comparison against zero. Now that we've expanded an expression for that
5844 // form, update the ICmp's other operand.
5845 if (LU.Kind == LSRUse::ICmpZero) {
5846 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5847 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5848 DeadInsts.emplace_back(OperandIsInstr);
5849 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5850 "a scale at the same time!");
5851 if (F.Scale == -1) {
5852 if (ICmpScaledV->getType() != OpTy) {
5853 Instruction *Cast = CastInst::Create(
5854 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5855 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5856 ICmpScaledV = Cast;
5857 }
5858 CI->setOperand(1, ICmpScaledV);
5859 } else {
5860 // A scale of 1 means that the scale has been expanded as part of the
5861 // base regs.
5862 assert((F.Scale == 0 || F.Scale == 1) &&
5863 "ICmp does not support folding a global value and "
5864 "a scale at the same time!");
5865 // TODO: Avoid implicit trunc?
5866 // See https://github.com/llvm/llvm-project/issues/112510.
5867 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5868 -(uint64_t)Offset.getFixedValue(),
5869 /*ImplicitTrunc=*/true);
5870 if (C->getType() != OpTy) {
5871 C = ConstantFoldCastOperand(
5872 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5873 CI->getDataLayout());
5874 assert(C && "Cast of ConstantInt should have folded");
5875 }
5876
5877 CI->setOperand(1, C);
5878 }
5879 }
5880
5881 return FullV;
5882}
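// For example (schematic, not actual pass output): expanding the formula
//
//   reg(%base) + 4*reg({0,+,1}<%loop>) + 8
//
// for an address use, where %iv is the expanded value of {0,+,1}, produces IR
// along the lines of
//
//   %scaled = mul i64 %iv, 4
//   %sum    = add i64 %base, %scaled
//   %addr   = add i64 %sum, 8
//
// with the final adds emitted next to the user so the target has the best
// chance of folding the scale and offset into its addressing mode.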
5883
5884/// Helper for Rewrite. PHI nodes are special because the use of their operands
5885/// effectively happens in their predecessor blocks, so the expression may need
5886/// to be expanded in multiple places.
5887void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5888 const LSRFixup &LF, const Formula &F,
5889 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5890 DenseMap<BasicBlock *, Value *> Inserted;
5891
5892 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5893 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5894 bool needUpdateFixups = false;
5895 BasicBlock *BB = PN->getIncomingBlock(i);
5896
5897 // If this is a critical edge, split the edge so that we do not insert
5898 // the code on all predecessor/successor paths. We do this unless this
5899 // is the canonical backedge for this loop, which complicates post-inc
5900 // users.
5901 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5902 !isa<IndirectBrInst>(BB->getTerminator()) &&
5903 !isa<CatchSwitchInst>(BB->getTerminator())) {
5904 BasicBlock *Parent = PN->getParent();
5905 Loop *PNLoop = LI.getLoopFor(Parent);
5906 if (!PNLoop || Parent != PNLoop->getHeader()) {
5907 // Split the critical edge.
5908 BasicBlock *NewBB = nullptr;
5909 if (!Parent->isLandingPad()) {
5910 NewBB =
5911 SplitCriticalEdge(BB, Parent,
5912 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5913 .setMergeIdenticalEdges()
5914 .setKeepOneInputPHIs());
5915 } else {
5916 SmallVector<BasicBlock*, 2> NewBBs;
5917 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5918 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5919 NewBB = NewBBs[0];
5920 }
5921 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5922 // phi predecessors are identical. The simple thing to do is skip
5923 // splitting in this case rather than complicate the API.
5924 if (NewBB) {
5925 // If PN is outside of the loop and BB is in the loop, we want to
5926 // move the block to be immediately before the PHI block, not
5927 // immediately after BB.
5928 if (L->contains(BB) && !L->contains(PN))
5929 NewBB->moveBefore(PN->getParent());
5930
5931 // Splitting the edge can reduce the number of PHI entries we have.
5932 e = PN->getNumIncomingValues();
5933 BB = NewBB;
5934 i = PN->getBasicBlockIndex(BB);
5935
5936 needUpdateFixups = true;
5937 }
5938 }
5939 }
5940
5941 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5942 Inserted.try_emplace(BB);
5943 if (!Pair.second)
5944 PN->setIncomingValue(i, Pair.first->second);
5945 else {
5946 Value *FullV =
5947 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5948
5949 // If this is reuse-by-noop-cast, insert the noop cast.
5950 Type *OpTy = LF.OperandValToReplace->getType();
5951 if (FullV->getType() != OpTy)
5952 FullV = CastInst::Create(
5953 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5954 LF.OperandValToReplace->getType(), "tmp",
5955 BB->getTerminator()->getIterator());
5956
5957 // If the incoming block for this value is not in the loop, it means the
5958 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5959 // the inserted value.
5960 if (auto *I = dyn_cast<Instruction>(FullV))
5961 if (L->contains(I) && !L->contains(BB))
5962 InsertedNonLCSSAInsts.insert(I);
5963
5964 PN->setIncomingValue(i, FullV);
5965 Pair.first->second = FullV;
5966 }
5967
5968 // If LSR splits a critical edge and the phi node has other pending
5969 // fixup operands, we need to update those pending fixups. Otherwise
5970 // formulae will not be implemented completely and some instructions
5971 // will not be eliminated.
5972 if (needUpdateFixups) {
5973 for (LSRUse &LU : Uses)
5974 for (LSRFixup &Fixup : LU.Fixups)
5975 // If a fixup is supposed to rewrite some operand in the phi
5976 // that was just updated, it may already have been moved to
5977 // another phi node. Such a fixup requires an update.
5978 if (Fixup.UserInst == PN) {
5979 // Check if the operand we try to replace still exists in the
5980 // original phi.
5981 bool foundInOriginalPHI = false;
5982 for (const auto &val : PN->incoming_values())
5983 if (val == Fixup.OperandValToReplace) {
5984 foundInOriginalPHI = true;
5985 break;
5986 }
5987
5988 // If the fixup operand was found in the original PHI, there is nothing to do.
5989 if (foundInOriginalPHI)
5990 continue;
5991
5992 // Otherwise it might have been moved to another PHI and require an update.
5993 // If the fixup operand is not found in any of the incoming blocks, that
5994 // means we have already rewritten it, so there is nothing to do.
5995 for (const auto &Block : PN->blocks())
5996 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5997 ++I) {
5998 PHINode *NewPN = cast<PHINode>(I);
5999 for (const auto &val : NewPN->incoming_values())
6000 if (val == Fixup.OperandValToReplace)
6001 Fixup.UserInst = NewPN;
6002 }
6003 }
6004 }
6005 }
6006}
6007
6008/// Emit instructions for the leading candidate expression for this LSRUse (this
6009/// is called "expanding"), and update the UserInst to reference the newly
6010/// expanded value.
6011void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6012 const Formula &F,
6013 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6014 // First, find an insertion point that dominates UserInst. For PHI nodes,
6015 // find the nearest block which dominates all the relevant uses.
6016 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6017 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6018 } else {
6019 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6020
6021 // If this is reuse-by-noop-cast, insert the noop cast.
6022 Type *OpTy = LF.OperandValToReplace->getType();
6023 if (FullV->getType() != OpTy) {
6024 Instruction *Cast =
6025 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6026 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6027 FullV = Cast;
6028 }
6029
6030 // Update the user. ICmpZero is handled specially here (for now) because
6031 // Expand may have updated one of the operands of the icmp already, and
6032 // its new value may happen to be equal to LF.OperandValToReplace, in
6033 // which case doing replaceUsesOfWith leads to replacing both operands
6034 // with the same value. TODO: Reorganize this.
6035 if (LU.Kind == LSRUse::ICmpZero)
6036 LF.UserInst->setOperand(0, FullV);
6037 else
6038 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6039 }
6040
6041 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6042 DeadInsts.emplace_back(OperandIsInstr);
6043}
6044
6045// Determine where to insert the transformed IV increment instruction for this
6046// fixup. By default this is the default insert position, but if this is a
6047// postincrement opportunity then we try to insert it in the same block as the
6048// fixup user instruction, as this is needed for a postincrement instruction to
6049// be generated.
6050 static Instruction *getFixupInsertPos(const TargetTransformInfo &TTI,
6051 const LSRFixup &Fixup, const LSRUse &LU,
6052 Instruction *IVIncInsertPos,
6053 DominatorTree &DT) {
6054 // Only address uses can be postincremented
6055 if (LU.Kind != LSRUse::Address)
6056 return IVIncInsertPos;
6057
6058 // Don't try to postincrement if it's not legal
6059 Instruction *I = Fixup.UserInst;
6060 Type *Ty = I->getType();
6061 if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
6062 !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
6063 return IVIncInsertPos;
6064
6065 // It's only legal to hoist to the user block if it dominates the default
6066 // insert position.
6067 BasicBlock *HoistBlock = I->getParent();
6068 BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
6069 if (!DT.dominates(I, IVIncBlock))
6070 return IVIncInsertPos;
6071
6072 return HoistBlock->getTerminator();
6073}
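// For example (schematic): on a target with post-incrementing loads, placing
// the increment in the load's block
//
//   loop:
//     %p = phi ptr [ %init, %entry ], [ %p.next, %loop ]
//     %v = load i32, ptr %p
//     %p.next = getelementptr i8, ptr %p, i64 4   ; kept next to the load
//     ...
//
// lets instruction selection fold %p.next into a single post-increment load,
// which would not happen if the increment stayed at the default position in
// another block.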
6074
6075/// Rewrite all the fixup locations with new values, following the chosen
6076/// solution.
6077void LSRInstance::ImplementSolution(
6078 const SmallVectorImpl<const Formula *> &Solution) {
6079 // Keep track of instructions we may have made dead, so that
6080 // we can remove them after we are done working.
6081 SmallVector<WeakTrackingVH, 16> DeadInsts;
6082
6083 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6084 for (const IVChain &Chain : IVChainVec) {
6085 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6086 Rewriter.setChainedPhi(PN);
6087 }
6088
6089 // Expand the new value definitions and update the users.
6090 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6091 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6092 Instruction *InsertPos =
6093 getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
6094 Rewriter.setIVIncInsertPos(L, InsertPos);
6095 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6096 Changed = true;
6097 }
6098
6099 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6100 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6101
6102 for (const IVChain &Chain : IVChainVec) {
6103 GenerateIVChain(Chain, DeadInsts);
6104 Changed = true;
6105 }
6106
6107 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6108 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6109 ScalarEvolutionIVs.push_back(IV);
6110
6111 // Clean up after ourselves. This must be done before deleting any
6112 // instructions.
6113 Rewriter.clear();
6114
6115 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6116 &TLI, MSSAU);
6117
6118 // In our cost analysis above, we assume that each addrec consumes exactly
6119 // one register, and arrange to have increments inserted just before the
6120 // latch to maximimize the chance this is true. However, if we reused
6121 // latch to maximize the chance this is true. However, if we reused
6122 // existing IVs, we now need to move the increments to match our
6123 // expectations. Otherwise, our cost modeling results in us having
6124 // scheduling decision does impact later codegen.)
6125 for (PHINode &PN : L->getHeader()->phis()) {
6126 BinaryOperator *BO = nullptr;
6127 Value *Start = nullptr, *Step = nullptr;
6128 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6129 continue;
6130
6131 switch (BO->getOpcode()) {
6132 case Instruction::Sub:
6133 if (BO->getOperand(0) != &PN)
6134 // sub is non-commutative - match handling elsewhere in LSR
6135 continue;
6136 break;
6137 case Instruction::Add:
6138 break;
6139 default:
6140 continue;
6141 };
6142
6143 if (!isa<Constant>(Step))
6144 // If not a constant step, might increase register pressure
6145 // (We assume constants have been canonicalized to RHS)
6146 continue;
6147
6148 if (BO->getParent() == IVIncInsertPos->getParent())
6149 // Only bother moving across blocks. Isel can handle block local case.
6150 continue;
6151
6152 // Can we legally schedule inc at the desired point?
6153 if (!llvm::all_of(BO->uses(),
6154 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6155 continue;
6156 BO->moveBefore(IVIncInsertPos->getIterator());
6157 Changed = true;
6158 }
6159
6160
6161}
6162
6163LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6164 DominatorTree &DT, LoopInfo &LI,
6165 const TargetTransformInfo &TTI, AssumptionCache &AC,
6166 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6167 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6168 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6169 ? PreferredAddresingMode
6170 : TTI.getPreferredAddressingMode(L, &SE)),
6171 Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
6172 // If LoopSimplify form is not available, stay out of trouble.
6173 if (!L->isLoopSimplifyForm())
6174 return;
6175
6176 // If there's no interesting work to be done, bail early.
6177 if (IU.empty()) return;
6178
6179 // If there's too much analysis to be done, bail early. We won't be able to
6180 // model the problem anyway.
6181 unsigned NumUsers = 0;
6182 for (const IVStrideUse &U : IU) {
6183 if (++NumUsers > MaxIVUsers) {
6184 (void)U;
6185 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6186 << "\n");
6187 return;
6188 }
6189 // Bail out if we have a PHI on an EHPad that gets a value from a
6190 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6191 // no good place to stick any instructions.
6192 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6193 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6194 if (isa<FuncletPadInst>(FirstNonPHI) ||
6195 isa<CatchSwitchInst>(FirstNonPHI))
6196 for (BasicBlock *PredBB : PN->blocks())
6197 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6198 return;
6199 }
6200 }
6201
6202 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6203 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6204 dbgs() << ":\n");
6205
6206 // Check if we expect this loop to use a hardware loop instruction, which will
6207 // be used when calculating the costs of formulas.
6208 HardwareLoopInfo HWLoopInfo(L);
6209 HardwareLoopProfitable =
6210 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6211
6212 // Configure SCEVExpander already now, so the correct mode is used for
6213 // isSafeToExpand() checks.
6214#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6215 Rewriter.setDebugType(DEBUG_TYPE);
6216#endif
6217 Rewriter.disableCanonicalMode();
6218 Rewriter.enableLSRMode();
6219
6220 // First, perform some low-level loop optimizations.
6221 OptimizeShadowIV();
6222 OptimizeLoopTermCond();
6223
6224 // If loop preparation eliminates all interesting IV users, bail.
6225 if (IU.empty()) return;
6226
6227 // Skip nested loops until we can model them better with formulae.
6228 if (!L->isInnermost()) {
6229 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6230 return;
6231 }
6232
6233 // Start collecting data and preparing for the solver.
6234 // If the number of registers is not the major cost, we cannot benefit from
6235 // the current profitable chain optimization, which is based on the number
6236 // of registers.
6237 // FIXME: add profitable chain optimization for other kinds of major cost,
6238 // for example the number of instructions.
6239 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6240 CollectChains();
6241 CollectInterestingTypesAndFactors();
6242 CollectFixupsAndInitialFormulae();
6243 CollectLoopInvariantFixupsAndFormulae();
6244
6245 if (Uses.empty())
6246 return;
6247
6248 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6249 print_uses(dbgs()));
6250 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6251 BaselineCost.print(dbgs()); dbgs() << "\n");
6252
6253 // Now use the reuse data to generate a bunch of interesting ways
6254 // to formulate the values needed for the uses.
6255 GenerateAllReuseFormulae();
6256
6257 FilterOutUndesirableDedicatedRegisters();
6258 NarrowSearchSpaceUsingHeuristics();
6259
6260 SmallVector<const Formula *, 8> Solution;
6261 Solve(Solution);
6262
6263 // Release memory that is no longer needed.
6264 Factors.clear();
6265 Types.clear();
6266 RegUses.clear();
6267
6268 if (Solution.empty())
6269 return;
6270
6271#ifndef NDEBUG
6272 // Formulae should be legal.
6273 for (const LSRUse &LU : Uses) {
6274 for (const Formula &F : LU.Formulae)
6275 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6276 F) && "Illegal formula generated!");
6277 };
6278#endif
6279
6280 // Now that we've decided what we want, make it so.
6281 ImplementSolution(Solution);
6282}
6283
6284#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6285void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6286 if (Factors.empty() && Types.empty()) return;
6287
6288 OS << "LSR has identified the following interesting factors and types: ";
6289 bool First = true;
6290
6291 for (int64_t Factor : Factors) {
6292 if (!First) OS << ", ";
6293 First = false;
6294 OS << '*' << Factor;
6295 }
6296
6297 for (Type *Ty : Types) {
6298 if (!First) OS << ", ";
6299 First = false;
6300 OS << '(' << *Ty << ')';
6301 }
6302 OS << '\n';
6303}
6304
6305void LSRInstance::print_fixups(raw_ostream &OS) const {
6306 OS << "LSR is examining the following fixup sites:\n";
6307 for (const LSRUse &LU : Uses)
6308 for (const LSRFixup &LF : LU.Fixups) {
6309 dbgs() << " ";
6310 LF.print(OS);
6311 OS << '\n';
6312 }
6313}
6314
6315void LSRInstance::print_uses(raw_ostream &OS) const {
6316 OS << "LSR is examining the following uses:\n";
6317 for (const LSRUse &LU : Uses) {
6318 dbgs() << " ";
6319 LU.print(OS);
6320 OS << '\n';
6321 for (const Formula &F : LU.Formulae) {
6322 OS << " ";
6323 F.print(OS);
6324 OS << '\n';
6325 }
6326 }
6327}
6328
6329void LSRInstance::print(raw_ostream &OS) const {
6330 print_factors_and_types(OS);
6331 print_fixups(OS);
6332 print_uses(OS);
6333}
6334
6335LLVM_DUMP_METHOD void LSRInstance::dump() const {
6336 print(errs()); errs() << '\n';
6337}
6338#endif
6339
6340namespace {
6341
6342class LoopStrengthReduce : public LoopPass {
6343public:
6344 static char ID; // Pass ID, replacement for typeid
6345
6346 LoopStrengthReduce();
6347
6348private:
6349 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6350 void getAnalysisUsage(AnalysisUsage &AU) const override;
6351};
6352
6353} // end anonymous namespace
6354
6355LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6356 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6357}
6358
6359void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6360 // We split critical edges, so we change the CFG. However, we do update
6361 // many analyses if they are around.
6363
6364 AU.addRequired<LoopInfoWrapperPass>();
6365 AU.addPreserved<LoopInfoWrapperPass>();
6367 AU.addRequired<DominatorTreeWrapperPass>();
6368 AU.addPreserved<DominatorTreeWrapperPass>();
6369 AU.addRequired<ScalarEvolutionWrapperPass>();
6370 AU.addPreserved<ScalarEvolutionWrapperPass>();
6371 AU.addRequired<AssumptionCacheTracker>();
6372 AU.addRequired<TargetLibraryInfoWrapperPass>();
6373 // Requiring LoopSimplify a second time here prevents IVUsers from running
6374 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6375 AU.addRequiredID(LoopSimplifyID);
6376 AU.addRequired<IVUsersWrapperPass>();
6377 AU.addPreserved<IVUsersWrapperPass>();
6378 AU.addRequired<TargetTransformInfoWrapperPass>();
6379 AU.addPreserved<MemorySSAWrapperPass>();
6380}
6381
6382namespace {
6383
6384/// Enables more convenient iteration over a DWARF expression vector.
6385static llvm::iterator_range<llvm::DIExpression::expr_op_iterator>
6386ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6387 llvm::DIExpression::expr_op_iterator Begin =
6388 llvm::DIExpression::expr_op_iterator(Expr.begin());
6389 llvm::DIExpression::expr_op_iterator End =
6390 llvm::DIExpression::expr_op_iterator(Expr.end());
6391 return {Begin, End};
6392}
6393
6394struct SCEVDbgValueBuilder {
6395 SCEVDbgValueBuilder() = default;
6396 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6397
6398 void clone(const SCEVDbgValueBuilder &Base) {
6399 LocationOps = Base.LocationOps;
6400 Expr = Base.Expr;
6401 }
6402
6403 void clear() {
6404 LocationOps.clear();
6405 Expr.clear();
6406 }
6407
6408 /// The DIExpression as we translate the SCEV.
6409 SmallVector<uint64_t, 6> Expr;
6410 /// The location ops of the DIExpression.
6411 SmallVector<Value *, 2> LocationOps;
6412
6413 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6414 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6415
6416 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6417 /// in the set of values referenced by the expression.
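  /// For example, if LocationOps already holds {%a}, pushing %b appends
  /// DW_OP_LLVM_arg, 1 and records %b at index 1; pushing %a again would
  /// reuse index 0 instead of adding a new location.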
6418 void pushLocation(llvm::Value *V) {
6419 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6420 auto *It = llvm::find(LocationOps, V);
6421 unsigned ArgIndex = 0;
6422 if (It != LocationOps.end()) {
6423 ArgIndex = std::distance(LocationOps.begin(), It);
6424 } else {
6425 ArgIndex = LocationOps.size();
6426 LocationOps.push_back(V);
6427 }
6428 Expr.push_back(ArgIndex);
6429 }
6430
6431 void pushValue(const SCEVUnknown *U) {
6432 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6433 pushLocation(V);
6434 }
6435
6436 bool pushConst(const SCEVConstant *C) {
6437 if (C->getAPInt().getSignificantBits() > 64)
6438 return false;
6439 Expr.push_back(llvm::dwarf::DW_OP_consts);
6440 Expr.push_back(C->getAPInt().getSExtValue());
6441 return true;
6442 }
6443
6444 // Iterating the expression as DWARF ops is convenient when updating
6445 // DWARF_OP_LLVM_args.
6446 llvm::iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6447 return ToDwarfOpIter(Expr);
6448 }
6449
6450 /// Several SCEV types are sequences of the same arithmetic operator applied
6451 /// to constants and values that may be extended or truncated.
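  /// For example, an add SCEV (%a + %b + %c) with DwarfOp == DW_OP_plus is
  /// emitted post-order as: %a, %b, DW_OP_plus, %c, DW_OP_plus.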
6452 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6453 uint64_t DwarfOp) {
6454 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6455 "Expected arithmetic SCEV type");
6456 bool Success = true;
6457 unsigned EmitOperator = 0;
6458 for (const auto &Op : CommExpr->operands()) {
6459 Success &= pushSCEV(Op);
6460
6461 if (EmitOperator >= 1)
6462 pushOperator(DwarfOp);
6463 ++EmitOperator;
6464 }
6465 return Success;
6466 }
6467
6468 // TODO: Identify and omit noop casts.
6469 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6470 const llvm::SCEV *Inner = C->getOperand(0);
6471 const llvm::Type *Type = C->getType();
6472 uint64_t ToWidth = Type->getIntegerBitWidth();
6473 bool Success = pushSCEV(Inner);
6474 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6475 IsSigned ? llvm::dwarf::DW_ATE_signed
6476 : llvm::dwarf::DW_ATE_unsigned};
6477 for (const auto &Op : CastOps)
6478 pushOperator(Op);
6479 return Success;
6480 }
6481
6482 // TODO: MinMax - although these haven't been encountered in the test suite.
6483 bool pushSCEV(const llvm::SCEV *S) {
6484 bool Success = true;
6485 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6486 Success &= pushConst(StartInt);
6487
6488 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6489 if (!U->getValue())
6490 return false;
6491 pushLocation(U->getValue());
6492
6493 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6494 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6495
6496 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6497 Success &= pushSCEV(UDiv->getLHS());
6498 Success &= pushSCEV(UDiv->getRHS());
6499 pushOperator(llvm::dwarf::DW_OP_div);
6500
6501 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6502 // Assert if a new and unknown SCEVCastExpr type is encountered.
6505 "Unexpected cast type in SCEV.");
6506 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6507
6508 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6509 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6510
6511 } else if (isa<SCEVAddRecExpr>(S)) {
6512 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6513 // unsupported.
6514 return false;
6515
6516 } else {
6517 return false;
6518 }
6519 return Success;
6520 }
6521
6522 /// Return true if the combination of arithmetic operator and underlying
6523 /// SCEV constant value is an identity function.
6524 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6525 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6526 if (C->getAPInt().getSignificantBits() > 64)
6527 return false;
6528 int64_t I = C->getAPInt().getSExtValue();
6529 switch (Op) {
6530 case llvm::dwarf::DW_OP_plus:
6531 case llvm::dwarf::DW_OP_minus:
6532 return I == 0;
6533 case llvm::dwarf::DW_OP_mul:
6534 case llvm::dwarf::DW_OP_div:
6535 return I == 1;
6536 }
6537 }
6538 return false;
6539 }
6540
6541 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6542 /// builder's expression stack. The stack should already contain an
6543 /// expression for the iteration count, so that it can be multiplied by
6544 /// the stride and added to the start.
6545 /// Components of the expression are omitted if they are an identity function.
6546 /// Chain (non-affine) SCEVs are not supported.
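  /// For example, given {%start,+,%stride} and the iteration count already on
  /// the stack, this emits: %stride, DW_OP_mul, %start, DW_OP_plus, skipping
  /// a multiply by 1 or an add of 0.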
6547 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6548 assert(SAR.isAffine() && "Expected affine SCEV");
6549 const SCEV *Start = SAR.getStart();
6550 const SCEV *Stride = SAR.getStepRecurrence(SE);
6551
6552 // Skip pushing arithmetic noops.
6553 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6554 if (!pushSCEV(Stride))
6555 return false;
6556 pushOperator(llvm::dwarf::DW_OP_mul);
6557 }
6558 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6559 if (!pushSCEV(Start))
6560 return false;
6561 pushOperator(llvm::dwarf::DW_OP_plus);
6562 }
6563 return true;
6564 }
6565
6566 /// Create an expression that is an offset from a value (usually the IV).
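  /// For example, with the IV as location 0, an offset of 8 becomes roughly:
  /// DW_OP_LLVM_arg 0, DW_OP_plus_uconst 8.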
6567 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6568 pushLocation(OffsetValue);
6569 DIExpression::appendOffset(Expr, Offset);
6570 LLVM_DEBUG(
6571 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6572 << std::to_string(Offset) << "\n");
6573 }
6574
6575 /// Combine a translation of the SCEV and the IV to create an expression that
6576 /// recovers a location's value.
6577 /// Returns true if an expression was created.
6578 bool createIterCountExpr(const SCEV *S,
6579 const SCEVDbgValueBuilder &IterationCount,
6580 ScalarEvolution &SE) {
6581 // SCEVs for SSA values are most frequently of the form
6582 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6583 // This is because %a is a PHI node that is not the IV. However, these
6584 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6585 // so it's not expected this point will be reached.
6586 if (!isa<SCEVAddRecExpr>(S))
6587 return false;
6588
6589 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6590 << '\n');
6591
6592 const auto *Rec = cast<SCEVAddRecExpr>(S);
6593 if (!Rec->isAffine())
6594 return false;
6595
6595
6596 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6597 return false;
6598
6599 // Initialise a new builder with the iteration count expression. In
6600 // combination with the value's SCEV this enables recovery.
6601 clone(IterationCount);
6602 if (!SCEVToValueExpr(*Rec, SE))
6603 return false;
6604
6605 return true;
6606 }
6607
6608 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6609 /// builder's expression stack. The stack should already contain an
6610 /// expression for the iteration count, so that it can be multiplied by
6611 /// the stride and added to the start.
6612 /// Components of the expression are omitted if they are an identity function.
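  /// For example, for an IV {%start,+,%stride} the iteration count is
  /// recovered as (IV - %start) / %stride, emitted on top of the IV location
  /// as: %start, DW_OP_minus, %stride, DW_OP_div.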
6613 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6614 ScalarEvolution &SE) {
6615 assert(SAR.isAffine() && "Expected affine SCEV");
6616 const SCEV *Start = SAR.getStart();
6617 const SCEV *Stride = SAR.getStepRecurrence(SE);
6618
6619 // Skip pushing arithmetic noops.
6620 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6621 if (!pushSCEV(Start))
6622 return false;
6623 pushOperator(llvm::dwarf::DW_OP_minus);
6624 }
6625 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6626 if (!pushSCEV(Stride))
6627 return false;
6628 pushOperator(llvm::dwarf::DW_OP_div);
6629 }
6630 return true;
6631 }
6632
6633 // Append the current expression and locations to a location list and an
6634 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6635 // the locations already present in the destination list.
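  // For example, if this builder's location 1 already appears at index 2 of
  // DestLocations, its "DW_OP_LLVM_arg 1" is rewritten to "DW_OP_LLVM_arg 2"
  // while being copied into DestExpr.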
6636 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6637 SmallVectorImpl<Value *> &DestLocations) {
6638 assert(!DestLocations.empty() &&
6639 "Expected the locations vector to contain the IV");
6640 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6641 // modified to account for the locations already in the destination vector.
6642 // All builders contain the IV as the first location op.
6643 assert(!LocationOps.empty() &&
6644 "Expected the location ops to contain the IV.");
6645 // DestIndexMap[n] contains the index in DestLocations for the nth
6646 // location in this SCEVDbgValueBuilder.
6647 SmallVector<uint64_t, 2> DestIndexMap;
6648 for (const auto &Op : LocationOps) {
6649 auto It = find(DestLocations, Op);
6650 if (It != DestLocations.end()) {
6651 // Location already exists in DestLocations, reuse existing ArgIndex.
6652 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6653 continue;
6654 }
6655 // Location is not in DestLocations, add it.
6656 DestIndexMap.push_back(DestLocations.size());
6657 DestLocations.push_back(Op);
6658 }
6659
6660 for (const auto &Op : expr_ops()) {
6661 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6662 Op.appendToVector(DestExpr);
6663 continue;
6664 }
6665
6666 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6667 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6668 // DestIndexMap[n] contains its new index in DestLocations.
6669 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6670 DestExpr.push_back(NewIndex);
6671 }
6672 }
6673};
6674
6675/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6676/// and DIExpression.
6677struct DVIRecoveryRec {
6678 DVIRecoveryRec(DbgVariableRecord *DVR)
6679 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6680
6681 DbgVariableRecord *DbgRef;
6682 DIExpression *Expr;
6683 bool HadLocationArgList;
6684 SmallVector<WeakVH, 2> LocationOps;
6685 SmallVector<const llvm::SCEV *, 2> SCEVs;
6686 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6687
6688 void clear() {
6689 for (auto &RE : RecoveryExprs)
6690 RE.reset();
6691 RecoveryExprs.clear();
6692 }
6693
6694 ~DVIRecoveryRec() { clear(); }
6695};
6696} // namespace
6697
6698/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6699/// This helps in determining if a DIArglist is necessary or can be omitted from
6700/// the dbg.value.
6701static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6702 auto expr_ops = ToDwarfOpIter(Expr);
6703 unsigned Count = 0;
6704 for (auto Op : expr_ops)
6705 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6706 Count++;
6707 return Count;
6708}
6709
6710/// Overwrites DVI with the location and Ops as the DIExpression. This will
6711/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6712/// because a DIArglist is not created for the first argument of the dbg.value.
6713template <typename T>
6714static void updateDVIWithLocation(T &DbgVal, Value *Location,
6715 SmallVectorImpl<uint64_t> &Ops) {
6716 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6717 "contain any DW_OP_llvm_arg operands.");
6718 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6719 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6721}
6722
6723/// Overwrite DVI with locations placed into a DIArglist.
6724template <typename T>
6725static void updateDVIWithLocations(T &DbgVal,
6726 SmallVectorImpl<Value *> &Locations,
6727 SmallVectorImpl<uint64_t> &Ops) {
6728 assert(numLLVMArgOps(Ops) != 0 &&
6729 "Expected expression that references DIArglist locations using "
6730 "DW_OP_llvm_arg operands.");
6732 for (Value *V : Locations)
6733 MetadataLocs.push_back(ValueAsMetadata::get(V));
6734 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6735 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6736 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6737}
6738
6739/// Write the new expression and new location ops for the dbg.value. If possible
6740/// reduce the size of the dbg.value by omitting DIArglist. This
6741/// can be omitted if:
6742/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6743/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
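/// For example, a salvaged expression (DW_OP_LLVM_arg 0, DW_OP_plus_uconst 8)
/// with a single location drops the leading DW_OP_LLVM_arg 0 and attaches that
/// location directly, so no DIArglist is needed.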
6744static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6745 SmallVectorImpl<Value *> &NewLocationOps,
6746 SmallVectorImpl<uint64_t> &NewExpr) {
6747 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6748 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6749 if (NumLLVMArgs == 0) {
6750 // Location assumed to be on the stack.
6751 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6752 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6753 // There is only a single DW_OP_llvm_arg at the start of the expression,
6754 // so it can be omitted along with DIArglist.
6755 assert(NewExpr[1] == 0 &&
6756 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6758 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6759 } else {
6760 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6761 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6762 }
6763
6764 // If the DIExpression was previously empty then add the stack terminator.
6765 // Non-empty expressions have only had elements inserted into them and so
6766 // the terminator should already be present e.g. stack_value or fragment.
6767 DIExpression *SalvageExpr = DbgVal->getExpression();
6768 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6769 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6770 DbgVal->setExpression(SalvageExpr);
6771 }
6772}
6773
6774/// Cached location ops may be erased during LSR, in which case a poison is
6775/// required when restoring from the cache. The type of that location is no
6776/// longer available, so just use int8. The poison will be replaced by one or
6777/// more locations later when a SCEVDbgValueBuilder selects alternative
6778/// locations to use for the salvage.
6780 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6781}
6782
6783/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6784static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6785 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6786 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6787 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6788 assert(DVIRec.Expr && "Expected an expression");
6789 DbgVal->setExpression(DVIRec.Expr);
6790
6791 // Even a single location-op may be inside a DIArgList and referenced with
6792 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6793 if (!DVIRec.HadLocationArgList) {
6794 assert(DVIRec.LocationOps.size() == 1 &&
6795 "Unexpected number of location ops.");
6796 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6797 // this case was not present before, so force the location back to a
6798 // single uncontained Value.
6799 Value *CachedValue =
6800 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6801 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6802 } else {
6804 for (WeakVH VH : DVIRec.LocationOps) {
6805 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6806 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6807 }
6808 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6809 DbgVal->setRawLocation(
6810 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6811 }
6812 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6813}
6814
6815static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6816 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6817 const SCEV *SCEVInductionVar,
6818 SCEVDbgValueBuilder IterCountExpr) {
6819
6820 if (!DVIRec.DbgRef->isKillLocation())
6821 return false;
6822
6823 // LSR may have caused several changes to the dbg.value in the failed salvage
6824 // attempt. So restore the DIExpression, the location ops and also the
6825 // location ops format, which is always DIArglist for multiple ops, but only
6826 // sometimes for a single op.
6827 restorePreTransformState(DVIRec);
6828
6829 // LocationOpIndexMap[i] will store the post-LSR location index of
6830 // the non-optimised out location at pre-LSR index i.
6831 SmallVector<int64_t, 2> LocationOpIndexMap;
6832 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6833 SmallVector<Value *, 2> NewLocationOps;
6834 NewLocationOps.push_back(LSRInductionVar);
6835
6836 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6837 WeakVH VH = DVIRec.LocationOps[i];
6838 // Place the locations not optimised out in the list first, avoiding
6839 // inserts later. The map is used to update the DIExpression's
6840 // DW_OP_LLVM_arg arguments as the expression is updated.
6841 if (VH && !isa<UndefValue>(VH)) {
6842 NewLocationOps.push_back(VH);
6843 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6844 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6845 << " now at index " << LocationOpIndexMap[i] << "\n");
6846 continue;
6847 }
6848
6849 // It's possible that a value referred to in the SCEV may have been
6850 // optimised out by LSR.
6851 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6852 SE.containsUndefs(DVIRec.SCEVs[i])) {
6853 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6854 << " refers to a location that is now undef or erased. "
6855 "Salvage abandoned.\n");
6856 return false;
6857 }
6858
6859 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6860 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6861
6862 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6863 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6864
6865 // Create an offset-based salvage expression if possible, as it requires
6866 // less DWARF ops than an iteration count-based expression.
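    // For example, if the cached SCEV is {(%start + 4),+,%stride} and the
    // post-LSR IV is {%start,+,%stride}, the constant difference 4 lets the
    // location be described as IV + 4 rather than being rebuilt from the
    // iteration count.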
6867 if (std::optional<APInt> Offset =
6868 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6869 if (Offset->getSignificantBits() <= 64)
6870 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6871 else
6872 return false;
6873 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6874 SE))
6875 return false;
6876 }
6877
6878 // Merge the DbgValueBuilder generated expressions and the original
6879 // DIExpression, place the result into a new vector.
6880 SmallVector<uint64_t, 64> NewExpr;
6881 if (DVIRec.Expr->getNumElements() == 0) {
6882 assert(DVIRec.RecoveryExprs.size() == 1 &&
6883 "Expected only a single recovery expression for an empty "
6884 "DIExpression.");
6885 assert(DVIRec.RecoveryExprs[0] &&
6886 "Expected a SCEVDbgSalvageBuilder for location 0");
6887 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6888 B->appendToVectors(NewExpr, NewLocationOps);
6889 }
6890 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6891 // Most Ops needn't be updated.
6892 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6893 Op.appendToVector(NewExpr);
6894 continue;
6895 }
6896
6897 uint64_t LocationArgIndex = Op.getArg(0);
6898 SCEVDbgValueBuilder *DbgBuilder =
6899 DVIRec.RecoveryExprs[LocationArgIndex].get();
6900 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6901 // optimise it away. So just translate the argument to the updated
6902 // location index.
6903 if (!DbgBuilder) {
6904 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6905 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6906 "Expected a positive index for the location-op position.");
6907 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6908 continue;
6909 }
6910 // The location has a recovery expression.
6911 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6912 }
6913
6914 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
6915 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
6916 return true;
6917}
6918
6919/// Obtain an expression for the iteration count, then attempt to salvage the
6920/// dbg.value intrinsics.
6921static void DbgRewriteSalvageableDVIs(
6922 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6923 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6924 if (DVIToUpdate.empty())
6925 return;
6926
6927 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6928 assert(SCEVInductionVar &&
6929 "Anticipated a SCEV for the post-LSR induction variable");
6930
6931 if (const SCEVAddRecExpr *IVAddRec =
6932 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6933 if (!IVAddRec->isAffine())
6934 return;
6935
6936 // Prevent translation using excessive resources.
6937 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6938 return;
6939
6940 // The iteration count is required to recover location values.
6941 SCEVDbgValueBuilder IterCountExpr;
6942 IterCountExpr.pushLocation(LSRInductionVar);
6943 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6944 return;
6945
6946 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6947 << '\n');
6948
6949 for (auto &DVIRec : DVIToUpdate) {
6950 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6951 IterCountExpr);
6952 }
6953 }
6954}
6955
6956/// Identify and cache salvageable DVI locations and expressions along with the
6957/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6958/// caching and salvaging.
6959static void DbgGatherSalvagableDVI(
6960 Loop *L, ScalarEvolution &SE,
6961 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6962 for (const auto &B : L->getBlocks()) {
6963 for (auto &I : *B) {
6964 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6965 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6966 continue;
6967
6968 // Ensure that if any location op is undef the dbg.value is not
6969 // cached.
6970 if (DbgVal.isKillLocation())
6971 continue;
6972
6973 // Check that the location op SCEVs are suitable for translation to
6974 // DIExpression.
6975 const auto &HasTranslatableLocationOps =
6976 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6977 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6978 if (!LocOp)
6979 return false;
6980
6981 if (!SE.isSCEVable(LocOp->getType()))
6982 return false;
6983
6984 const SCEV *S = SE.getSCEV(LocOp);
6985 if (SE.containsUndefs(S))
6986 return false;
6987 }
6988 return true;
6989 };
6990
6991 if (!HasTranslatableLocationOps(DbgVal))
6992 continue;
6993
6994 std::unique_ptr<DVIRecoveryRec> NewRec =
6995 std::make_unique<DVIRecoveryRec>(&DbgVal);
6996 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6997 // it. Pre-allocating a vector will enable quick lookups of the builder
6998 // later during the salvage.
6999 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
7000 for (const auto LocOp : DbgVal.location_ops()) {
7001 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7002 NewRec->LocationOps.push_back(LocOp);
7003 NewRec->HadLocationArgList = DbgVal.hasArgList();
7004 }
7005 SalvageableDVISCEVs.push_back(std::move(NewRec));
7006 }
7007 }
7008 }
7009}
7010
7011/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7012/// any PHI from the loop header is usable, but may have less chance of
7013/// surviving subsequent transforms.
7014static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
7015 const LSRInstance &LSR) {
7016
7017 auto IsSuitableIV = [&](PHINode *P) {
7018 if (!SE.isSCEVable(P->getType()))
7019 return false;
7020 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7021 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7022 return false;
7023 };
7024
7025 // For now, just pick the first IV that was generated and inserted by
7026 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7027 // by subsequent transforms.
7028 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7029 if (!IV)
7030 continue;
7031
7032 // There should only be PHI node IVs.
7033 PHINode *P = cast<PHINode>(&*IV);
7034
7035 if (IsSuitableIV(P))
7036 return P;
7037 }
7038
7039 for (PHINode &P : L.getHeader()->phis()) {
7040 if (IsSuitableIV(&P))
7041 return &P;
7042 }
7043 return nullptr;
7044}
7045
7046static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7047 DominatorTree &DT, LoopInfo &LI,
7048 const TargetTransformInfo &TTI,
7049 AssumptionCache &AC, TargetLibraryInfo &TLI,
7050 MemorySSA *MSSA) {
7051
7052 // Debug preservation - before we start removing anything identify which DVI
7053 // meet the salvageable criteria and store their DIExpression and SCEVs.
7054 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7055 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7056
7057 bool Changed = false;
7058 std::unique_ptr<MemorySSAUpdater> MSSAU;
7059 if (MSSA)
7060 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7061
7062 // Run the main LSR transformation.
7063 const LSRInstance &Reducer =
7064 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7065 Changed |= Reducer.getChanged();
7066
7067 // Remove any extra phis created by processing inner loops.
7068 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7069 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7071 SCEVExpander Rewriter(SE, "lsr", false);
7072#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7073 Rewriter.setDebugType(DEBUG_TYPE);
7074#endif
7075 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7076 Rewriter.clear();
7077 if (numFolded) {
7078 Changed = true;
7080 MSSAU.get());
7081 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7082 }
7083 }
7084 // LSR may at times remove all uses of an induction variable from a loop.
7085 // The only remaining use is the PHI in the exit block.
7086 // When this is the case, if the exit value of the IV can be calculated using
7087 // SCEV, we can replace the exit block PHI with the final value of the IV and
7088 // skip the updates in each loop iteration.
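  // For example, if the only surviving use of %iv is an LCSSA phi in the exit
  // block, that phi can be rewritten to the IV's computed final value
  // (roughly start + trip-count * step), letting the in-loop increment die.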
7089 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7091 SCEVExpander Rewriter(SE, "lsr", true);
7092 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7093 UnusedIndVarInLoop, DeadInsts);
7094 Rewriter.clear();
7095 if (Rewrites) {
7096 Changed = true;
7098 MSSAU.get());
7099 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7100 }
7101 }
7102
7103 if (SalvageableDVIRecords.empty())
7104 return Changed;
7105
7106 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7107 // expressions composed using the derived iteration count.
7108 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7109 for (const auto &L : LI) {
7110 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7111 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7112 else {
7113 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7114 "could not be identified.\n");
7115 }
7116 }
7117
7118 for (auto &Rec : SalvageableDVIRecords)
7119 Rec->clear();
7120 SalvageableDVIRecords.clear();
7121 return Changed;
7122}
7123
7124bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7125 if (skipLoop(L))
7126 return false;
7127
7128 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7129 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7130 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7131 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7132 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7133 *L->getHeader()->getParent());
7134 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7135 *L->getHeader()->getParent());
7136 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7137 *L->getHeader()->getParent());
7138 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7139 MemorySSA *MSSA = nullptr;
7140 if (MSSAAnalysis)
7141 MSSA = &MSSAAnalysis->getMSSA();
7142 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7143}
7144
7145PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7146 LoopStandardAnalysisResults &AR,
7147 LPMUpdater &) {
7148 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7149 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7150 return PreservedAnalyses::all();
7151
7152 auto PA = getLoopPassPreservedAnalyses();
7153 if (AR.MSSA)
7154 PA.preserve<MemorySSAAnalysis>();
7155 return PA;
7156}
7157
7158char LoopStrengthReduce::ID = 0;
7159
7160INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7161 "Loop Strength Reduction", false, false)
7167INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7168INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7169 "Loop Strength Reduction", false, false)
7170
7171Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1644
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1532
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1736
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
bool isUnconditional() const
Value * getCondition() const
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:321
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
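A minimal sketch combining the Instruction hooks above: relocate an instruction next to a chosen insertion point and adopt that point's debug location, one possible policy for avoiding misleading source stepping (not necessarily the pass's exact one):

#include "llvm/IR/Instruction.h"

using namespace llvm;

static void moveAndRehomeDebugLoc(Instruction *I, Instruction *InsertPt) {
  // moveBefore unlinks I from its current block and reinserts it immediately
  // before the position denoted by the iterator.
  I->moveBefore(InsertPt->getIterator());
  I->setDebugLoc(InsertPt->getDebugLoc());
}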
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
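A minimal sketch of building a fresh induction-variable PHI with the PHINode interface above, assuming a loop header with a single preheader and latch (all names are illustrative):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static PHINode *createIVPhi(BasicBlock *Header, BasicBlock *Preheader,
                            BasicBlock *Latch, Value *Start, Value *Next) {
  // Reserve two incoming edges: the initial value from the preheader and the
  // stepped value coming back around from the latch.
  PHINode *PN = PHINode::Create(Start->getType(), /*NumReservedValues=*/2,
                                "iv", Header->begin());
  PN->addIncoming(Start, Preheader);
  PN->addIncoming(Next, Latch);
  return PN;
}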
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyze scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< const SCEV * > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
LLVM_ABI ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
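A minimal sketch of classifying a SCEV with the interfaces above: check that S is an affine recurrence of loop L and pull out its start and per-iteration stride (the helper itself is illustrative):

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"

using namespace llvm;

static bool getAffineStartAndStep(const SCEV *S, const Loop *L,
                                  ScalarEvolution &SE, const SCEV *&Start,
                                  const SCEV *&Step) {
  const auto *AR = dyn_cast<SCEVAddRecExpr>(S);
  if (!AR || AR->getLoop() != L || !AR->isAffine())
    return false;
  // For {Start,+,Step}<L>: operand 0 is the start, and getStepRecurrence
  // returns the stride added on every backedge.
  Start = AR->getStart();
  Step = AR->getStepRecurrence(SE);
  return true;
}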
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
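A minimal sketch of using the ScalarEvolution queries above to build an affine IV expression and to compare two IV expressions for a constant distance; the wrap flag choice is illustrative, and stronger no-wrap flags would need separate proof:

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include <optional>

using namespace llvm;

static const SCEV *makeAffineIV(ScalarEvolution &SE, const SCEV *Start,
                                const SCEV *Step, const Loop *L) {
  // {Start,+,Step}<L> with no overflow claims.
  return SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
}

static bool differByConstant(ScalarEvolution &SE, const SCEV *A, const SCEV *B,
                             APInt &Delta) {
  // A - B as a compile-time constant, if SE can prove one exists.
  if (std::optional<APInt> D = SE.computeConstantDifference(A, B)) {
    Delta = *D;
    return true;
  }
  return false;
}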
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
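A minimal sketch of the SmallBitVector usage pattern above: record which candidate indices were kept and then iterate only those (the indices chosen here are arbitrary):

#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void visitKept(unsigned NumCandidates) {
  SmallBitVector Kept;
  Kept.resize(NumCandidates);           // all bits start out clear
  for (unsigned I = 0; I < NumCandidates; I += 2)
    Kept.set(I);                        // arbitrary selection for illustration
  for (unsigned Idx : Kept.set_bits())  // visits only the set indices
    errs() << "candidate #" << Idx << " kept\n";
}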
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
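A minimal sketch of the central legality query above: before folding a base global, base register, scaled index, and immediate offset into one memory access, ask the target whether that addressing mode exists. Parameter names mirror the query; the wrapper itself is illustrative:

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

static bool formulaFitsAddressing(const TargetTransformInfo &TTI,
                                  Type *AccessTy, GlobalValue *BaseGV,
                                  int64_t Offset, bool HasBaseReg,
                                  int64_t Scale, unsigned AddrSpace) {
  return TTI.isLegalAddressingMode(AccessTy, BaseGV, Offset, HasBaseReg,
                                   Scale, AddrSpace);
}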
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:235
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Use * op_iterator
Definition User.h:280
op_range operands()
Definition User.h:293
op_iterator op_begin()
Definition User.h:285
void setOperand(unsigned i, Value *Val)
Definition User.h:238
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:233
op_iterator op_end()
Definition User.h:287
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:503
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
iterator_range< use_iterator > uses()
Definition Value.h:380
A Value handle that may be null.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
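A minimal sketch of the SCEV pattern matchers above, assuming they come from the llvm/Analysis/ScalarEvolutionPatternMatch.h header: recognize an affine recurrence whose stride is a compile-time constant and bind that stride:

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"

using namespace llvm;
using namespace llvm::SCEVPatternMatch;

static bool hasConstantStride(const SCEV *S, const APInt *&Stride) {
  // Matches {anything,+,C}<any loop> and binds the constant C to Stride.
  return match(S, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(Stride)));
}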
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1731
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2114
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:202
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
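A minimal sketch of the normalization helpers above: normalize an expression for post-increment use in loop L and then denormalize it again; the two are inverses when the normalization is invertible, and the helper name here is illustrative:

#include "llvm/Analysis/ScalarEvolutionNormalization.h"

using namespace llvm;

static const SCEV *roundTrip(const SCEV *S, const Loop *L,
                             ScalarEvolution &SE) {
  PostIncLoopSet Loops;
  Loops.insert(L);
  // With CheckInvertible left at its default, normalization bails out
  // (returns null) when denormalizing would not reproduce the original.
  if (const SCEV *N = normalizeForPostIncUse(S, Loops, SE))
    return denormalizeForPostIncUse(N, Loops, SE);
  return nullptr;
}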
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:548
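A minimal sketch of the cleanup idiom above: collect possibly-dead values as weak handles while rewriting, then delete whatever is still trivially dead afterwards (the wrapper itself is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

static bool cleanupDeadInsts(SmallVectorImpl<WeakTrackingVH> &DeadInsts,
                             const TargetLibraryInfo *TLI) {
  // Handles whose values were already erased or RAUW'd away are skipped by
  // the permissive variant rather than asserted on.
  return RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI);
}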
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
@ UnusedIndVarInLoop
Definition LoopUtils.h:552
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.