LLVM 23.0.0git
LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs strength reduction on array references inside loops that
14// have the loop induction variable as one of their components; it rewrites
15// expressions to take advantage of scaled-index addressing modes available on
16// the target, and it performs a variety of other optimizations related to
17// loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
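// Illustrative sketch (not part of this file; names are hypothetical): for the
// loop above, a use of %i.next in the exit block is also a post-increment
// user, e.g.
//   exit:
//     %len = sub %n, %i.next   ; sees the IV value after its final increment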
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow a complex LSR solution.
157static cl::opt<bool> LSRExpNarrow(
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
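// Illustrative snippet (assumed usage, not part of this file): how MemAccessTy
// values are meant to be formed and compared. An unknown access is keyed by a
// void placeholder type plus an address space, and two accesses compare equal
// only when both the memory type and the address space match.
//   MemAccessTy A(Type::getInt32Ty(Ctx), /*AS=*/0);
//   MemAccessTy B = MemAccessTy::getUnknown(Ctx);  // void type, max AS
//   bool Same = (A == B);                          // false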
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
244 void print(raw_ostream &OS) const;
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
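// A minimal sketch, assuming only the Immediate API above, of how fixed and
// vscale-relative offsets behave: non-zero fixed and scalable values are not
// compatible, and scaling preserves the scalable flag. The helper name below
// is hypothetical and not part of LSR.
static Immediate exampleImmediateUsage() {
  Immediate Fixed = Immediate::getFixed(16);      // a plain 16-byte offset
  Immediate Scalable = Immediate::getScalable(8); // 8 * vscale bytes
  // Mixing a non-zero fixed offset with a non-zero scalable one is rejected.
  bool Compatible = Fixed.isCompatibleImmediate(Scalable); // false here
  (void)Compatible;
  // mulUnsigned keeps the scalable flag: 8 * vscale becomes 32 * vscale.
  return Scalable.mulUnsigned(4);
}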
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
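// Illustrative usage (assumed, not from this file): these comparators are
// intended as the Compare type of std::map keyed by Immediate (or by a size_t
// and an Immediate), so that all fixed offsets order before the
// vscale-relative ones, e.g.:
//   std::map<Immediate, unsigned, KeyOrderTargetImmediate> OffsetCounts;
//   OffsetCounts[Immediate::getFixed(8)] = 1;     // sorts first
//   OffsetCounts[Immediate::getScalable(4)] = 2;  // sorts after fixed keys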
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
390 SmallVector<const SCEV *, 16> RegSequence;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
403 using iterator = SmallVectorImpl<const SCEV *>::iterator;
404 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing a
474/// value satisfying a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty. The
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing recurrent expr related with currect loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
502 SmallVector<const SCEV *, 4> BaseRegs;
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which is added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
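// Illustrative note (not from this file): a Formula denotes the expression
//   BaseGV + BaseOffset + Sum(BaseRegs) + Scale * ScaledReg + UnfoldedOffset,
// so an access like gv[4 + i + 2*j] could be represented with BaseGV = @gv,
// BaseOffset = 4, BaseRegs = { i }, Scale = 2 and ScaledReg = j (the names
// here are hypothetical). Canonicalization below guarantees that whenever a
// formula needs two or more registers, one of them is carried in ScaledReg.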
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch.
543static void DoInitialMatch(const SCEV *S, Loop *L,
546 ScalarEvolution &SE) {
547 // Collect expressions which properly dominate the loop header.
548 if (SE.properlyDominates(S, L->getHeader())) {
549 Good.push_back(S);
550 return;
551 }
552
553 // Look at add operands.
554 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
555 for (const SCEV *S : Add->operands())
556 DoInitialMatch(S, L, Good, Bad, SE);
557 return;
558 }
559
560 // Look at addrec operands.
561 const SCEV *Start, *Step;
562 const Loop *ARLoop;
563 if (match(S,
564 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
565 !Start->isZero()) {
566 DoInitialMatch(Start, L, Good, Bad, SE);
567 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
568 // FIXME: AR->getNoWrapFlags()
569 ARLoop, SCEV::FlagAnyWrap),
570 L, Good, Bad, SE);
571 return;
572 }
573
574 // Handle a multiplication by -1 (negation) if it didn't fold.
575 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
576 if (Mul->getOperand(0)->isAllOnesValue()) {
577 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
578 const SCEV *NewMul = SE.getMulExpr(Ops);
579
580 SmallVector<const SCEV *, 4> MyGood;
581 SmallVector<const SCEV *, 4> MyBad;
582 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
583 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
584 SE.getEffectiveSCEVType(NewMul->getType())));
585 for (const SCEV *S : MyGood)
586 Good.push_back(SE.getMulExpr(NegOne, S));
587 for (const SCEV *S : MyBad)
588 Bad.push_back(SE.getMulExpr(NegOne, S));
589 return;
590 }
591
592 // Ok, we can't do anything interesting. Just stuff the whole thing into a
593 // register and hope for the best.
594 Bad.push_back(S);
595}
596
597/// Incorporate loop-variant parts of S into this Formula, attempting to keep
598/// all loop-invariant and loop-computable values in a single base register.
599void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
600 SmallVector<const SCEV *, 4> Good;
601 SmallVector<const SCEV *, 4> Bad;
602 DoInitialMatch(S, L, Good, Bad, SE);
603 if (!Good.empty()) {
604 const SCEV *Sum = SE.getAddExpr(Good);
605 if (!Sum->isZero())
606 BaseRegs.push_back(Sum);
607 HasBaseReg = true;
608 }
609 if (!Bad.empty()) {
610 const SCEV *Sum = SE.getAddExpr(Bad);
611 if (!Sum->isZero())
612 BaseRegs.push_back(Sum);
613 HasBaseReg = true;
614 }
615 canonicalize(*L);
616}
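// Worked example (assumed, not from this file): for S = {%base,+,4}<%L> with
// %base loop-invariant, DoInitialMatch puts %base into Good (it properly
// dominates the loop header) and the stripped recurrence {0,+,4}<%L> into
// Bad, so the initial formula carries two base registers:
//   reg(%base) + reg({0,+,4}<%L>)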
617
618static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
619 return SCEVExprContains(S, [&L](const SCEV *S) {
620 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
621 });
622}
623
624/// Check whether or not this formula satisfies the canonical
625/// representation.
626/// \see Formula::BaseRegs.
627bool Formula::isCanonical(const Loop &L) const {
628 assert((Scale == 0 || ScaledReg) &&
629 "ScaledReg must be non-null if Scale is non-zero");
630
631 if (!ScaledReg)
632 return BaseRegs.size() <= 1;
633
634 if (Scale != 1)
635 return true;
636
637 if (Scale == 1 && BaseRegs.empty())
638 return false;
639
640 if (containsAddRecDependentOnLoop(ScaledReg, L))
641 return true;
642
643 // If ScaledReg is not a recurrent expr, or it is but its loop is not the
644 // current loop, while BaseRegs contains a recurrent expr reg related to the
645 // current loop, we want to swap the reg in BaseRegs with ScaledReg.
646 return none_of(BaseRegs, [&L](const SCEV *S) {
647 return containsAddRecDependentOnLoop(S, L);
648 });
649}
650
651/// Helper method to morph a formula into its canonical representation.
652/// \see Formula::BaseRegs.
653/// Every formula having more than one base register must use the ScaledReg
654/// field. Otherwise, we would have to do special cases everywhere in LSR
655/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
656/// On the other hand, 1*reg should be canonicalized into reg.
657void Formula::canonicalize(const Loop &L) {
658 if (isCanonical(L))
659 return;
660
661 if (BaseRegs.empty()) {
662 // No base reg? Use scale reg with scale = 1 as such.
663 assert(ScaledReg && "Expected 1*reg => reg");
664 assert(Scale == 1 && "Expected 1*reg => reg");
665 BaseRegs.push_back(ScaledReg);
666 Scale = 0;
667 ScaledReg = nullptr;
668 return;
669 }
670
671 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
672 if (!ScaledReg) {
673 ScaledReg = BaseRegs.pop_back_val();
674 Scale = 1;
675 }
676
677 // If ScaledReg is an invariant with respect to L, find the reg from
678 // BaseRegs containing the recurrent expr related with Loop L. Swap the
679 // reg with ScaledReg.
680 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
681 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 return containsAddRecDependentOnLoop(S, L);
683 });
684 if (I != BaseRegs.end())
685 std::swap(ScaledReg, *I);
686 }
687 assert(isCanonical(L) && "Failed to canonicalize?");
688}
689
690/// Get rid of the scale in the formula.
691/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
692/// \return true if it was possible to get rid of the scale, false otherwise.
693/// \note After this operation the formula may not be in the canonical form.
694bool Formula::unscale() {
695 if (Scale != 1)
696 return false;
697 Scale = 0;
698 BaseRegs.push_back(ScaledReg);
699 ScaledReg = nullptr;
700 return true;
701}
702
703bool Formula::hasZeroEnd() const {
704 if (UnfoldedOffset || BaseOffset)
705 return false;
706 if (BaseRegs.size() != 1 || ScaledReg)
707 return false;
708 return true;
709}
710
711bool Formula::countsDownToZero() const {
712 if (!hasZeroEnd())
713 return false;
714 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
715 const APInt *StepInt;
716 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
717 return false;
718 return StepInt->isNegative();
719}
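// Illustrative note (assumed): a formula whose only base register is
// {%n,+,-1}<%L>, with no offsets and no scaled register, counts down to zero;
// {0,+,1}<%L> does not, because only a negative constant step passes the
// check above.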
720
721/// Return the total number of register operands used by this formula. This does
722/// not include register uses implied by non-constant addrec strides.
723size_t Formula::getNumRegs() const {
724 return !!ScaledReg + BaseRegs.size();
725}
726
727/// Return the type of this formula, if it has one, or null otherwise. This type
728/// is meaningless except for the bit size.
729Type *Formula::getType() const {
730 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
731 ScaledReg ? ScaledReg->getType() :
732 BaseGV ? BaseGV->getType() :
733 nullptr;
734}
735
736/// Delete the given base reg from the BaseRegs list.
737void Formula::deleteBaseReg(const SCEV *&S) {
738 if (&S != &BaseRegs.back())
739 std::swap(S, BaseRegs.back());
740 BaseRegs.pop_back();
741}
742
743/// Test if this formula references the given register.
744bool Formula::referencesReg(const SCEV *S) const {
745 return S == ScaledReg || is_contained(BaseRegs, S);
746}
747
748/// Test whether this formula uses registers which are used by uses other than
749/// the use with the given index.
750bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
751 const RegUseTracker &RegUses) const {
752 if (ScaledReg)
753 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
754 return true;
755 for (const SCEV *BaseReg : BaseRegs)
756 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
757 return true;
758 return false;
759}
760
761#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
762void Formula::print(raw_ostream &OS) const {
763 ListSeparator Plus(" + ");
764 if (BaseGV) {
765 OS << Plus;
766 BaseGV->printAsOperand(OS, /*PrintType=*/false);
767 }
768 if (BaseOffset.isNonZero())
769 OS << Plus << BaseOffset;
770
771 for (const SCEV *BaseReg : BaseRegs)
772 OS << Plus << "reg(" << *BaseReg << ')';
773
774 if (HasBaseReg && BaseRegs.empty())
775 OS << Plus << "**error: HasBaseReg**";
776 else if (!HasBaseReg && !BaseRegs.empty())
777 OS << Plus << "**error: !HasBaseReg**";
778
779 if (Scale != 0) {
780 OS << Plus << Scale << "*reg(";
781 if (ScaledReg)
782 OS << *ScaledReg;
783 else
784 OS << "<unknown>";
785 OS << ')';
786 }
787 if (UnfoldedOffset.isNonZero())
788 OS << Plus << "imm(" << UnfoldedOffset << ')';
789}
790
791LLVM_DUMP_METHOD void Formula::dump() const {
792 print(errs()); errs() << '\n';
793}
794#endif
795
796/// Return true if the given addrec can be sign-extended without changing its
797/// value.
798static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
799 Type *WideTy =
800 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
801 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
802}
803
804/// Return true if the given add can be sign-extended without changing its
805/// value.
806static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
807 Type *WideTy =
808 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
809 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
810}
811
812/// Return true if the given mul can be sign-extended without changing its
813/// value.
814static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
815 Type *WideTy =
816 IntegerType::get(SE.getContext(),
817 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
818 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
819}
820
821/// Return an expression for LHS /s RHS, if it can be determined and if the
822/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
823/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
824/// the multiplication may overflow, which is useful when the result will be
825/// used in a context where the most significant bits are ignored.
826static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
827 ScalarEvolution &SE,
828 bool IgnoreSignificantBits = false) {
829 // Handle the trivial case, which works for any SCEV type.
830 if (LHS == RHS)
831 return SE.getConstant(LHS->getType(), 1);
832
833 // Handle a few RHS special cases.
834 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
835 if (RC) {
836 const APInt &RA = RC->getAPInt();
837 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
838 // some folding.
839 if (RA.isAllOnes()) {
840 if (LHS->getType()->isPointerTy())
841 return nullptr;
842 return SE.getMulExpr(LHS, RC);
843 }
844 // Handle x /s 1 as x.
845 if (RA == 1)
846 return LHS;
847 }
848
849 // Check for a division of a constant by a constant.
850 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
851 if (!RC)
852 return nullptr;
853 const APInt &LA = C->getAPInt();
854 const APInt &RA = RC->getAPInt();
855 if (LA.srem(RA) != 0)
856 return nullptr;
857 return SE.getConstant(LA.sdiv(RA));
858 }
859
860 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
861 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
862 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
863 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
864 IgnoreSignificantBits);
865 if (!Step) return nullptr;
866 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
867 IgnoreSignificantBits);
868 if (!Start) return nullptr;
869 // FlagNW is independent of the start value, step direction, and is
870 // preserved with smaller magnitude steps.
871 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
872 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
873 }
874 return nullptr;
875 }
876
877 // Distribute the sdiv over add operands, if the add doesn't overflow.
878 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
879 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
880 SmallVector<const SCEV *, 4> Ops;
881 for (const SCEV *S : Add->operands()) {
882 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
883 if (!Op) return nullptr;
884 Ops.push_back(Op);
885 }
886 return SE.getAddExpr(Ops);
887 }
888 return nullptr;
889 }
890
891 // Check for a multiply operand that we can pull RHS out of.
892 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
893 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
894 // Handle special case C1*X*Y /s C2*X*Y.
895 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
896 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
897 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
898 const SCEVConstant *RC =
899 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
900 if (LC && RC) {
901 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
902 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
903 if (LOps == ROps)
904 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
905 }
906 }
907 }
908
909 SmallVector<const SCEV *, 4> Ops;
910 bool Found = false;
911 for (const SCEV *S : Mul->operands()) {
912 if (!Found)
913 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
914 IgnoreSignificantBits)) {
915 S = Q;
916 Found = true;
917 }
918 Ops.push_back(S);
919 }
920 return Found ? SE.getMulExpr(Ops) : nullptr;
921 }
922 return nullptr;
923 }
924
925 // Otherwise we don't know.
926 return nullptr;
927}
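// Worked example (assumed, not from this file): dividing the addrec
// {8,+,4}<%L> by the constant 4 distributes over start and step and yields
// {2,+,1}<%L>, while dividing it by 3 returns null because 8 srem 3 != 0, so
// the division would not be exact.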
928
929/// If S involves the addition of a constant integer value, return that integer
930/// value, and mutate S to point to a new SCEV with that value excluded.
931static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
932 const APInt *C;
933 if (match(S, m_scev_APInt(C))) {
934 if (C->getSignificantBits() <= 64) {
935 S = SE.getConstant(S->getType(), 0);
936 return Immediate::getFixed(C->getSExtValue());
937 }
938 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
939 SmallVector<const SCEV *, 8> NewOps(Add->operands());
940 Immediate Result = ExtractImmediate(NewOps.front(), SE);
941 if (Result.isNonZero())
942 S = SE.getAddExpr(NewOps);
943 return Result;
944 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
945 SmallVector<const SCEV *, 8> NewOps(AR->operands());
946 Immediate Result = ExtractImmediate(NewOps.front(), SE);
947 if (Result.isNonZero())
948 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
949 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
950 SCEV::FlagAnyWrap);
951 return Result;
952 } else if (EnableVScaleImmediates &&
954 S = SE.getConstant(S->getType(), 0);
955 return Immediate::getScalable(C->getSExtValue());
956 }
957 return Immediate::getZero();
958}
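// Worked example (assumed): for S = (4 + %p), ExtractImmediate returns the
// fixed immediate 4 and mutates S to %p; for S = {(4 + %p),+,8}<%L> the 4 is
// peeled off the addrec start in the same way, leaving {%p,+,8}<%L>.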
959
960/// If S involves the addition of a GlobalValue address, return that symbol, and
961/// mutate S to point to a new SCEV with that value excluded.
962static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
963 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
964 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
965 S = SE.getConstant(GV->getType(), 0);
966 return GV;
967 }
968 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
969 SmallVector<const SCEV *, 8> NewOps(Add->operands());
970 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
971 if (Result)
972 S = SE.getAddExpr(NewOps);
973 return Result;
974 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
975 SmallVector<const SCEV *, 8> NewOps(AR->operands());
976 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
977 if (Result)
978 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
979 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
980 SCEV::FlagAnyWrap);
981 return Result;
982 }
983 return nullptr;
984}
985
986/// Returns true if the specified instruction is using the specified value as an
987/// address.
988static bool isAddressUse(const TargetTransformInfo &TTI,
989 Instruction *Inst, Value *OperandVal) {
990 bool isAddress = isa<LoadInst>(Inst);
991 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
992 if (SI->getPointerOperand() == OperandVal)
993 isAddress = true;
994 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
995 // Addressing modes can also be folded into prefetches and a variety
996 // of intrinsics.
997 switch (II->getIntrinsicID()) {
998 case Intrinsic::memset:
999 case Intrinsic::prefetch:
1000 case Intrinsic::masked_load:
1001 if (II->getArgOperand(0) == OperandVal)
1002 isAddress = true;
1003 break;
1004 case Intrinsic::masked_store:
1005 if (II->getArgOperand(1) == OperandVal)
1006 isAddress = true;
1007 break;
1008 case Intrinsic::memmove:
1009 case Intrinsic::memcpy:
1010 if (II->getArgOperand(0) == OperandVal ||
1011 II->getArgOperand(1) == OperandVal)
1012 isAddress = true;
1013 break;
1014 default: {
1015 MemIntrinsicInfo IntrInfo;
1016 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1017 if (IntrInfo.PtrVal == OperandVal)
1018 isAddress = true;
1019 }
1020 }
1021 }
1022 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1023 if (RMW->getPointerOperand() == OperandVal)
1024 isAddress = true;
1025 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1026 if (CmpX->getPointerOperand() == OperandVal)
1027 isAddress = true;
1028 }
1029 return isAddress;
1030}
1031
1032/// Return the type of the memory being accessed.
1033static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1034 Instruction *Inst, Value *OperandVal) {
1035 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1036
1037 // First get the type of memory being accessed.
1038 if (Type *Ty = Inst->getAccessType())
1039 AccessTy.MemTy = Ty;
1040
1041 // Then get the pointer address space.
1042 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1043 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1044 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1045 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1046 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1047 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1048 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1049 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1050 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1051 switch (II->getIntrinsicID()) {
1052 case Intrinsic::prefetch:
1053 case Intrinsic::memset:
1054 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1055 AccessTy.MemTy = OperandVal->getType();
1056 break;
1057 case Intrinsic::memmove:
1058 case Intrinsic::memcpy:
1059 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1060 AccessTy.MemTy = OperandVal->getType();
1061 break;
1062 case Intrinsic::masked_load:
1063 AccessTy.AddrSpace =
1064 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1065 break;
1066 case Intrinsic::masked_store:
1067 AccessTy.AddrSpace =
1068 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1069 break;
1070 default: {
1071 MemIntrinsicInfo IntrInfo;
1072 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1073 AccessTy.AddrSpace
1074 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1075 }
1076
1077 break;
1078 }
1079 }
1080 }
1081
1082 return AccessTy;
1083}
1084
1085/// Return true if this AddRec is already a phi in its loop.
1086static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1087 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1088 if (SE.isSCEVable(PN.getType()) &&
1089 (SE.getEffectiveSCEVType(PN.getType()) ==
1090 SE.getEffectiveSCEVType(AR->getType())) &&
1091 SE.getSCEV(&PN) == AR)
1092 return true;
1093 }
1094 return false;
1095}
1096
1097/// Check if expanding this expression is likely to incur significant cost. This
1098/// is tricky because SCEV doesn't track which expressions are actually computed
1099/// by the current IR.
1100///
1101/// We currently allow expansion of IV increments that involve adds,
1102/// multiplication by constants, and AddRecs from existing phis.
1103///
1104/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1105/// obvious multiple of the UDivExpr.
1106static bool isHighCostExpansion(const SCEV *S,
1108 ScalarEvolution &SE) {
1109 // Zero/One operand expressions
1110 switch (S->getSCEVType()) {
1111 case scUnknown:
1112 case scConstant:
1113 case scVScale:
1114 return false;
1115 case scTruncate:
1116 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1117 Processed, SE);
1118 case scZeroExtend:
1119 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1120 Processed, SE);
1121 case scSignExtend:
1122 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1123 Processed, SE);
1124 default:
1125 break;
1126 }
1127
1128 if (!Processed.insert(S).second)
1129 return false;
1130
1131 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1132 for (const SCEV *S : Add->operands()) {
1133 if (isHighCostExpansion(S, Processed, SE))
1134 return true;
1135 }
1136 return false;
1137 }
1138
1139 const SCEV *Op0, *Op1;
1140 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1141 // Multiplication by a constant is ok
1142 if (isa<SCEVConstant>(Op0))
1143 return isHighCostExpansion(Op1, Processed, SE);
1144
1145 // If we have the value of one operand, check if an existing
1146 // multiplication already generates this expression.
1147 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1148 Value *UVal = U->getValue();
1149 for (User *UR : UVal->users()) {
1150 // If U is a constant, it may be used by a ConstantExpr.
1151 Instruction *UI = dyn_cast<Instruction>(UR);
1152 if (UI && UI->getOpcode() == Instruction::Mul &&
1153 SE.isSCEVable(UI->getType())) {
1154 return SE.getSCEV(UI) == S;
1155 }
1156 }
1157 }
1158 }
1159
1160 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1161 if (isExistingPhi(AR, SE))
1162 return false;
1163 }
1164
1165 // For now, consider any other type of expression (div/mul/min/max) high cost.
1166 return true;
1167}
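// Illustrative note (assumed): under these rules, expanding %a, a constant,
// sext(%a), 4 * %a, or an addrec that already exists as a loop phi is treated
// as cheap, whereas a udiv, or a product of two unknowns with no existing
// multiply instruction computing it, is treated as a high-cost expansion.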
1168
1169namespace {
1170
1171class LSRUse;
1172
1173} // end anonymous namespace
1174
1175/// Check if the addressing mode defined by \p F is completely
1176/// folded in \p LU at isel time.
1177/// This includes address-mode folding and special icmp tricks.
1178/// This function returns true if \p LU can accommodate what \p F
1179/// defines and up to 1 base + 1 scaled + offset.
1180/// In other words, if \p F has several base registers, this function may
1181/// still return true. Therefore, users still need to account for
1182/// additional base registers and/or unfolded offsets to derive an
1183/// accurate cost model.
1184static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1185 const LSRUse &LU, const Formula &F);
1186
1187// Get the cost of the scaling factor used in F for LU.
1188static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1189 const LSRUse &LU, const Formula &F,
1190 const Loop &L);
1191
1192namespace {
1193
1194/// This class is used to measure and compare candidate formulae.
1195class Cost {
1196 const Loop *L = nullptr;
1197 ScalarEvolution *SE = nullptr;
1198 const TargetTransformInfo *TTI = nullptr;
1199 TargetTransformInfo::LSRCost C;
1200 TTI::AddressingModeKind AMK;
1201
1202public:
1203 Cost() = delete;
1204 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1206 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1207 C.Insns = 0;
1208 C.NumRegs = 0;
1209 C.AddRecCost = 0;
1210 C.NumIVMuls = 0;
1211 C.NumBaseAdds = 0;
1212 C.ImmCost = 0;
1213 C.SetupCost = 0;
1214 C.ScaleCost = 0;
1215 }
1216
1217 bool isLess(const Cost &Other) const;
1218
1219 void Lose();
1220
1221#ifndef NDEBUG
1222 // Once any of the metrics loses, they must all remain losers.
1223 bool isValid() {
1224 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1225 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1226 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1227 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1228 }
1229#endif
1230
1231 bool isLoser() {
1232 assert(isValid() && "invalid cost");
1233 return C.NumRegs == ~0u;
1234 }
1235
1236 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1237 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1238 bool HardwareLoopProfitable,
1239 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1240
1241 void print(raw_ostream &OS) const;
1242 void dump() const;
1243
1244private:
1245 void RateRegister(const Formula &F, const SCEV *Reg,
1246 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1247 bool HardwareLoopProfitable);
1248 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1249 SmallPtrSetImpl<const SCEV *> &Regs,
1250 const LSRUse &LU, bool HardwareLoopProfitable,
1251 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1252};
1253
1254/// An operand value in an instruction which is to be replaced with some
1255/// equivalent, possibly strength-reduced, replacement.
1256struct LSRFixup {
1257 /// The instruction which will be updated.
1258 Instruction *UserInst = nullptr;
1259
1260 /// The operand of the instruction which will be replaced. The operand may be
1261 /// used more than once; every instance will be replaced.
1262 Value *OperandValToReplace = nullptr;
1263
1264 /// If this user is to use the post-incremented value of an induction
1265 /// variable, this set is non-empty and holds the loops associated with the
1266 /// induction variable.
1267 PostIncLoopSet PostIncLoops;
1268
1269 /// A constant offset to be added to the LSRUse expression. This allows
1270 /// multiple fixups to share the same LSRUse with different offsets, for
1271 /// example in an unrolled loop.
1272 Immediate Offset = Immediate::getZero();
1273
1274 LSRFixup() = default;
1275
1276 bool isUseFullyOutsideLoop(const Loop *L) const;
1277
1278 void print(raw_ostream &OS) const;
1279 void dump() const;
1280};
1281
1282/// This class holds the state that LSR keeps for each use in IVUsers, as well
1283/// as uses invented by LSR itself. It includes information about what kinds of
1284/// things can be folded into the user, information about the user itself, and
1285/// information about how the use may be satisfied. TODO: Represent multiple
1286/// users of the same expression in common?
1287class LSRUse {
1288 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1289
1290public:
1291 /// An enum for a kind of use, indicating what types of scaled and immediate
1292 /// operands it might support.
1293 enum KindType {
1294 Basic, ///< A normal use, with no folding.
1295 Special, ///< A special case of basic, allowing -1 scales.
1296 Address, ///< An address use; folding according to TargetLowering
1297 ICmpZero ///< An equality icmp with both operands folded into one.
1298 // TODO: Add a generic icmp too?
1299 };
1300
1301 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1302
1303 KindType Kind;
1304 MemAccessTy AccessTy;
1305
1306 /// The list of operands which are to be replaced.
1307 SmallVector<LSRFixup, 8> Fixups;
1308
1309 /// Keep track of the min and max offsets of the fixups.
1310 Immediate MinOffset = Immediate::getFixedMax();
1311 Immediate MaxOffset = Immediate::getFixedMin();
1312
1313 /// This records whether all of the fixups using this LSRUse are outside of
1314 /// the loop, in which case some special-case heuristics may be used.
1315 bool AllFixupsOutsideLoop = true;
1316
1317 /// This records whether all of the fixups using this LSRUse are unconditional
1318 /// within the loop, meaning they will be executed on every path to the loop
1319 /// latch. This includes fixups before early exits.
1320 bool AllFixupsUnconditional = true;
1321
1322 /// RigidFormula is set to true to guarantee that this use will be associated
1323 /// with a single formula--the one that initially matched. Some SCEV
1324 /// expressions cannot be expanded. This allows LSR to consider the registers
1325 /// used by those expressions without the need to expand them later after
1326 /// changing the formula.
1327 bool RigidFormula = false;
1328
1329 /// This records the widest use type for any fixup using this
1330 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1331 /// fixup widths to be equivalent, because the narrower one may be relying on
1332 /// the implicit truncation to truncate away bogus bits.
1333 Type *WidestFixupType = nullptr;
1334
1335 /// A list of ways to build a value that can satisfy this user. After the
1336 /// list is populated, one of these is selected heuristically and used to
1337 /// formulate a replacement for OperandValToReplace in UserInst.
1338 SmallVector<Formula, 12> Formulae;
1339
1340 /// The set of register candidates used by all formulae in this LSRUse.
1341 SmallPtrSet<const SCEV *, 4> Regs;
1342
1343 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1344
1345 LSRFixup &getNewFixup() {
1346 Fixups.push_back(LSRFixup());
1347 return Fixups.back();
1348 }
1349
1350 void pushFixup(LSRFixup &f) {
1351 Fixups.push_back(f);
1352 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1353 MaxOffset = f.Offset;
1354 if (Immediate::isKnownLT(f.Offset, MinOffset))
1355 MinOffset = f.Offset;
1356 }
1357
1358 bool HasFormulaWithSameRegs(const Formula &F) const;
1359 float getNotSelectedProbability(const SCEV *Reg) const;
1360 bool InsertFormula(const Formula &F, const Loop &L);
1361 void DeleteFormula(Formula &F);
1362 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1363
1364 void print(raw_ostream &OS) const;
1365 void dump() const;
1366};
1367
1368} // end anonymous namespace
1369
1370static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1371 LSRUse::KindType Kind, MemAccessTy AccessTy,
1372 GlobalValue *BaseGV, Immediate BaseOffset,
1373 bool HasBaseReg, int64_t Scale,
1374 Instruction *Fixup = nullptr);
1375
1376static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1377 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1378 return 1;
1379 if (Depth == 0)
1380 return 0;
1381 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1382 return getSetupCost(S->getStart(), Depth - 1);
1383 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1384 return getSetupCost(S->getOperand(), Depth - 1);
1385 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1386 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1387 [&](unsigned i, const SCEV *Reg) {
1388 return i + getSetupCost(Reg, Depth - 1);
1389 });
1390 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1391 return getSetupCost(S->getLHS(), Depth - 1) +
1392 getSetupCost(S->getRHS(), Depth - 1);
1393 return 0;
1394}
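// Worked example (assumed): with a sufficient depth limit,
// getSetupCost({%a,+,4}<%L>) recurses to the start value and yields 1 (the
// SCEVUnknown %a), while ((%a + %b) udiv %c) yields 1 + 1 + 1 = 3; once Depth
// reaches zero, anything other than an unknown or constant counts as 0.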
1395
1396/// Tally up interesting quantities from the given register.
1397void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1398 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1399 bool HardwareLoopProfitable) {
1400 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1401 // If this is an addrec for another loop, it should be an invariant
1402 // with respect to L since L is the innermost loop (at least
1403 // for now LSR only handles innermost loops).
1404 if (AR->getLoop() != L) {
1405 // If the AddRec exists, consider it register-free and leave it alone.
1406 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1407 return;
1408
1409 // It is bad to allow LSR for current loop to add induction variables
1410 // for its sibling loops.
1411 if (!AR->getLoop()->contains(L)) {
1412 Lose();
1413 return;
1414 }
1415
1416 // Otherwise, it will be an invariant with respect to Loop L.
1417 ++C.NumRegs;
1418 return;
1419 }
1420
1421 unsigned LoopCost = 1;
1422 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1423 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1424 const SCEV *Start;
1425 const APInt *Step;
1426 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1427 // If the step size matches the base offset, we could use pre-indexed
1428 // addressing.
1429 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1430 F.BaseOffset.isFixed() &&
1431 *Step == F.BaseOffset.getFixedValue();
1432 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1433 !isa<SCEVConstant>(Start) &&
1434 SE->isLoopInvariant(Start, L);
1435 // We can only pre or post index when the load/store is unconditional.
1436 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1437 LoopCost = 0;
1438 }
1439 }
1440
1441 // If the loop counts down to zero and we'll be using a hardware loop then
1442 // the addrec will be combined into the hardware loop instruction.
1443 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1444 HardwareLoopProfitable)
1445 LoopCost = 0;
1446 C.AddRecCost += LoopCost;
1447
1448 // Add the step value register, if it needs one.
1449 // TODO: The non-affine case isn't precisely modeled here.
1450 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1451 if (!Regs.count(AR->getOperand(1))) {
1452 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1453 if (isLoser())
1454 return;
1455 }
1456 }
1457 }
1458 ++C.NumRegs;
1459
1460 // Rough heuristic; favor registers which don't require extra setup
1461 // instructions in the preheader.
1462 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1463 // Ensure we don't, even with the recursion limit, produce invalid costs.
1464 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1465
1466 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1467 SE->containsAddRecurrence(Reg);
1468}
1469
1470/// Record this register in the set. If we haven't seen it before, rate
1471/// it. Optional LoserRegs provides a way to declare any formula that refers to
1472/// one of those regs an instant loser.
1473void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1474 SmallPtrSetImpl<const SCEV *> &Regs,
1475 const LSRUse &LU, bool HardwareLoopProfitable,
1476 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1477 if (LoserRegs && LoserRegs->count(Reg)) {
1478 Lose();
1479 return;
1480 }
1481 if (Regs.insert(Reg).second) {
1482 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1483 if (LoserRegs && isLoser())
1484 LoserRegs->insert(Reg);
1485 }
1486}
1487
1488void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1489 const DenseSet<const SCEV *> &VisitedRegs,
1490 const LSRUse &LU, bool HardwareLoopProfitable,
1491 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1492 if (isLoser())
1493 return;
1494 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1495 // Tally up the registers.
1496 unsigned PrevAddRecCost = C.AddRecCost;
1497 unsigned PrevNumRegs = C.NumRegs;
1498 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1499 if (const SCEV *ScaledReg = F.ScaledReg) {
1500 if (VisitedRegs.count(ScaledReg)) {
1501 Lose();
1502 return;
1503 }
1504 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1505 LoserRegs);
1506 if (isLoser())
1507 return;
1508 }
1509 for (const SCEV *BaseReg : F.BaseRegs) {
1510 if (VisitedRegs.count(BaseReg)) {
1511 Lose();
1512 return;
1513 }
1514 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1515 LoserRegs);
1516 if (isLoser())
1517 return;
1518 }
1519
1520 // Determine how many (unfolded) adds we'll need inside the loop.
1521 size_t NumBaseParts = F.getNumRegs();
1522 if (NumBaseParts > 1)
1523 // Do not count the base and a possible second register if the target
1524 // allows folding 2 registers.
1525 C.NumBaseAdds +=
1526 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1527 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1528
1529 // Accumulate non-free scaling amounts.
1530 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1531
1532 // Tally up the non-zero immediates.
1533 for (const LSRFixup &Fixup : LU.Fixups) {
1534 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1535 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1536 if (F.BaseGV)
1537 C.ImmCost += 64; // Handle symbolic values conservatively.
1538 // TODO: This should probably be the pointer size.
1539 else if (Offset.isNonZero())
1540 C.ImmCost +=
1541 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1542
1543 // Check with target if this offset with this instruction is
1544 // specifically not supported.
1545 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1546 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1547 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1548 C.NumBaseAdds++;
1549 } else {
1550 // Incompatible immediate type; increase the cost to avoid using it.
1551 C.ImmCost += 2048;
1552 }
1553 }
1554
1555 // If we don't count instruction cost exit here.
1556 if (!InsnsCost) {
1557 assert(isValid() && "invalid cost");
1558 return;
1559 }
1560
1561 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1562 // additional instruction (at least fill).
1563 // TODO: Need distinguish register class?
1564 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1565 TTI->getRegisterClassForType(false, F.getType())) - 1;
1566 if (C.NumRegs > TTIRegNum) {
1567 // The cost already exceeded TTIRegNum, so only newly added registers can
1568 // add new instructions.
1569 if (PrevNumRegs > TTIRegNum)
1570 C.Insns += (C.NumRegs - PrevNumRegs);
1571 else
1572 C.Insns += (C.NumRegs - TTIRegNum);
1573 }
1574
1575 // If an ICmpZero formula does not end at 0, it cannot be replaced by
1576 // just an add or sub; we'll need to compare the final result of the AddRec.
1577 // That means we'll need an additional instruction. But if the target can
1578 // macro-fuse a compare with a branch, don't count this extra instruction.
1579 // For -10 + {0, +, 1}:
1580 // i = i + 1;
1581 // cmp i, 10
1582 //
1583 // For {-10, +, 1}:
1584 // i = i + 1;
1585 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1586 !TTI->canMacroFuseCmp())
1587 C.Insns++;
1588 // Each new AddRec adds 1 instruction to calculation.
1589 C.Insns += (C.AddRecCost - PrevAddRecCost);
1590
1591 // BaseAdds adds instructions for unfolded registers.
1592 if (LU.Kind != LSRUse::ICmpZero)
1593 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1594 assert(isValid() && "invalid cost");
1595}
1596
1597/// Set this cost to a losing value.
1598void Cost::Lose() {
1599 C.Insns = std::numeric_limits<unsigned>::max();
1600 C.NumRegs = std::numeric_limits<unsigned>::max();
1601 C.AddRecCost = std::numeric_limits<unsigned>::max();
1602 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1603 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1604 C.ImmCost = std::numeric_limits<unsigned>::max();
1605 C.SetupCost = std::numeric_limits<unsigned>::max();
1606 C.ScaleCost = std::numeric_limits<unsigned>::max();
1607}
1608
1609/// Choose the lower cost.
1610bool Cost::isLess(const Cost &Other) const {
1611 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1612 C.Insns != Other.C.Insns)
1613 return C.Insns < Other.C.Insns;
1614 return TTI->isLSRCostLess(C, Other.C);
1615}
1616
1617#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1618void Cost::print(raw_ostream &OS) const {
1619 if (InsnsCost)
1620 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1621 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1622 if (C.AddRecCost != 0)
1623 OS << ", with addrec cost " << C.AddRecCost;
1624 if (C.NumIVMuls != 0)
1625 OS << ", plus " << C.NumIVMuls << " IV mul"
1626 << (C.NumIVMuls == 1 ? "" : "s");
1627 if (C.NumBaseAdds != 0)
1628 OS << ", plus " << C.NumBaseAdds << " base add"
1629 << (C.NumBaseAdds == 1 ? "" : "s");
1630 if (C.ScaleCost != 0)
1631 OS << ", plus " << C.ScaleCost << " scale cost";
1632 if (C.ImmCost != 0)
1633 OS << ", plus " << C.ImmCost << " imm cost";
1634 if (C.SetupCost != 0)
1635 OS << ", plus " << C.SetupCost << " setup cost";
1636}
1637
1638LLVM_DUMP_METHOD void Cost::dump() const {
1639 print(errs()); errs() << '\n';
1640}
1641#endif
1642
1643/// Test whether this fixup always uses its value outside of the given loop.
1644bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1645 // PHI nodes use their value in their incoming blocks.
1646 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1647 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1648 if (PN->getIncomingValue(i) == OperandValToReplace &&
1649 L->contains(PN->getIncomingBlock(i)))
1650 return false;
1651 return true;
1652 }
1653
1654 return !L->contains(UserInst);
1655}
1656
1657#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1658void LSRFixup::print(raw_ostream &OS) const {
1659 OS << "UserInst=";
1660 // Store is common and interesting enough to be worth special-casing.
1661 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1662 OS << "store ";
1663 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1664 } else if (UserInst->getType()->isVoidTy())
1665 OS << UserInst->getOpcodeName();
1666 else
1667 UserInst->printAsOperand(OS, /*PrintType=*/false);
1668
1669 OS << ", OperandValToReplace=";
1670 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1671
1672 for (const Loop *PIL : PostIncLoops) {
1673 OS << ", PostIncLoop=";
1674 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1675 }
1676
1677 if (Offset.isNonZero())
1678 OS << ", Offset=" << Offset;
1679}
1680
1681LLVM_DUMP_METHOD void LSRFixup::dump() const {
1682 print(errs()); errs() << '\n';
1683}
1684#endif
1685
1686 /// Test whether this use has a formula with the same registers as the given
1687 /// formula.
1688 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1689 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1690 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1691 // Unstable sort by host order ok, because this is only used for uniquifying.
1692 llvm::sort(Key);
1693 return Uniquifier.count(Key);
1694}
1695
1696 /// Return the probability of selecting a formula that does not reference Reg.
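/// For example (illustrative numbers): if this use has 4 formulae and exactly
/// 1 of them references Reg, the returned probability is (4 - 1) / 4 = 0.75.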
1697float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1698 unsigned FNum = 0;
1699 for (const Formula &F : Formulae)
1700 if (F.referencesReg(Reg))
1701 FNum++;
1702 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1703}
1704
1705/// If the given formula has not yet been inserted, add it to the list, and
1706/// return true. Return false otherwise. The formula must be in canonical form.
1707bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1708 assert(F.isCanonical(L) && "Invalid canonical representation");
1709
1710 if (!Formulae.empty() && RigidFormula)
1711 return false;
1712 
1713 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1714 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1715 // Unstable sort by host order ok, because this is only used for uniquifying.
1716 llvm::sort(Key);
1717
1718 if (!Uniquifier.insert(Key).second)
1719 return false;
1720
1721 // Using a register to hold the value of 0 is not profitable.
1722 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1723 "Zero allocated in a scaled register!");
1724#ifndef NDEBUG
1725 for (const SCEV *BaseReg : F.BaseRegs)
1726 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1727#endif
1728
1729 // Add the formula to the list.
1730 Formulae.push_back(F);
1731
1732 // Record registers now being used by this use.
1733 Regs.insert_range(F.BaseRegs);
1734 if (F.ScaledReg)
1735 Regs.insert(F.ScaledReg);
1736
1737 return true;
1738}
1739
1740/// Remove the given formula from this use's list.
1741void LSRUse::DeleteFormula(Formula &F) {
1742 if (&F != &Formulae.back())
1743 std::swap(F, Formulae.back());
1744 Formulae.pop_back();
1745}
1746
1747/// Recompute the Regs field, and update RegUses.
1748void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1749 // Now that we've filtered out some formulae, recompute the Regs set.
1750 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1751 Regs.clear();
1752 for (const Formula &F : Formulae) {
1753 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1754 Regs.insert_range(F.BaseRegs);
1755 }
1756
1757 // Update the RegTracker.
1758 for (const SCEV *S : OldRegs)
1759 if (!Regs.count(S))
1760 RegUses.dropRegister(S, LUIdx);
1761}
1762
1763#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1764void LSRUse::print(raw_ostream &OS) const {
1765 OS << "LSR Use: Kind=";
1766 switch (Kind) {
1767 case Basic: OS << "Basic"; break;
1768 case Special: OS << "Special"; break;
1769 case ICmpZero: OS << "ICmpZero"; break;
1770 case Address:
1771 OS << "Address of ";
1772 if (AccessTy.MemTy->isPointerTy())
1773 OS << "pointer"; // the full pointer type could be really verbose
1774 else {
1775 OS << *AccessTy.MemTy;
1776 }
1777
1778 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1779 }
1780
1781 OS << ", Offsets={";
1782 bool NeedComma = false;
1783 for (const LSRFixup &Fixup : Fixups) {
1784 if (NeedComma) OS << ',';
1785 OS << Fixup.Offset;
1786 NeedComma = true;
1787 }
1788 OS << '}';
1789
1790 if (AllFixupsOutsideLoop)
1791 OS << ", all-fixups-outside-loop";
1792
1793 if (AllFixupsUnconditional)
1794 OS << ", all-fixups-unconditional";
1795
1796 if (WidestFixupType)
1797 OS << ", widest fixup type: " << *WidestFixupType;
1798}
1799
1800LLVM_DUMP_METHOD void LSRUse::dump() const {
1801 print(errs()); errs() << '\n';
1802}
1803#endif
1804 
1805 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1806 LSRUse::KindType Kind, MemAccessTy AccessTy,
1807 GlobalValue *BaseGV, Immediate BaseOffset,
1808 bool HasBaseReg, int64_t Scale,
1809 Instruction *Fixup /* = nullptr */) {
1810 switch (Kind) {
1811 case LSRUse::Address: {
1812 int64_t FixedOffset =
1813 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1814 int64_t ScalableOffset =
1815 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1816 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1817 HasBaseReg, Scale, AccessTy.AddrSpace,
1818 Fixup, ScalableOffset);
1819 }
1820 case LSRUse::ICmpZero:
1821 // There's not even a target hook for querying whether it would be legal to
1822 // fold a GV into an ICmp.
1823 if (BaseGV)
1824 return false;
1825
1826 // ICmp only has two operands; don't allow more than two non-trivial parts.
1827 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1828 return false;
1829
1830 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1831 // putting the scaled register in the other operand of the icmp.
1832 if (Scale != 0 && Scale != -1)
1833 return false;
1834
1835 // If we have low-level target information, ask the target if it can fold an
1836 // integer immediate on an icmp.
1837 if (BaseOffset.isNonZero()) {
1838 // We don't have an interface to query whether the target supports
1839 // icmpzero against scalable quantities yet.
1840 if (BaseOffset.isScalable())
1841 return false;
1842
1843 // We have one of:
1844 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1845 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1846 // Offs is the ICmp immediate.
1847 if (Scale == 0)
1848 // The cast does the right thing with
1849 // std::numeric_limits<int64_t>::min().
1850 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1851 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1852 }
1853
1854 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1855 return true;
1856
1857 case LSRUse::Basic:
1858 // Only handle single-register values.
1859 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1860
1861 case LSRUse::Special:
1862 // Special case Basic to handle -1 scales.
1863 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1864 }
1865
1866 llvm_unreachable("Invalid LSRUse Kind!");
1867}
1868 
1869 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1870 Immediate MinOffset, Immediate MaxOffset,
1871 LSRUse::KindType Kind, MemAccessTy AccessTy,
1872 GlobalValue *BaseGV, Immediate BaseOffset,
1873 bool HasBaseReg, int64_t Scale) {
1874 if (BaseOffset.isNonZero() &&
1875 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1876 BaseOffset.isScalable() != MaxOffset.isScalable()))
1877 return false;
1878 // Check for overflow.
1879 int64_t Base = BaseOffset.getKnownMinValue();
1880 int64_t Min = MinOffset.getKnownMinValue();
1881 int64_t Max = MaxOffset.getKnownMinValue();
1882 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1883 return false;
1884 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1885 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1886 return false;
1887 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1888
1889 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1890 HasBaseReg, Scale) &&
1891 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1892 HasBaseReg, Scale);
1893}
1894 
1895 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1896 Immediate MinOffset, Immediate MaxOffset,
1897 LSRUse::KindType Kind, MemAccessTy AccessTy,
1898 const Formula &F, const Loop &L) {
1899 // For the purpose of isAMCompletelyFolded either having a canonical formula
1900 // or a scale not equal to zero is correct.
1901 // Problems may arise from non canonical formulae having a scale == 0.
1902 // Strictly speaking it would best to just rely on canonical formulae.
1903 // However, when we generate the scaled formulae, we first check that the
1904 // scaling factor is profitable before computing the actual ScaledReg for
1905 // compile time sake.
1906 assert((F.isCanonical(L) || F.Scale != 0));
1907 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1908 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1909}
1910
1911/// Test whether we know how to expand the current formula.
1912static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1913 Immediate MaxOffset, LSRUse::KindType Kind,
1914 MemAccessTy AccessTy, GlobalValue *BaseGV,
1915 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1916 // We know how to expand completely foldable formulae.
1917 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1918 BaseOffset, HasBaseReg, Scale) ||
1919 // Or formulae that use a base register produced by a sum of base
1920 // registers.
1921 (Scale == 1 &&
1922 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1923 BaseGV, BaseOffset, true, 0));
1924}
1925
1926static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1927 Immediate MaxOffset, LSRUse::KindType Kind,
1928 MemAccessTy AccessTy, const Formula &F) {
1929 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1930 F.BaseOffset, F.HasBaseReg, F.Scale);
1931}
1932 
1933 static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1934 Immediate Offset) {
1935 if (Offset.isScalable())
1936 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1937
1938 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1939}
1940 
1941 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1942 const LSRUse &LU, const Formula &F) {
1943 // Target may want to look at the user instructions.
1944 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1945 for (const LSRFixup &Fixup : LU.Fixups)
1946 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1947 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1948 F.Scale, Fixup.UserInst))
1949 return false;
1950 return true;
1951 }
1952
1953 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1954 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1955 F.Scale);
1956}
1957 
1958 static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1959 const LSRUse &LU, const Formula &F,
1960 const Loop &L) {
1961 if (!F.Scale)
1962 return 0;
1963
1964 // If the use is not completely folded in that instruction, we will have to
1965 // pay an extra cost only for scale != 1.
1966 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1967 LU.AccessTy, F, L))
1968 return F.Scale != 1;
1969
1970 switch (LU.Kind) {
1971 case LSRUse::Address: {
1972 // Check the scaling factor cost with both the min and max offsets.
1973 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1974 if (F.BaseOffset.isScalable()) {
1975 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1976 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1977 } else {
1978 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1979 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1980 }
1981 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1982 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1983 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1984 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1985 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1986 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1987
1988 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1989 "Legal addressing mode has an illegal cost!");
1990 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1991 }
1992 case LSRUse::ICmpZero:
1993 case LSRUse::Basic:
1994 case LSRUse::Special:
1995 // The use is completely folded, i.e., everything is folded into the
1996 // instruction.
1997 return 0;
1998 }
1999
2000 llvm_unreachable("Invalid LSRUse Kind!");
2001}
2002 
2003 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2004 LSRUse::KindType Kind, MemAccessTy AccessTy,
2005 GlobalValue *BaseGV, Immediate BaseOffset,
2006 bool HasBaseReg) {
2007 // Fast-path: zero is always foldable.
2008 if (BaseOffset.isZero() && !BaseGV)
2009 return true;
2010
2011 // Conservatively, create an address with an immediate and a
2012 // base and a scale.
2013 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2014
2015 // Canonicalize a scale of 1 to a base register if the formula doesn't
2016 // already have a base register.
2017 if (!HasBaseReg && Scale == 1) {
2018 Scale = 0;
2019 HasBaseReg = true;
2020 }
2021
2022 // FIXME: Try with + without a scale? Maybe based on TTI?
2023 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2024 // default for many architectures, not just AArch64 SVE. More investigation
2025 // needed later to determine if this should be used more widely than just
2026 // on scalable types.
2027 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2028 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2029 Scale = 0;
2030
2031 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2032 HasBaseReg, Scale);
2033}
2034 
2035 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2036 ScalarEvolution &SE, Immediate MinOffset,
2037 Immediate MaxOffset, LSRUse::KindType Kind,
2038 MemAccessTy AccessTy, const SCEV *S,
2039 bool HasBaseReg) {
2040 // Fast-path: zero is always foldable.
2041 if (S->isZero()) return true;
2042
2043 // Conservatively, create an address with an immediate and a
2044 // base and a scale.
2045 Immediate BaseOffset = ExtractImmediate(S, SE);
2046 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2047
2048 // If there's anything else involved, it's not foldable.
2049 if (!S->isZero()) return false;
2050
2051 // Fast-path: zero is always foldable.
2052 if (BaseOffset.isZero() && !BaseGV)
2053 return true;
2054
2055 if (BaseOffset.isScalable())
2056 return false;
2057
2058 // Conservatively, create an address with an immediate and a
2059 // base and a scale.
2060 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2061
2062 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2063 BaseOffset, HasBaseReg, Scale);
2064}
2065
2066namespace {
2067
2068/// An individual increment in a Chain of IV increments. Relate an IV user to
2069/// an expression that computes the IV it uses from the IV used by the previous
2070/// link in the Chain.
2071///
2072/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2073/// original IVOperand. The head of the chain's IVOperand is only valid during
2074/// chain collection, before LSR replaces IV users. During chain generation,
2075/// IncExpr can be used to find the new IVOperand that computes the same
2076/// expression.
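///
/// Illustrative example (not from a particular test case): for a sequence of
/// pointer increments p, p+4, p+8 used within one iteration, the head's
/// IncExpr is the SCEV of the original IV operand (p), and each subsequent
/// link's IncExpr is the constant 4 relating it to the previous link.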
2077struct IVInc {
2078 Instruction *UserInst;
2079 Value* IVOperand;
2080 const SCEV *IncExpr;
2081
2082 IVInc(Instruction *U, Value *O, const SCEV *E)
2083 : UserInst(U), IVOperand(O), IncExpr(E) {}
2084};
2085
2086// The list of IV increments in program order. We typically add the head of a
2087// chain without finding subsequent links.
2088 struct IVChain {
2089 SmallVector<IVInc, 1> Incs;
2090 const SCEV *ExprBase = nullptr;
2091
2092 IVChain() = default;
2093 IVChain(const IVInc &Head, const SCEV *Base)
2094 : Incs(1, Head), ExprBase(Base) {}
2095
2096 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2097
2098 // Return the first increment in the chain.
2099 const_iterator begin() const {
2100 assert(!Incs.empty());
2101 return std::next(Incs.begin());
2102 }
2103 const_iterator end() const {
2104 return Incs.end();
2105 }
2106
2107 // Returns true if this chain contains any increments.
2108 bool hasIncs() const { return Incs.size() >= 2; }
2109
2110 // Add an IVInc to the end of this chain.
2111 void add(const IVInc &X) { Incs.push_back(X); }
2112
2113 // Returns the last UserInst in the chain.
2114 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2115
2116 // Returns true if IncExpr can be profitably added to this chain.
2117 bool isProfitableIncrement(const SCEV *OperExpr,
2118 const SCEV *IncExpr,
2119 ScalarEvolution&);
2120};
2121
2122/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2123/// between FarUsers that definitely cross IV increments and NearUsers that may
2124/// be used between IV increments.
2125struct ChainUsers {
2126 SmallPtrSet<Instruction*, 4> FarUsers;
2127 SmallPtrSet<Instruction*, 4> NearUsers;
2128};
2129
2130/// This class holds state for the main loop strength reduction logic.
2131class LSRInstance {
2132 IVUsers &IU;
2133 ScalarEvolution &SE;
2134 DominatorTree &DT;
2135 LoopInfo &LI;
2136 AssumptionCache &AC;
2137 TargetLibraryInfo &TLI;
2138 const TargetTransformInfo &TTI;
2139 Loop *const L;
2140 MemorySSAUpdater *MSSAU;
2142 mutable SCEVExpander Rewriter;
2143 bool Changed = false;
2144 bool HardwareLoopProfitable = false;
2145
2146 /// This is the insert position at which the current loop's induction variable
2147 /// increment should be placed. In simple loops, this is the latch block's
2148 /// terminator. But in more complicated cases, this is a position which will
2149 /// dominate all the in-loop post-increment users.
2150 Instruction *IVIncInsertPos = nullptr;
2151
2152 /// Interesting factors between use strides.
2153 ///
2154 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2155 /// default, a SmallDenseSet, because we need to use the full range of
2156 /// int64_ts, and there's currently no good way of doing that with
2157 /// SmallDenseSet.
2158 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2159
2160 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2161 /// the solution is not profitable.
2162 Cost BaselineCost;
2163
2164 /// Interesting use types, to facilitate truncation reuse.
2165 SmallSetVector<Type *, 4> Types;
2166
2167 /// The list of interesting uses.
2168 SmallVector<LSRUse, 16> Uses;
2169
2170 /// Track which uses use which register candidates.
2171 RegUseTracker RegUses;
2172
2173 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2174 // have more than a few IV increment chains in a loop. Missing a Chain falls
2175 // back to normal LSR behavior for those uses.
2176 static const unsigned MaxChains = 8;
2177
2178 /// IV users can form a chain of IV increments.
2179 SmallVector<IVChain, MaxChains> IVChainVec;
2180
2181 /// IV users that belong to profitable IVChains.
2182 SmallPtrSet<Use*, MaxChains> IVIncSet;
2183
2184 /// Induction variables that were generated and inserted by the SCEV Expander.
2185 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2186
2187 // Inserting instructions in the loop and using them as a PHI's input could
2188 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2189 // corresponding incoming block is not loop exiting). So collect all such
2190 // instructions to form LCSSA for them later.
2191 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2192
2193 void OptimizeShadowIV();
2194 bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2195 Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
2196 void OptimizeLoopTermCond();
2197
2198 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2199 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2200 void FinalizeChain(IVChain &Chain);
2201 void CollectChains();
2202 void GenerateIVChain(const IVChain &Chain,
2203 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2204
2205 void CollectInterestingTypesAndFactors();
2206 void CollectFixupsAndInitialFormulae();
2207
2208 // Support for sharing of LSRUses between LSRFixups.
2209 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2210 UseMapTy UseMap;
2211
2212 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2213 LSRUse::KindType Kind, MemAccessTy AccessTy);
2214
2215 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2216 MemAccessTy AccessTy);
2217
2218 void DeleteUse(LSRUse &LU, size_t LUIdx);
2219
2220 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2221
2222 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2223 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2224 void CountRegisters(const Formula &F, size_t LUIdx);
2225 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2226 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2227
2228 void CollectLoopInvariantFixupsAndFormulae();
2229
2230 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2231 unsigned Depth = 0);
2232
2233 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2234 const Formula &Base, unsigned Depth,
2235 size_t Idx, bool IsScaledReg = false);
2236 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2237 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2238 const Formula &Base, size_t Idx,
2239 bool IsScaledReg = false);
2240 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2241 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2242 const Formula &Base,
2243 const SmallVectorImpl<Immediate> &Worklist,
2244 size_t Idx, bool IsScaledReg = false);
2245 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2246 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2247 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2248 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2249 void GenerateCrossUseConstantOffsets();
2250 void GenerateAllReuseFormulae();
2251
2252 void FilterOutUndesirableDedicatedRegisters();
2253
2254 size_t EstimateSearchSpaceComplexity() const;
2255 void NarrowSearchSpaceByDetectingSupersets();
2256 void NarrowSearchSpaceByCollapsingUnrolledCode();
2257 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2258 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2259 void NarrowSearchSpaceByFilterPostInc();
2260 void NarrowSearchSpaceByDeletingCostlyFormulas();
2261 void NarrowSearchSpaceByPickingWinnerRegs();
2262 void NarrowSearchSpaceUsingHeuristics();
2263
2264 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2265 Cost &SolutionCost,
2266 SmallVectorImpl<const Formula *> &Workspace,
2267 const Cost &CurCost,
2268 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2269 DenseSet<const SCEV *> &VisitedRegs) const;
2270 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2271 
2272 BasicBlock::iterator
2273 HoistInsertPosition(BasicBlock::iterator IP,
2274 const SmallVectorImpl<Instruction *> &Inputs) const;
2275 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2276 const LSRFixup &LF,
2277 const LSRUse &LU) const;
2278
2279 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2280 BasicBlock::iterator IP,
2281 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2282 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2283 const Formula &F,
2284 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2285 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2286 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2287 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2288
2289public:
2290 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2291 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2292 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2293
2294 bool getChanged() const { return Changed; }
2295 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2296 return ScalarEvolutionIVs;
2297 }
2298
2299 void print_factors_and_types(raw_ostream &OS) const;
2300 void print_fixups(raw_ostream &OS) const;
2301 void print_uses(raw_ostream &OS) const;
2302 void print(raw_ostream &OS) const;
2303 void dump() const;
2304};
2305
2306} // end anonymous namespace
2307
2308/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2309/// the cast operation.
2310void LSRInstance::OptimizeShadowIV() {
2311 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2312 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2313 return;
2314
2315 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2316 UI != E; /* empty */) {
2317 IVUsers::const_iterator CandidateUI = UI;
2318 ++UI;
2319 Instruction *ShadowUse = CandidateUI->getUser();
2320 Type *DestTy = nullptr;
2321 bool IsSigned = false;
2322
2323 /* If shadow use is a int->float cast then insert a second IV
2324 to eliminate this cast.
2325
2326 for (unsigned i = 0; i < n; ++i)
2327 foo((double)i);
2328
2329 is transformed into
2330
2331 double d = 0.0;
2332 for (unsigned i = 0; i < n; ++i, ++d)
2333 foo(d);
2334 */
2335 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2336 IsSigned = false;
2337 DestTy = UCast->getDestTy();
2338 }
2339 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2340 IsSigned = true;
2341 DestTy = SCast->getDestTy();
2342 }
2343 if (!DestTy) continue;
2344
2345 // If target does not support DestTy natively then do not apply
2346 // this transformation.
2347 if (!TTI.isTypeLegal(DestTy)) continue;
2348
2349 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2350 if (!PH) continue;
2351 if (PH->getNumIncomingValues() != 2) continue;
2352
2353 // If the calculation in integers overflows, the result in FP type will
2354 // differ. So we only can do this transformation if we are guaranteed to not
2355 // deal with overflowing values
2356 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2357 if (!AR) continue;
2358 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2359 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2360
2361 Type *SrcTy = PH->getType();
2362 int Mantissa = DestTy->getFPMantissaWidth();
2363 if (Mantissa == -1) continue;
2364 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2365 continue;
2366
2367 unsigned Entry, Latch;
2368 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2369 Entry = 0;
2370 Latch = 1;
2371 } else {
2372 Entry = 1;
2373 Latch = 0;
2374 }
2375
2376 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2377 if (!Init) continue;
2378 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2379 (double)Init->getSExtValue() :
2380 (double)Init->getZExtValue());
2381
2382 BinaryOperator *Incr =
2383 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2384 if (!Incr) continue;
2385 if (Incr->getOpcode() != Instruction::Add
2386 && Incr->getOpcode() != Instruction::Sub)
2387 continue;
2388
2389 /* Initialize new IV, double d = 0.0 in above example. */
2390 ConstantInt *C = nullptr;
2391 if (Incr->getOperand(0) == PH)
2392 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2393 else if (Incr->getOperand(1) == PH)
2394 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2395 else
2396 continue;
2397
2398 if (!C) continue;
2399
2400 // Ignore negative constants, as the code below doesn't handle them
2401 // correctly. TODO: Remove this restriction.
2402 if (!C->getValue().isStrictlyPositive())
2403 continue;
2404
2405 /* Add new PHINode. */
2406 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2407 NewPH->setDebugLoc(PH->getDebugLoc());
2408
2409 /* create new increment. '++d' in above example. */
2410 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2411 BinaryOperator *NewIncr = BinaryOperator::Create(
2412 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2413 : Instruction::FSub,
2414 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2415 NewIncr->setDebugLoc(Incr->getDebugLoc());
2416
2417 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2418 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2419
2420 /* Remove cast operation */
2421 ShadowUse->replaceAllUsesWith(NewPH);
2422 ShadowUse->eraseFromParent();
2423 Changed = true;
2424 break;
2425 }
2426}
2427
2428/// If Cond has an operand that is an expression of an IV, set the IV user and
2429/// stride information and return true, otherwise return false.
2430bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2431 for (IVStrideUse &U : IU)
2432 if (U.getUser() == Cond) {
2433 // NOTE: we could handle setcc instructions with multiple uses here, but
2434 // InstCombine does it as well for simple uses, it's not clear that it
2435 // occurs enough in real life to handle.
2436 CondUse = &U;
2437 return true;
2438 }
2439 return false;
2440}
2441
2442/// Rewrite the loop's terminating condition if it uses a max computation.
2443///
2444/// This is a narrow solution to a specific, but acute, problem. For loops
2445/// like this:
2446///
2447/// i = 0;
2448/// do {
2449/// p[i] = 0.0;
2450/// } while (++i < n);
2451///
2452/// the trip count isn't just 'n', because 'n' might not be positive. And
2453/// unfortunately this can come up even for loops where the user didn't use
2454/// a C do-while loop. For example, seemingly well-behaved top-test loops
2455/// will commonly be lowered like this:
2456///
2457/// if (n > 0) {
2458/// i = 0;
2459/// do {
2460/// p[i] = 0.0;
2461/// } while (++i < n);
2462/// }
2463///
2464/// and then it's possible for subsequent optimization to obscure the if
2465/// test in such a way that indvars can't find it.
2466///
2467/// When indvars can't find the if test in loops like this, it creates a
2468/// max expression, which allows it to give the loop a canonical
2469/// induction variable:
2470///
2471/// i = 0;
2472/// max = n < 1 ? 1 : n;
2473/// do {
2474/// p[i] = 0.0;
2475/// } while (++i != max);
2476///
2477/// Canonical induction variables are necessary because the loop passes
2478/// are designed around them. The most obvious example of this is the
2479/// LoopInfo analysis, which doesn't remember trip count values. It
2480/// expects to be able to rediscover the trip count each time it is
2481/// needed, and it does this using a simple analysis that only succeeds if
2482/// the loop has a canonical induction variable.
2483///
2484/// However, when it comes time to generate code, the maximum operation
2485/// can be quite costly, especially if it's inside of an outer loop.
2486///
2487/// This function solves this problem by detecting this type of loop and
2488/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2489/// the instructions for the maximum computation.
2490Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
2491 // Check that the loop matches the pattern we're looking for.
2492 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2493 Cond->getPredicate() != CmpInst::ICMP_NE)
2494 return Cond;
2495
2496 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2497 if (!Sel || !Sel->hasOneUse()) return Cond;
2498
2499 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2500 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2501 return Cond;
2502 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2503
2504 // Add one to the backedge-taken count to get the trip count.
2505 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2506 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2507
2508 // Check for a max calculation that matches the pattern. There's no check
2509 // for ICMP_ULE here because the comparison would be with zero, which
2510 // isn't interesting.
2511 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2512 const SCEVNAryExpr *Max = nullptr;
2513 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2514 Pred = ICmpInst::ICMP_SLE;
2515 Max = S;
2516 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2517 Pred = ICmpInst::ICMP_SLT;
2518 Max = S;
2519 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2520 Pred = ICmpInst::ICMP_ULT;
2521 Max = U;
2522 } else {
2523 // No match; bail.
2524 return Cond;
2525 }
2526
2527 // To handle a max with more than two operands, this optimization would
2528 // require additional checking and setup.
2529 if (Max->getNumOperands() != 2)
2530 return Cond;
2531
2532 const SCEV *MaxLHS = Max->getOperand(0);
2533 const SCEV *MaxRHS = Max->getOperand(1);
2534
2535 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2536 // for a comparison with 1. For <= and >=, a comparison with zero.
2537 if (!MaxLHS ||
2538 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2539 return Cond;
2540
2541 // Check the relevant induction variable for conformance to
2542 // the pattern.
2543 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2544 if (!match(IV,
2546 return Cond;
2547
2548 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2549 "Loop condition operand is an addrec in a different loop!");
2550
2551 // Check the right operand of the select, and remember it, as it will
2552 // be used in the new comparison instruction.
2553 Value *NewRHS = nullptr;
2554 if (ICmpInst::isTrueWhenEqual(Pred)) {
2555 // Look for n+1, and grab n.
2556 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2557 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2558 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2559 NewRHS = BO->getOperand(0);
2560 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2561 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2562 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2563 NewRHS = BO->getOperand(0);
2564 if (!NewRHS)
2565 return Cond;
2566 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2567 NewRHS = Sel->getOperand(1);
2568 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2569 NewRHS = Sel->getOperand(2);
2570 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2571 NewRHS = SU->getValue();
2572 else
2573 // Max doesn't match expected pattern.
2574 return Cond;
2575
2576 // Determine the new comparison opcode. It may be signed or unsigned,
2577 // and the original comparison may be either equality or inequality.
2578 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2579 Pred = CmpInst::getInversePredicate(Pred);
2580
2581 // Ok, everything looks ok to change the condition into an SLT or SGE and
2582 // delete the max calculation.
2583 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2584 Cond->getOperand(0), NewRHS, "scmp");
2585
2586 // Delete the max calculation instructions.
2587 NewCond->setDebugLoc(Cond->getDebugLoc());
2588 Cond->replaceAllUsesWith(NewCond);
2589 CondUse->setUser(NewCond);
2590 Instruction *Cmp = cast<Instruction>(Sel->getCondition());
2591 Cond->eraseFromParent();
2592 Sel->eraseFromParent();
2593 if (Cmp->use_empty()) {
2594 salvageDebugInfo(*Cmp);
2595 Cmp->eraseFromParent();
2596 }
2597 return NewCond;
2598}
2599
2600/// Change loop terminating condition to use the postinc iv when possible.
2601void
2602LSRInstance::OptimizeLoopTermCond() {
2603 SmallPtrSet<Instruction *, 4> PostIncs;
2604
2605 // We need a different set of heuristics for rotated and non-rotated loops.
2606 // If a loop is rotated then the latch is also the backedge, so inserting
2607 // post-inc expressions just before the latch is ideal. To reduce live ranges
2608 // it also makes sense to rewrite terminating conditions to use post-inc
2609 // expressions.
2610 //
2611 // If the loop is not rotated then the latch is not a backedge; the latch
2612 // check is done in the loop head. Adding post-inc expressions before the
2613 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2614 // in the loop body. In this case we do *not* want to use post-inc expressions
2615 // in the latch check, and we want to insert post-inc expressions before
2616 // the backedge.
2617 BasicBlock *LatchBlock = L->getLoopLatch();
2618 SmallVector<BasicBlock*, 8> ExitingBlocks;
2619 L->getExitingBlocks(ExitingBlocks);
2620 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2621 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2622 IVIncInsertPos = LatchBlock->getTerminator();
2623 return;
2624 }
2625
2626 // Otherwise treat this as a rotated loop.
2627 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2628 // Get the terminating condition for the loop if possible. If we
2629 // can, we want to change it to use a post-incremented version of its
2630 // induction variable, to allow coalescing the live ranges for the IV into
2631 // one register value.
2632
2633 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2634 if (!TermBr || TermBr->isUnconditional())
2635 continue;
2636
2637 Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
2638 // If the argument to TermBr is an extractelement, then the source of that
2639 // instruction is what's generated the condition.
2640 auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
2641 if (Extract)
2642 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2643 // FIXME: We could do more here, like handling logical operations where one
2644 // side is a cmp that uses an induction variable.
2645 if (!Cond)
2646 continue;
2647
2648 // Search IVUsesByStride to find Cond's IVUse if there is one.
2649 IVStrideUse *CondUse = nullptr;
2650 if (!FindIVUserForCond(Cond, CondUse))
2651 continue;
2652
2653 // If the trip count is computed in terms of a max (due to ScalarEvolution
2654 // being unable to find a sufficient guard, for example), change the loop
2655 // comparison to use SLT or ULT instead of NE.
2656 // One consequence of doing this now is that it disrupts the count-down
2657 // optimization. That's not always a bad thing though, because in such
2658 // cases it may still be worthwhile to avoid a max.
2659 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2660 Cond = OptimizeMax(Cmp, CondUse);
2661
2662 // If this exiting block dominates the latch block, it may also use
2663 // the post-inc value if it won't be shared with other uses.
2664 // Check for dominance.
2665 if (!DT.dominates(ExitingBlock, LatchBlock))
2666 continue;
2667
2668 // Conservatively avoid trying to use the post-inc value in non-latch
2669 // exits if there may be pre-inc users in intervening blocks.
2670 if (LatchBlock != ExitingBlock)
2671 for (const IVStrideUse &UI : IU)
2672 // Test if the use is reachable from the exiting block. This dominator
2673 // query is a conservative approximation of reachability.
2674 if (&UI != CondUse &&
2675 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2676 // Conservatively assume there may be reuse if the quotient of their
2677 // strides could be a legal scale.
2678 const SCEV *A = IU.getStride(*CondUse, L);
2679 const SCEV *B = IU.getStride(UI, L);
2680 if (!A || !B) continue;
2681 if (SE.getTypeSizeInBits(A->getType()) !=
2682 SE.getTypeSizeInBits(B->getType())) {
2683 if (SE.getTypeSizeInBits(A->getType()) >
2684 SE.getTypeSizeInBits(B->getType()))
2685 B = SE.getSignExtendExpr(B, A->getType());
2686 else
2687 A = SE.getSignExtendExpr(A, B->getType());
2688 }
2689 if (const SCEVConstant *D =
2690 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2691 const ConstantInt *C = D->getValue();
2692 // Stride of one or negative one can have reuse with non-addresses.
2693 if (C->isOne() || C->isMinusOne())
2694 goto decline_post_inc;
2695 // Avoid weird situations.
2696 if (C->getValue().getSignificantBits() >= 64 ||
2697 C->getValue().isMinSignedValue())
2698 goto decline_post_inc;
2699 // Check for possible scaled-address reuse.
2700 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2701 MemAccessTy AccessTy =
2702 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2703 int64_t Scale = C->getSExtValue();
2704 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2705 /*BaseOffset=*/0,
2706 /*HasBaseReg=*/true, Scale,
2707 AccessTy.AddrSpace))
2708 goto decline_post_inc;
2709 Scale = -Scale;
2710 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2711 /*BaseOffset=*/0,
2712 /*HasBaseReg=*/true, Scale,
2713 AccessTy.AddrSpace))
2714 goto decline_post_inc;
2715 }
2716 }
2717 }
2718
2719 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2720 << *Cond << '\n');
2721
2722 // It's possible for the setcc instruction to be anywhere in the loop, and
2723 // possible for it to have multiple users. If it is not immediately before
2724 // the exiting block branch, move it.
2725 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2726 !Extract) {
2727 if (Cond->hasOneUse()) {
2728 Cond->moveBefore(TermBr->getIterator());
2729 } else {
2730 // Clone the terminating condition and insert into the loopend.
2731 Instruction *OldCond = Cond;
2732 Cond = Cond->clone();
2733 Cond->setName(L->getHeader()->getName() + ".termcond");
2734 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2735
2736 // Clone the IVUse, as the old use still exists!
2737 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2738 TermBr->replaceUsesOfWith(OldCond, Cond);
2739 }
2740 }
2741
2742 // If we get to here, we know that we can transform the setcc instruction to
2743 // use the post-incremented version of the IV, allowing us to coalesce the
2744 // live ranges for the IV correctly.
2745 CondUse->transformToPostInc(L);
2746 Changed = true;
2747
2748 PostIncs.insert(Cond);
2749 decline_post_inc:;
2750 }
2751
2752 // Determine an insertion point for the loop induction variable increment. It
2753 // must dominate all the post-inc comparisons we just set up, and it must
2754 // dominate the loop latch edge.
2755 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2756 for (Instruction *Inst : PostIncs)
2757 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2758}
2759
2760/// Determine if the given use can accommodate a fixup at the given offset and
2761/// other details. If so, update the use and return true.
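///
/// For example (illustrative offsets): a use currently spanning offsets
/// [0, 64] can absorb a new fixup at offset -8 only if an address with an
/// immediate of 64 - (-8) = 72 (plus a base register) is still foldable on
/// the target; if it is, MinOffset is lowered to -8.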
2762bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2763 bool HasBaseReg, LSRUse::KindType Kind,
2764 MemAccessTy AccessTy) {
2765 Immediate NewMinOffset = LU.MinOffset;
2766 Immediate NewMaxOffset = LU.MaxOffset;
2767 MemAccessTy NewAccessTy = AccessTy;
2768
2769 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2770 // something conservative, however this can pessimize in the case that one of
2771 // the uses will have all its uses outside the loop, for example.
2772 if (LU.Kind != Kind)
2773 return false;
2774
2775 // Check for a mismatched access type, and fall back conservatively as needed.
2776 // TODO: Be less conservative when the type is similar and can use the same
2777 // addressing modes.
2778 if (Kind == LSRUse::Address) {
2779 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2780 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2781 AccessTy.AddrSpace);
2782 }
2783 }
2784
2785 // Conservatively assume HasBaseReg is true for now.
2786 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2787 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2788 LU.MaxOffset - NewOffset, HasBaseReg))
2789 return false;
2790 NewMinOffset = NewOffset;
2791 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2792 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2793 NewOffset - LU.MinOffset, HasBaseReg))
2794 return false;
2795 NewMaxOffset = NewOffset;
2796 }
2797
2798 // FIXME: We should be able to handle some level of scalable offset support
2799 // for 'void', but in order to get basic support up and running this is
2800 // being left out.
2801 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2802 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2803 return false;
2804
2805 // Update the use.
2806 LU.MinOffset = NewMinOffset;
2807 LU.MaxOffset = NewMaxOffset;
2808 LU.AccessTy = NewAccessTy;
2809 return true;
2810}
2811
2812/// Return an LSRUse index and an offset value for a fixup which needs the given
2813/// expression, with the given kind and optional access type. Either reuse an
2814/// existing use or create a new one, as needed.
2815std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2816 LSRUse::KindType Kind,
2817 MemAccessTy AccessTy) {
2818 const SCEV *Copy = Expr;
2819 Immediate Offset = ExtractImmediate(Expr, SE);
2820
2821 // Basic uses can't accept any offset, for example.
2822 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2823 Offset, /*HasBaseReg=*/ true)) {
2824 Expr = Copy;
2825 Offset = Immediate::getFixed(0);
2826 }
2827
2828 std::pair<UseMapTy::iterator, bool> P =
2829 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2830 if (!P.second) {
2831 // A use already existed with this base.
2832 size_t LUIdx = P.first->second;
2833 LSRUse &LU = Uses[LUIdx];
2834 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2835 // Reuse this use.
2836 return std::make_pair(LUIdx, Offset);
2837 }
2838
2839 // Create a new use.
2840 size_t LUIdx = Uses.size();
2841 P.first->second = LUIdx;
2842 Uses.push_back(LSRUse(Kind, AccessTy));
2843 LSRUse &LU = Uses[LUIdx];
2844
2845 LU.MinOffset = Offset;
2846 LU.MaxOffset = Offset;
2847 return std::make_pair(LUIdx, Offset);
2848}
2849
2850/// Delete the given use from the Uses list.
2851void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2852 if (&LU != &Uses.back())
2853 std::swap(LU, Uses.back());
2854 Uses.pop_back();
2855
2856 // Update RegUses.
2857 RegUses.swapAndDropUse(LUIdx, Uses.size());
2858}
2859
2860 /// Look for a use distinct from OrigLU which has a formula with the same
2861 /// registers as the given formula.
2862LSRUse *
2863LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2864 const LSRUse &OrigLU) {
2865 // Search all uses for the formula. This could be more clever.
2866 for (LSRUse &LU : Uses) {
2867 // Check whether this use is close enough to OrigLU, to see whether it's
2868 // worthwhile looking through its formulae.
2869 // Ignore ICmpZero uses because they may contain formulae generated by
2870 // GenerateICmpZeroScales, in which case adding fixup offsets may
2871 // be invalid.
2872 if (&LU != &OrigLU &&
2873 LU.Kind != LSRUse::ICmpZero &&
2874 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2875 LU.WidestFixupType == OrigLU.WidestFixupType &&
2876 LU.HasFormulaWithSameRegs(OrigF)) {
2877 // Scan through this use's formulae.
2878 for (const Formula &F : LU.Formulae) {
2879 // Check to see if this formula has the same registers and symbols
2880 // as OrigF.
2881 if (F.BaseRegs == OrigF.BaseRegs &&
2882 F.ScaledReg == OrigF.ScaledReg &&
2883 F.BaseGV == OrigF.BaseGV &&
2884 F.Scale == OrigF.Scale &&
2885 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2886 if (F.BaseOffset.isZero())
2887 return &LU;
2888 // This is the formula where all the registers and symbols matched;
2889 // there aren't going to be any others. Since we declined it, we
2890 // can skip the rest of the formulae and proceed to the next LSRUse.
2891 break;
2892 }
2893 }
2894 }
2895 }
2896
2897 // Nothing looked good.
2898 return nullptr;
2899}
2900
2901void LSRInstance::CollectInterestingTypesAndFactors() {
2902 SmallSetVector<const SCEV *, 4> Strides;
2903
2904 // Collect interesting types and strides.
2905 SmallVector<const SCEV *, 4> Worklist;
2906 for (const IVStrideUse &U : IU) {
2907 const SCEV *Expr = IU.getExpr(U);
2908 if (!Expr)
2909 continue;
2910
2911 // Collect interesting types.
2912 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2913
2914 // Add strides for mentioned loops.
2915 Worklist.push_back(Expr);
2916 do {
2917 const SCEV *S = Worklist.pop_back_val();
2918 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2919 if (AR->getLoop() == L)
2920 Strides.insert(AR->getStepRecurrence(SE));
2921 Worklist.push_back(AR->getStart());
2922 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2923 append_range(Worklist, Add->operands());
2924 }
2925 } while (!Worklist.empty());
2926 }
2927
2928 // Compute interesting factors from the set of interesting strides.
2929 for (SmallSetVector<const SCEV *, 4>::const_iterator
2930 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2931 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2932 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2933 const SCEV *OldStride = *I;
2934 const SCEV *NewStride = *NewStrideIter;
2935
2936 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2937 SE.getTypeSizeInBits(NewStride->getType())) {
2938 if (SE.getTypeSizeInBits(OldStride->getType()) >
2939 SE.getTypeSizeInBits(NewStride->getType()))
2940 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2941 else
2942 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2943 }
2944 if (const SCEVConstant *Factor =
2945 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2946 SE, true))) {
2947 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2948 Factors.insert(Factor->getAPInt().getSExtValue());
2949 } else if (const SCEVConstant *Factor =
2950 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2951 NewStride,
2952 SE, true))) {
2953 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2954 Factors.insert(Factor->getAPInt().getSExtValue());
2955 }
2956 }
2957
2958 // If all uses use the same type, don't bother looking for truncation-based
2959 // reuse.
2960 if (Types.size() == 1)
2961 Types.clear();
2962
2963 LLVM_DEBUG(print_factors_and_types(dbgs()));
2964}
2965
2966/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2967/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2968/// IVStrideUses, we could partially skip this.
2969 static User::op_iterator
2970 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2971 Loop *L, ScalarEvolution &SE) {
2972 for(; OI != OE; ++OI) {
2973 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2974 if (!SE.isSCEVable(Oper->getType()))
2975 continue;
2976
2977 if (const SCEVAddRecExpr *AR =
2978 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2979 if (AR->getLoop() == L)
2980 break;
2981 }
2982 }
2983 }
2984 return OI;
2985}
2986
2987/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2988 /// a convenient helper.
2989 static Value *getWideOperand(Value *Oper) {
2990 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2991 return Trunc->getOperand(0);
2992 return Oper;
2993}
2994
2995/// Return an approximation of this SCEV expression's "base", or NULL for any
2996/// constant. Returning the expression itself is conservative. Returning a
2997/// deeper subexpression is more precise and valid as long as it isn't less
2998/// complex than another subexpression. For expressions involving multiple
2999/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3000/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3001/// IVInc==b-a.
3002///
3003/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3004/// SCEVUnknown, we simply return the rightmost SCEV operand.
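///
/// For example (illustrative): for {(%a + (4 * %i)),+,8}, this follows the
/// addrec start, skips the scaled (4 * %i) operand, and returns the pointer
/// SCEVUnknown %a, so accesses based on %a and accesses based on %b end up
/// with different bases and are not chained together.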
3005static const SCEV *getExprBase(const SCEV *S) {
3006 switch (S->getSCEVType()) {
3007 default: // including scUnknown.
3008 return S;
3009 case scConstant:
3010 case scVScale:
3011 return nullptr;
3012 case scTruncate:
3013 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3014 case scZeroExtend:
3015 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3016 case scSignExtend:
3017 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3018 case scAddExpr: {
3019 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3020 // there's nothing more complex.
3021 // FIXME: not sure if we want to recognize negation.
3022 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3023 for (const SCEV *SubExpr : reverse(Add->operands())) {
3024 if (SubExpr->getSCEVType() == scAddExpr)
3025 return getExprBase(SubExpr);
3026
3027 if (SubExpr->getSCEVType() != scMulExpr)
3028 return SubExpr;
3029 }
3030 return S; // all operands are scaled, be conservative.
3031 }
3032 case scAddRecExpr:
3033 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3034 }
3035 llvm_unreachable("Unknown SCEV kind!");
3036}
3037
3038/// Return true if the chain increment is profitable to expand into a loop
3039/// invariant value, which may require its own register. A profitable chain
3040/// increment will be an offset relative to the same base. We allow such offsets
3041 /// to potentially be used as the chain increment as long as they are not
3042 /// obviously expensive to expand using real instructions.
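///
/// Illustrative: a constant increment such as 8 from the previous link is
/// acceptable; a non-constant increment is rejected if the operand is merely a
/// constant offset from the chain head, and is otherwise accepted only when
/// expanding it is not considered a high-cost expansion.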
3043bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3044 const SCEV *IncExpr,
3045 ScalarEvolution &SE) {
3046 // Aggressively form chains when -stress-ivchain.
3047 if (StressIVChain)
3048 return true;
3049
3050 // Do not replace a constant offset from IV head with a nonconstant IV
3051 // increment.
3052 if (!isa<SCEVConstant>(IncExpr)) {
3053 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3054 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3055 return false;
3056 }
3057
3058 SmallPtrSet<const SCEV*, 8> Processed;
3059 return !isHighCostExpansion(IncExpr, Processed, SE);
3060}
3061
3062/// Return true if the number of registers needed for the chain is estimated to
3063/// be less than the number required for the individual IV users. First prohibit
3064/// any IV users that keep the IV live across increments (the Users set should
3065/// be empty). Next count the number and type of increments in the chain.
3066///
3067/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3068 /// effectively use postinc addressing modes. Only consider it profitable if the
3069/// increments can be computed in fewer registers when chained.
3070///
3071 /// TODO: Consider an IVInc free if it's already used in other chains.
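///
/// Worked example (illustrative): a chain whose tail is the existing header
/// phi and which contains two constant increments scores
/// 1 (chain register) - 1 (phi already exists) - 1 (constants fold away) = -1,
/// which is below zero and therefore profitable; each variable increment would
/// add 1 back to that score.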
3072 static bool isProfitableChain(IVChain &Chain,
3073 SmallPtrSetImpl<Instruction *> &Users,
3074 ScalarEvolution &SE,
3075 const TargetTransformInfo &TTI) {
3076 if (StressIVChain)
3077 return true;
3078
3079 if (!Chain.hasIncs())
3080 return false;
3081
3082 if (!Users.empty()) {
3083 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3084 for (Instruction *Inst
3085 : Users) { dbgs() << " " << *Inst << "\n"; });
3086 return false;
3087 }
3088 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3089
3090 // The chain itself may require a register, so initialize cost to 1.
3091 int cost = 1;
3092
3093 // A complete chain likely eliminates the need for keeping the original IV in
3094 // a register. LSR does not currently know how to form a complete chain unless
3095 // the header phi already exists.
3096 if (isa<PHINode>(Chain.tailUserInst())
3097 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3098 --cost;
3099 }
3100 const SCEV *LastIncExpr = nullptr;
3101 unsigned NumConstIncrements = 0;
3102 unsigned NumVarIncrements = 0;
3103 unsigned NumReusedIncrements = 0;
3104
3105 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3106 return true;
3107
3108 for (const IVInc &Inc : Chain) {
3109 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3110 return true;
3111 if (Inc.IncExpr->isZero())
3112 continue;
3113
3114 // Incrementing by zero or some constant is neutral. We assume constants can
3115 // be folded into an addressing mode or an add's immediate operand.
3116 if (isa<SCEVConstant>(Inc.IncExpr)) {
3117 ++NumConstIncrements;
3118 continue;
3119 }
3120
3121 if (Inc.IncExpr == LastIncExpr)
3122 ++NumReusedIncrements;
3123 else
3124 ++NumVarIncrements;
3125
3126 LastIncExpr = Inc.IncExpr;
3127 }
3128 // An IV chain with a single increment is handled by LSR's postinc
3129 // uses. However, a chain with multiple increments requires keeping the IV's
3130 // value live longer than it needs to be if chained.
3131 if (NumConstIncrements > 1)
3132 --cost;
3133
3134 // Materializing increment expressions in the preheader that didn't exist in
3135 // the original code may cost a register. For example, sign-extended array
3136 // indices can produce ridiculous increments like this:
3137 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3138 cost += NumVarIncrements;
3139
3140 // Reusing variable increments likely saves a register to hold the multiple of
3141 // the stride.
3142 cost -= NumReusedIncrements;
3143
3144 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3145 << "\n");
3146
3147 return cost < 0;
3148}
3149
3150/// Add this IV user to an existing chain or make it the head of a new chain.
3151void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3152 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3153 // When IVs are used as types of varying widths, they are generally converted
3154 // to a wider type with some uses remaining narrow under a (free) trunc.
3155 Value *const NextIV = getWideOperand(IVOper);
3156 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3157 const SCEV *const OperExprBase = getExprBase(OperExpr);
3158
3159 // Visit all existing chains. Check whether this IVOper can be computed as
3160 // a profitable loop-invariant increment from the last link in the Chain.
3161 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3162 const SCEV *LastIncExpr = nullptr;
3163 for (; ChainIdx < NChains; ++ChainIdx) {
3164 IVChain &Chain = IVChainVec[ChainIdx];
3165
3166 // Prune the solution space aggressively by checking that both IV operands
3167 // are expressions that operate on the same unscaled SCEVUnknown. This
3168 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3169 // first avoids creating extra SCEV expressions.
3170 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3171 continue;
3172
3173 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3174 if (PrevIV->getType() != NextIV->getType())
3175 continue;
3176
3177 // A phi node terminates a chain.
3178 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3179 continue;
3180
3181 // The increment must be loop-invariant so it can be kept in a register.
3182 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3183 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3184 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3185 continue;
3186
3187 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3188 LastIncExpr = IncExpr;
3189 break;
3190 }
3191 }
3192 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3193 // bother for phi nodes, because they must be last in the chain.
3194 if (ChainIdx == NChains) {
3195 if (isa<PHINode>(UserInst))
3196 return;
3197 if (NChains >= MaxChains && !StressIVChain) {
3198 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3199 return;
3200 }
3201 LastIncExpr = OperExpr;
3202 // IVUsers may have skipped over sign/zero extensions. We don't currently
3203 // attempt to form chains involving extensions unless they can be hoisted
3204 // into this loop's AddRec.
3205 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3206 return;
3207 ++NChains;
3208 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3209 OperExprBase));
3210 ChainUsersVec.resize(NChains);
3211 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3212 << ") IV=" << *LastIncExpr << "\n");
3213 } else {
3214 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3215 << ") IV+" << *LastIncExpr << "\n");
3216 // Add this IV user to the end of the chain.
3217 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3218 }
3219 IVChain &Chain = IVChainVec[ChainIdx];
3220
3221 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3222 // This chain's NearUsers become FarUsers.
3223 if (!LastIncExpr->isZero()) {
3224 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3225 NearUsers.clear();
3226 }
3227
3228 // All other uses of IVOperand become near uses of the chain.
3229 // We currently ignore intermediate values within SCEV expressions, assuming
3230 // they will eventually be used by the current chain, or can be computed
3231 // from one of the chain increments. To be more precise we could
3232 // transitively follow their users and only add leaf IV users to the set.
3233 for (User *U : IVOper->users()) {
3234 Instruction *OtherUse = dyn_cast<Instruction>(U);
3235 if (!OtherUse)
3236 continue;
3237 // Uses in the chain will no longer be uses if the chain is formed.
3238 // Include the head of the chain in this iteration (not Chain.begin()).
3239 IVChain::const_iterator IncIter = Chain.Incs.begin();
3240 IVChain::const_iterator IncEnd = Chain.Incs.end();
3241 for (; IncIter != IncEnd; ++IncIter) {
3242 if (IncIter->UserInst == OtherUse)
3243 break;
3244 }
3245 if (IncIter != IncEnd)
3246 continue;
3247
3248 if (SE.isSCEVable(OtherUse->getType())
3249 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3250 && IU.isIVUserOrOperand(OtherUse)) {
3251 continue;
3252 }
3253 NearUsers.insert(OtherUse);
3254 }
3255
3256 // Since this user is part of the chain, it's no longer considered a use
3257 // of the chain.
3258 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3259}
3260
3261/// Populate the vector of Chains.
3262///
3263/// This decreases ILP at the architecture level. Targets with ample registers,
3264/// multiple memory ports, and no register renaming probably don't want
3265/// this. However, such targets should probably disable LSR altogether.
3266///
3267/// The job of LSR is to make a reasonable choice of induction variables across
3268/// the loop. Subsequent passes can easily "unchain" computation exposing more
3269/// ILP *within the loop* if the target wants it.
3270///
3271/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3272/// will not reorder memory operations, it will recognize the accesses below as
3273/// a chain, but will generate redundant IV increments. Ideally this would be
3274/// corrected later by a smart scheduler:
3275/// = A[i]
3276/// = A[i+x]
3277/// A[i] =
3278/// A[i+x] =
3279///
3280/// TODO: Walk the entire domtree within this loop, not just the path to the
3281/// loop latch. This will discover chains on side paths, but requires
3282/// maintaining multiple copies of the Chains state.
3283void LSRInstance::CollectChains() {
3284 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3285 SmallVector<ChainUsers, 8> ChainUsersVec;
3286
3287 SmallVector<BasicBlock *,8> LatchPath;
3288 BasicBlock *LoopHeader = L->getHeader();
3289 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3290 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3291 LatchPath.push_back(Rung->getBlock());
3292 }
3293 LatchPath.push_back(LoopHeader);
3294
3295 // Walk the instruction stream from the loop header to the loop latch.
3296 for (BasicBlock *BB : reverse(LatchPath)) {
3297 for (Instruction &I : *BB) {
3298 // Skip instructions that weren't seen by IVUsers analysis.
3299 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3300 continue;
3301
3302 // Ignore users that are part of a SCEV expression. This way we only
3303 // consider leaf IV Users. This effectively rediscovers a portion of
3304 // IVUsers analysis but in program order this time.
3305 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3306 continue;
3307
3308 // Remove this instruction from any NearUsers set it may be in.
3309 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3310 ChainIdx < NChains; ++ChainIdx) {
3311 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3312 }
3313 // Search for operands that can be chained.
3314 SmallPtrSet<Instruction*, 4> UniqueOperands;
3315 User::op_iterator IVOpEnd = I.op_end();
3316 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3317 while (IVOpIter != IVOpEnd) {
3318 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3319 if (UniqueOperands.insert(IVOpInst).second)
3320 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3321 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3322 }
3323 } // Continue walking down the instructions.
3324 } // Continue walking down the domtree.
3325 // Visit phi backedges to determine if the chain can generate the IV postinc.
3326 for (PHINode &PN : L->getHeader()->phis()) {
3327 if (!SE.isSCEVable(PN.getType()))
3328 continue;
3329
3330 Instruction *IncV =
3331 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3332 if (IncV)
3333 ChainInstruction(&PN, IncV, ChainUsersVec);
3334 }
3335 // Remove any unprofitable chains.
3336 unsigned ChainIdx = 0;
3337 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3338 UsersIdx < NChains; ++UsersIdx) {
3339 if (!isProfitableChain(IVChainVec[UsersIdx],
3340 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3341 continue;
3342 // Preserve the chain at UsersIdx.
3343 if (ChainIdx != UsersIdx)
3344 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3345 FinalizeChain(IVChainVec[ChainIdx]);
3346 ++ChainIdx;
3347 }
3348 IVChainVec.resize(ChainIdx);
3349}
3350
3351void LSRInstance::FinalizeChain(IVChain &Chain) {
3352 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3353 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3354
3355 for (const IVInc &Inc : Chain) {
3356 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3357 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3358 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3359 IVIncSet.insert(UseI);
3360 }
3361}
3362
3363/// Return true if the IVInc can be folded into an addressing mode.
3364static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3365 Value *Operand, const TargetTransformInfo &TTI) {
3366 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3367 Immediate IncOffset = Immediate::getZero();
3368 if (IncConst) {
3369 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3370 return false;
3371 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3372 } else {
3373 // Look for mul(vscale, constant), to detect a scalable offset.
3374 const APInt *C;
3375 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3376 C->getSignificantBits() > 64)
3377 return false;
3378 IncOffset = Immediate::getScalable(C->getSExtValue());
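    // For instance (illustrative only): an increment expression that SCEV
    // prints as (8 * vscale) matches the pattern above with *C == 8, so
    // IncOffset becomes a scalable immediate of 8 x vscale.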
3379 }
3380
3381 if (!isAddressUse(TTI, UserInst, Operand))
3382 return false;
3383
3384 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3385 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3386 IncOffset, /*HasBaseReg=*/false))
3387 return false;
3388
3389 return true;
3390}
3391
3392/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3393/// user's operand from the previous IV user's operand.
3394void LSRInstance::GenerateIVChain(const IVChain &Chain,
3395 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3396 // Find the new IVOperand for the head of the chain. It may have been replaced
3397 // by LSR.
3398 const IVInc &Head = Chain.Incs[0];
3399 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3400 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3401 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3402 IVOpEnd, L, SE);
3403 Value *IVSrc = nullptr;
3404 while (IVOpIter != IVOpEnd) {
3405 IVSrc = getWideOperand(*IVOpIter);
3406
3407 // If this operand computes the expression that the chain needs, we may use
3408 // it. (Check this after setting IVSrc which is used below.)
3409 //
3410 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3411 // narrow for the chain, so we can no longer use it. We do allow using a
3412 // wider phi, assuming LSR checked for free truncation. In that case we
3413 // should already have a truncate on this operand such that
3414 // getSCEV(IVSrc) == IncExpr.
3415 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3416 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3417 break;
3418 }
3419 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3420 }
3421 if (IVOpIter == IVOpEnd) {
3422 // Gracefully give up on this chain.
3423 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3424 return;
3425 }
3426 assert(IVSrc && "Failed to find IV chain source");
3427
3428 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3429 Type *IVTy = IVSrc->getType();
3430 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3431 const SCEV *LeftOverExpr = nullptr;
3432 const SCEV *Accum = SE.getZero(IntTy);
3433 SmallVector<std::pair<const SCEV *, Value *>> Bases;
3434 Bases.emplace_back(Accum, IVSrc);
3435
3436 for (const IVInc &Inc : Chain) {
3437 Instruction *InsertPt = Inc.UserInst;
3438 if (isa<PHINode>(InsertPt))
3439 InsertPt = L->getLoopLatch()->getTerminator();
3440
3441 // IVOper will replace the current IV User's operand. IVSrc is the IV
3442 // value currently held in a register.
3443 Value *IVOper = IVSrc;
3444 if (!Inc.IncExpr->isZero()) {
3445 // IncExpr was the result of subtraction of two narrow values, so must
3446 // be signed.
3447 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3448 Accum = SE.getAddExpr(Accum, IncExpr);
3449 LeftOverExpr = LeftOverExpr ?
3450 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3451 }
3452
3453 // Look through each base to see if any can produce a nice addressing mode.
3454 bool FoundBase = false;
3455 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3456 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3457 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3458 if (!Remainder->isZero()) {
3459 Rewriter.clearPostInc();
3460 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3461 const SCEV *IVOperExpr =
3462 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3463 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3464 } else {
3465 IVOper = MapIVOper;
3466 }
3467
3468 FoundBase = true;
3469 break;
3470 }
3471 }
3472 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3473 // Expand the IV increment.
3474 Rewriter.clearPostInc();
3475 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3476 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3477 SE.getUnknown(IncV));
3478 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3479
3480 // If an IV increment can't be folded, use it as the next IV value.
3481 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3482 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3483 Bases.emplace_back(Accum, IVOper);
3484 IVSrc = IVOper;
3485 LeftOverExpr = nullptr;
3486 }
3487 }
3488 Type *OperTy = Inc.IVOperand->getType();
3489 if (IVTy != OperTy) {
3490 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3491 "cannot extend a chained IV");
3492 IRBuilder<> Builder(InsertPt);
3493 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3494 }
3495 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3496 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3497 DeadInsts.emplace_back(OperandIsInstr);
3498 }
3499 // If LSR created a new, wider phi, we may also replace its postinc. We only
3500 // do this if we also found a wide value for the head of the chain.
3501 if (isa<PHINode>(Chain.tailUserInst())) {
3502 for (PHINode &Phi : L->getHeader()->phis()) {
3503 if (Phi.getType() != IVSrc->getType())
3504 continue;
3505 Instruction *PostIncV = dyn_cast<Instruction>(
3506 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3507 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3508 continue;
3509 Value *IVOper = IVSrc;
3510 Type *PostIncTy = PostIncV->getType();
3511 if (IVTy != PostIncTy) {
3512 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3513 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3514 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3515 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3516 }
3517 Phi.replaceUsesOfWith(PostIncV, IVOper);
3518 DeadInsts.emplace_back(PostIncV);
3519 }
3520 }
3521}
3522
3523void LSRInstance::CollectFixupsAndInitialFormulae() {
3524 BranchInst *ExitBranch = nullptr;
3525 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3526
3527 // For calculating baseline cost
3528 SmallPtrSet<const SCEV *, 16> Regs;
3529 DenseSet<const SCEV *> VisitedRegs;
3530 DenseSet<size_t> VisitedLSRUse;
3531
3532 for (const IVStrideUse &U : IU) {
3533 Instruction *UserInst = U.getUser();
3534 // Skip IV users that are part of profitable IV Chains.
3535 User::op_iterator UseI =
3536 find(UserInst->operands(), U.getOperandValToReplace());
3537 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3538 if (IVIncSet.count(UseI)) {
3539 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3540 continue;
3541 }
3542
3543 LSRUse::KindType Kind = LSRUse::Basic;
3544 MemAccessTy AccessTy;
3545 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3546 Kind = LSRUse::Address;
3547 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3548 }
3549
3550 const SCEV *S = IU.getExpr(U);
3551 if (!S)
3552 continue;
3553 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3554
3555 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3556 // (N - i == 0), and this allows (N - i) to be the expression that we work
3557 // with rather than just N or i, so we can consider the register
3558 // requirements for both N and i at the same time. Limiting this code to
3559 // equality icmps is not a problem because all interesting loops use
3560 // equality icmps, thanks to IndVarSimplify.
3561 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3562 // If CI can be saved by some target, e.g. replaced inside a hardware
3563 // loop on PowerPC, there is no need to generate initial formulae for it.
3564 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3565 continue;
3566 if (CI->isEquality()) {
3567 // Swap the operands if needed to put the OperandValToReplace on the
3568 // left, for consistency.
3569 Value *NV = CI->getOperand(1);
3570 if (NV == U.getOperandValToReplace()) {
3571 CI->setOperand(1, CI->getOperand(0));
3572 CI->setOperand(0, NV);
3573 NV = CI->getOperand(1);
3574 Changed = true;
3575 }
3576
3577 // x == y --> x - y == 0
3578 const SCEV *N = SE.getSCEV(NV);
3579 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3580 (!NV->getType()->isPointerTy() ||
3581 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3582 // S is normalized, so normalize N before folding it into S
3583 // to keep the result normalized.
3584 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3585 if (!N)
3586 continue;
3587 Kind = LSRUse::ICmpZero;
3588 S = SE.getMinusSCEV(N, S);
3589 } else if (L->isLoopInvariant(NV) &&
3590 (!isa<Instruction>(NV) ||
3591 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3592 !NV->getType()->isPointerTy()) {
3593 // If we can't generally expand the expression (e.g. it contains
3594 // a divide), but it is already at a loop invariant point before the
3595 // loop, wrap it in an unknown (to prevent the expander from trying
3596 // to re-expand in a potentially unsafe way.) The restriction to
3597 // integer types is required because the unknown hides the base, and
3598 // SCEV can't compute the difference of two unknown pointers.
3599 N = SE.getUnknown(NV);
3600 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3601 if (!N)
3602 continue;
3603 Kind = LSRUse::ICmpZero;
3604 S = SE.getMinusSCEV(N, S);
3606 }
3607
3608 // -1 and the negations of all interesting strides (except the negation
3609 // of -1) are now also interesting.
3610 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3611 if (Factors[i] != -1)
3612 Factors.insert(-(uint64_t)Factors[i]);
3613 Factors.insert(-1);
3614 }
3615 }
3616
3617 // Get or create an LSRUse.
3618 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3619 size_t LUIdx = P.first;
3620 Immediate Offset = P.second;
3621 LSRUse &LU = Uses[LUIdx];
3622
3623 // Record the fixup.
3624 LSRFixup &LF = LU.getNewFixup();
3625 LF.UserInst = UserInst;
3626 LF.OperandValToReplace = U.getOperandValToReplace();
3627 LF.PostIncLoops = TmpPostIncLoops;
3628 LF.Offset = Offset;
3629 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3630 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3631
3632 // Create a Formula from the SCEV to compute the baseline cost
3633 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3634 Formula F;
3635 F.initialMatch(S, L, SE);
3636 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3637 HardwareLoopProfitable);
3638 VisitedLSRUse.insert(LUIdx);
3639 }
3640
3641 if (!LU.WidestFixupType ||
3642 SE.getTypeSizeInBits(LU.WidestFixupType) <
3643 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3644 LU.WidestFixupType = LF.OperandValToReplace->getType();
3645
3646 // If this is the first use of this LSRUse, give it a formula.
3647 if (LU.Formulae.empty()) {
3648 InsertInitialFormula(S, LU, LUIdx);
3649 CountRegisters(LU.Formulae.back(), LUIdx);
3650 }
3651 }
3652
3653 LLVM_DEBUG(print_fixups(dbgs()));
3654}
3655
3656/// Insert a formula for the given expression into the given use, separating out
3657/// loop-variant portions from loop-invariant and loop-computable portions.
3658void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3659 size_t LUIdx) {
3660 // Mark uses whose expressions cannot be expanded.
3661 if (!Rewriter.isSafeToExpand(S))
3662 LU.RigidFormula = true;
3663
3664 Formula F;
3665 F.initialMatch(S, L, SE);
3666 bool Inserted = InsertFormula(LU, LUIdx, F);
3667 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3668}
3669
3670/// Insert a simple single-register formula for the given expression into the
3671/// given use.
3672void
3673LSRInstance::InsertSupplementalFormula(const SCEV *S,
3674 LSRUse &LU, size_t LUIdx) {
3675 Formula F;
3676 F.BaseRegs.push_back(S);
3677 F.HasBaseReg = true;
3678 bool Inserted = InsertFormula(LU, LUIdx, F);
3679 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3680}
3681
3682/// Note which registers are used by the given formula, updating RegUses.
3683void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3684 if (F.ScaledReg)
3685 RegUses.countRegister(F.ScaledReg, LUIdx);
3686 for (const SCEV *BaseReg : F.BaseRegs)
3687 RegUses.countRegister(BaseReg, LUIdx);
3688}
3689
3690/// If the given formula has not yet been inserted, add it to the list, and
3691/// return true. Return false otherwise.
3692bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3693 // Do not insert formula that we will not be able to expand.
3694 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3695 "Formula is illegal");
3696
3697 if (!LU.InsertFormula(F, *L))
3698 return false;
3699
3700 CountRegisters(F, LUIdx);
3701 return true;
3702}
3703
3704/// Test whether this fixup will be executed each time the corresponding IV
3705/// increment instruction is executed.
3706bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3707 // If the fixup block dominates the IV increment block then there is no path
3708 // through the loop to the increment that doesn't pass through the fixup.
3709 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3710}
3711
3712/// Check for other uses of loop-invariant values which we're tracking. These
3713/// other uses will pin these values in registers, making them less profitable
3714/// for elimination.
3715/// TODO: This currently misses non-constant addrec step registers.
3716/// TODO: Should this give more weight to users inside the loop?
3717void
3718LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3719 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3720 SmallPtrSet<const SCEV *, 32> Visited;
3721
3722 // Don't collect outside uses if we are favoring postinc - the instructions in
3723 // the loop are more important than the ones outside of it.
3724 if (AMK == TTI::AMK_PostIndexed)
3725 return;
3726
3727 while (!Worklist.empty()) {
3728 const SCEV *S = Worklist.pop_back_val();
3729
3730 // Don't process the same SCEV twice
3731 if (!Visited.insert(S).second)
3732 continue;
3733
3734 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3735 append_range(Worklist, N->operands());
3736 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3737 Worklist.push_back(C->getOperand());
3738 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3739 Worklist.push_back(D->getLHS());
3740 Worklist.push_back(D->getRHS());
3741 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3742 const Value *V = US->getValue();
3743 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3744 // Look for instructions defined outside the loop.
3745 if (L->contains(Inst)) continue;
3746 } else if (isa<Constant>(V))
3747 // Constants can be re-materialized.
3748 continue;
3749 for (const Use &U : V->uses()) {
3750 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3751 // Ignore non-instructions.
3752 if (!UserInst)
3753 continue;
3754 // Don't bother if the instruction is an EHPad.
3755 if (UserInst->isEHPad())
3756 continue;
3757 // Ignore instructions in other functions (as can happen with
3758 // Constants).
3759 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3760 continue;
3761 // Ignore instructions not dominated by the loop.
3762 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3763 UserInst->getParent() :
3764 cast<PHINode>(UserInst)->getIncomingBlock(
3765 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3766 if (!DT.dominates(L->getHeader(), UseBB))
3767 continue;
3768 // Don't bother if the instruction is in a BB which ends in an EHPad.
3769 if (UseBB->getTerminator()->isEHPad())
3770 continue;
3771
3772 // Ignore cases in which the currently-examined value could come from
3773 // a basic block terminated with an EHPad. This checks all incoming
3774 // blocks of the phi node since it is possible that the same incoming
3775 // value comes from multiple basic blocks, only some of which may end
3776 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3777 // pass would try to insert instructions into an EHPad, hitting an
3778 // assertion.
3779 if (isa<PHINode>(UserInst)) {
3780 const auto *PhiNode = cast<PHINode>(UserInst);
3781 bool HasIncompatibleEHPTerminatedBlock = false;
3782 llvm::Value *ExpectedValue = U;
3783 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3784 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3785 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3786 HasIncompatibleEHPTerminatedBlock = true;
3787 break;
3788 }
3789 }
3790 }
3791 if (HasIncompatibleEHPTerminatedBlock) {
3792 continue;
3793 }
3794 }
3795
3796 // Don't bother rewriting PHIs in catchswitch blocks.
3797 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3798 continue;
3799 // Ignore uses which are part of other SCEV expressions, to avoid
3800 // analyzing them multiple times.
3801 if (SE.isSCEVable(UserInst->getType())) {
3802 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3803 // If the user is a no-op, look through to its uses.
3804 if (!isa<SCEVUnknown>(UserS))
3805 continue;
3806 if (UserS == US) {
3807 Worklist.push_back(
3808 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3809 continue;
3810 }
3811 }
3812 // Ignore icmp instructions which are already being analyzed.
3813 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3814 unsigned OtherIdx = !U.getOperandNo();
3815 Value *OtherOp = ICI->getOperand(OtherIdx);
3816 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3817 continue;
3818 }
3819
3820 // Do not consider uses inside lifetime intrinsics. These are not
3821 // actually materialized.
3822 if (UserInst->isLifetimeStartOrEnd())
3823 continue;
3824
3825 std::pair<size_t, Immediate> P =
3826 getUse(S, LSRUse::Basic, MemAccessTy());
3827 size_t LUIdx = P.first;
3828 Immediate Offset = P.second;
3829 LSRUse &LU = Uses[LUIdx];
3830 LSRFixup &LF = LU.getNewFixup();
3831 LF.UserInst = const_cast<Instruction *>(UserInst);
3832 LF.OperandValToReplace = U;
3833 LF.Offset = Offset;
3834 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3835 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3836 if (!LU.WidestFixupType ||
3837 SE.getTypeSizeInBits(LU.WidestFixupType) <
3838 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3839 LU.WidestFixupType = LF.OperandValToReplace->getType();
3840 InsertSupplementalFormula(US, LU, LUIdx);
3841 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3842 break;
3843 }
3844 }
3845 }
3846}
3847
3848/// Split S into subexpressions which can be pulled out into separate
3849/// registers. If C is non-null, multiply each subexpression by C.
3850///
3851/// Return remainder expression after factoring the subexpressions captured by
3852/// Ops. If Ops is complete, return NULL.
3853static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3854 SmallVectorImpl<const SCEV *> &Ops,
3855 const Loop *L,
3856 ScalarEvolution &SE,
3857 unsigned Depth = 0) {
3858 // Arbitrarily cap recursion to protect compile time.
3859 if (Depth >= 3)
3860 return S;
3861
3862 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3863 // Break out add operands.
3864 for (const SCEV *S : Add->operands()) {
3865 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3866 if (Remainder)
3867 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3868 }
3869 return nullptr;
3870 }
3871 const SCEV *Start, *Step;
3872 const SCEVConstant *Op0;
3873 const SCEV *Op1;
3874 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3875 // Split a non-zero base out of an addrec.
3876 if (Start->isZero())
3877 return S;
3878
3879 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3880 // Split the non-zero AddRec unless it is part of a nested recurrence that
3881 // does not pertain to this loop.
3882 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3883 !isa<SCEVAddRecExpr>(Remainder))) {
3884 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3885 Remainder = nullptr;
3886 }
3887 if (Remainder != Start) {
3888 if (!Remainder)
3889 Remainder = SE.getConstant(S->getType(), 0);
3890 return SE.getAddRecExpr(Remainder, Step,
3891 cast<SCEVAddRecExpr>(S)->getLoop(),
3892 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3893 SCEV::FlagAnyWrap);
3894 }
3895 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3896 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3897 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3898 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3899 if (Remainder)
3900 Ops.push_back(SE.getMulExpr(C, Remainder));
3901 return nullptr;
3902 }
3903 return S;
3904}
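// For illustration (a hypothetical input): given S = {(4 + %a),+,%step}<%L>
// with C == nullptr, the non-zero start (4 + %a) is split recursively, so 4
// and %a are appended to Ops, and the function returns the rebuilt recurrence
// {0,+,%step}<%L> as the remainder for the caller to use.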
3905
3906/// Return true if the SCEV represents a value that may end up as a
3907/// post-increment operation.
3908static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3909 LSRUse &LU, const SCEV *S, const Loop *L,
3910 ScalarEvolution &SE) {
3911 if (LU.Kind != LSRUse::Address ||
3912 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3913 return false;
3914 const SCEV *Start;
3915 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3916 return false;
3917 // Check if a post-indexed load/store can be used.
3918 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3919 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3920 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3921 return true;
3922 }
3923 return false;
3924}
3925
3926/// Helper function for LSRInstance::GenerateReassociations.
3927void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3928 const Formula &Base,
3929 unsigned Depth, size_t Idx,
3930 bool IsScaledReg) {
3931 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3932 // Don't generate reassociations for the base register of a value that
3933 // may generate a post-increment operator. The reason is that the
3934 // reassociations cause extra base+register formulae to be created,
3935 // and possibly chosen, but the post-increment is more efficient.
3936 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3937 return;
3938 SmallVector<const SCEV *, 8> AddOps;
3939 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3940 if (Remainder)
3941 AddOps.push_back(Remainder);
3942
3943 if (AddOps.size() == 1)
3944 return;
3945
3946 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3947 JE = AddOps.end();
3948 J != JE; ++J) {
3949 // Loop-variant "unknown" values are uninteresting; we won't be able to
3950 // do anything meaningful with them.
3951 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3952 continue;
3953
3954 // Don't pull a constant into a register if the constant could be folded
3955 // into an immediate field.
3956 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3957 LU.AccessTy, *J, Base.getNumRegs() > 1))
3958 continue;
3959
3960 // Collect all operands except *J.
3961 SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3962 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3963
3964 // Don't leave just a constant behind in a register if the constant could
3965 // be folded into an immediate field.
3966 if (InnerAddOps.size() == 1 &&
3967 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3968 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3969 continue;
3970
3971 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3972 if (InnerSum->isZero())
3973 continue;
3974 Formula F = Base;
3975
3976 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3977 continue;
3978
3979 // Add the remaining pieces of the add back into the new formula.
3980 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3981 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3982 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3983 InnerSumSC->getValue()->getZExtValue())) {
3984 F.UnfoldedOffset =
3985 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3986 InnerSumSC->getValue()->getZExtValue());
3987 if (IsScaledReg) {
3988 F.ScaledReg = nullptr;
3989 F.Scale = 0;
3990 } else
3991 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3992 } else if (IsScaledReg)
3993 F.ScaledReg = InnerSum;
3994 else
3995 F.BaseRegs[Idx] = InnerSum;
3996
3997 // Add J as its own register, or an unfolded immediate.
3998 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3999 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
4000 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4001 SC->getValue()->getZExtValue()))
4002 F.UnfoldedOffset =
4003 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4004 SC->getValue()->getZExtValue());
4005 else
4006 F.BaseRegs.push_back(*J);
4007 // We may have changed the number of registers in base regs; adjust the
4008 // formula accordingly.
4009 F.canonicalize(*L);
4010
4011 if (InsertFormula(LU, LUIdx, F))
4012 // If that formula hadn't been seen before, recurse to find more like
4013 // it.
4014 // Add Log16(AddOps.size()) to Depth (the same as Log2_32(AddOps.size())
4015 // >> 2), because Depth alone is not enough to bound compile time.
4016 // This means that every time AddOps.size() exceeds 16^x we add x to
4017 // Depth.
4018 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4019 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4020 }
4021}
4022
4023/// Split out subexpressions from adds and the bases of addrecs.
4024void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4025 Formula Base, unsigned Depth) {
4026 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4027 // Arbitrarily cap recursion to protect compile time.
4028 if (Depth >= 3)
4029 return;
4030
4031 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4032 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4033
4034 if (Base.Scale == 1)
4035 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4036 /* Idx */ -1, /* IsScaledReg */ true);
4037}
4038
4039/// Generate a formula consisting of all of the loop-dominating registers added
4040/// into a single register.
4041void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4042 Formula Base) {
4043 // This method is only interesting on a plurality of registers.
4044 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4045 (Base.UnfoldedOffset.isNonZero()) <=
4046 1)
4047 return;
4048
4049 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4050 // processing the formula.
4051 Base.unscale();
4052 SmallVector<const SCEV *, 4> Ops;
4053 Formula NewBase = Base;
4054 NewBase.BaseRegs.clear();
4055 Type *CombinedIntegerType = nullptr;
4056 for (const SCEV *BaseReg : Base.BaseRegs) {
4057 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4058 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4059 if (!CombinedIntegerType)
4060 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4061 Ops.push_back(BaseReg);
4062 }
4063 else
4064 NewBase.BaseRegs.push_back(BaseReg);
4065 }
4066
4067 // If no register is relevant, we're done.
4068 if (Ops.size() == 0)
4069 return;
4070
4071 // Utility function for generating the required variants of the combined
4072 // registers.
4073 auto GenerateFormula = [&](const SCEV *Sum) {
4074 Formula F = NewBase;
4075
4076 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4077 // opportunity to fold something. For now, just ignore such cases
4078 // rather than proceed with zero in a register.
4079 if (Sum->isZero())
4080 return;
4081
4082 F.BaseRegs.push_back(Sum);
4083 F.canonicalize(*L);
4084 (void)InsertFormula(LU, LUIdx, F);
4085 };
4086
4087 // If we collected at least two registers, generate a formula combining them.
4088 if (Ops.size() > 1) {
4089 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4090 GenerateFormula(SE.getAddExpr(OpsCopy));
4091 }
4092
4093 // If we have an unfolded offset, generate a formula combining it with the
4094 // registers collected.
4095 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4096 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4097 Ops.push_back(SE.getConstant(CombinedIntegerType,
4098 NewBase.UnfoldedOffset.getFixedValue(), true));
4099 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4100 GenerateFormula(SE.getAddExpr(Ops));
4101 }
4102}
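// For example (hypothetical registers): given a formula with base registers
// {%x,+,1}<%L>, %a and %b, where %a and %b are loop-invariant values that
// properly dominate the header, the loop above collects %a and %b and the
// combined formula keeps {%x,+,1}<%L> plus the single register (%a + %b),
// needing two registers instead of three.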
4103
4104/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4105void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4106 const Formula &Base, size_t Idx,
4107 bool IsScaledReg) {
4108 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4109 GlobalValue *GV = ExtractSymbol(G, SE);
4110 if (G->isZero() || !GV)
4111 return;
4112 Formula F = Base;
4113 F.BaseGV = GV;
4114 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4115 return;
4116 if (IsScaledReg)
4117 F.ScaledReg = G;
4118 else
4119 F.BaseRegs[Idx] = G;
4120 (void)InsertFormula(LU, LUIdx, F);
4121}
4122
4123/// Generate reuse formulae using symbolic offsets.
4124void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4125 Formula Base) {
4126 // We can't add a symbolic offset if the address already contains one.
4127 if (Base.BaseGV) return;
4128
4129 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4130 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4131 if (Base.Scale == 1)
4132 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4133 /* IsScaledReg */ true);
4134}
4135
4136/// Helper function for LSRInstance::GenerateConstantOffsets.
4137void LSRInstance::GenerateConstantOffsetsImpl(
4138 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4139 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4140
4141 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4142 Formula F = Base;
4143 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4144 return;
4145 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4146
4147 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4148 // Add the offset to the base register.
4149 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4150 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4151 // If it cancelled out, drop the base register, otherwise update it.
4152 if (NewG->isZero()) {
4153 if (IsScaledReg) {
4154 F.Scale = 0;
4155 F.ScaledReg = nullptr;
4156 } else
4157 F.deleteBaseReg(F.BaseRegs[Idx]);
4158 F.canonicalize(*L);
4159 } else if (IsScaledReg)
4160 F.ScaledReg = NewG;
4161 else
4162 F.BaseRegs[Idx] = NewG;
4163
4164 (void)InsertFormula(LU, LUIdx, F);
4165 }
4166 };
4167
4168 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4169
4170 // With constant offsets and constant steps, we can generate pre-inc
4171 // accesses by having the offset equal the step. So, for access #0 with a
4172 // step of 8, we generate a G - 8 base which would require the first access
4173 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4174 // for itself and hopefully becomes the base for other accesses. This means
4175 // that a single pre-indexed access can be generated to become the new
4176 // base pointer for each iteration of the loop, resulting in no extra add/sub
4177 // instructions for pointer updating.
4178 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4179 const APInt *StepInt;
4180 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4181 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4182 : StepInt->getZExtValue();
4183
4184 for (Immediate Offset : Worklist) {
4185 if (Offset.isFixed()) {
4186 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4187 GenerateOffset(G, Offset);
4188 }
4189 }
4190 }
4191 }
4192 for (Immediate Offset : Worklist)
4193 GenerateOffset(G, Offset);
4194
4195 Immediate Imm = ExtractImmediate(G, SE);
4196 if (G->isZero() || Imm.isZero() ||
4197 !Base.BaseOffset.isCompatibleImmediate(Imm))
4198 return;
4199 Formula F = Base;
4200 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4201 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4202 return;
4203 if (IsScaledReg) {
4204 F.ScaledReg = G;
4205 } else {
4206 F.BaseRegs[Idx] = G;
4207 // We may generate a non-canonical Formula if G is a recurrent expression
4208 // register related to the current loop while F.ScaledReg is not.
4209 F.canonicalize(*L);
4210 }
4211 (void)InsertFormula(LU, LUIdx, F);
4212}
4213
4214/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4215void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4216 Formula Base) {
4217 // TODO: For now, just add the min and max offset, because it usually isn't
4218 // worthwhile looking at everything in between.
4219 SmallVector<Immediate, 2> Worklist;
4220 Worklist.push_back(LU.MinOffset);
4221 if (LU.MaxOffset != LU.MinOffset)
4222 Worklist.push_back(LU.MaxOffset);
4223
4224 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4225 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4226 if (Base.Scale == 1)
4227 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4228 /* IsScaledReg */ true);
4229}
4230
4231/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4232/// == y -> x*c == y*c.
4233void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4234 Formula Base) {
4235 if (LU.Kind != LSRUse::ICmpZero) return;
4236
4237 // Determine the integer type for the base formula.
4238 Type *IntTy = Base.getType();
4239 if (!IntTy) return;
4240 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4241
4242 // Don't do this if there is more than one offset.
4243 if (LU.MinOffset != LU.MaxOffset) return;
4244
4245 // Check if the transformation is valid. It is illegal to multiply a pointer.
4246 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4247 return;
4248 for (const SCEV *BaseReg : Base.BaseRegs)
4249 if (BaseReg->getType()->isPointerTy())
4250 return;
4251 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4252
4253 // Check each interesting stride.
4254 for (int64_t Factor : Factors) {
4255 // Check that Factor can be represented by IntTy
4256 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4257 continue;
4258 // Check that the multiplication doesn't overflow.
4259 if (Base.BaseOffset.isMin() && Factor == -1)
4260 continue;
4261 // Not supporting scalable immediates.
4262 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4263 continue;
4264 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4265 assert(Factor != 0 && "Zero factor not expected!");
4266 if (NewBaseOffset.getFixedValue() / Factor !=
4267 Base.BaseOffset.getFixedValue())
4268 continue;
4269 // If the offset will be truncated at this use, check that it is in bounds.
4270 if (!IntTy->isPointerTy() &&
4271 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4272 continue;
4273
4274 // Check that multiplying with the use offset doesn't overflow.
4275 Immediate Offset = LU.MinOffset;
4276 if (Offset.isMin() && Factor == -1)
4277 continue;
4278 Offset = Offset.mulUnsigned(Factor);
4279 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4280 continue;
4281 // If the offset will be truncated at this use, check that it is in bounds.
4282 if (!IntTy->isPointerTy() &&
4283 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4284 continue;
4285
4286 Formula F = Base;
4287 F.BaseOffset = NewBaseOffset;
4288
4289 // Check that this scale is legal.
4290 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4291 continue;
4292
4293 // Compensate for the use having MinOffset built into it.
4294 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4295
4296 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4297
4298 // Check that multiplying with each base register doesn't overflow.
4299 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4300 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4301 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4302 goto next;
4303 }
4304
4305 // Check that multiplying with the scaled register doesn't overflow.
4306 if (F.ScaledReg) {
4307 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4308 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4309 continue;
4310 }
4311
4312 // Check that multiplying with the unfolded offset doesn't overflow.
4313 if (F.UnfoldedOffset.isNonZero()) {
4314 if (F.UnfoldedOffset.isMin() && Factor == -1)
4315 continue;
4316 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4317 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4318 Base.UnfoldedOffset.getFixedValue())
4319 continue;
4320 // If the offset will be truncated, check that it is in bounds.
4321 if (!ConstantInt::isValueValidForType(
4322 IntTy, F.UnfoldedOffset.getFixedValue()))
4323 continue;
4324 }
4325
4326 // If we make it here and it's legal, add it.
4327 (void)InsertFormula(LU, LUIdx, F);
4328 next:;
4329 }
4330}
4331
4332/// Generate stride factor reuse formulae by making use of scaled-offset address
4333/// modes, for example.
4334void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4335 // Determine the integer type for the base formula.
4336 Type *IntTy = Base.getType();
4337 if (!IntTy) return;
4338
4339 // If this Formula already has a scaled register, we can't add another one.
4340 // Try to unscale the formula to generate a better scale.
4341 if (Base.Scale != 0 && !Base.unscale())
4342 return;
4343
4344 assert(Base.Scale == 0 && "unscale did not do its job!");
4345
4346 // Check each interesting stride.
4347 for (int64_t Factor : Factors) {
4348 Base.Scale = Factor;
4349 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4350 // Check whether this scale is going to be legal.
4351 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4352 Base)) {
4353 // As a special case, handle out-of-loop Basic users specially.
4354 // TODO: Reconsider this special case.
4355 if (LU.Kind == LSRUse::Basic &&
4356 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4357 LU.AccessTy, Base) &&
4358 LU.AllFixupsOutsideLoop)
4359 LU.Kind = LSRUse::Special;
4360 else
4361 continue;
4362 }
4363 // For an ICmpZero, negating a solitary base register won't lead to
4364 // new solutions.
4365 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4366 Base.BaseOffset.isZero() && !Base.BaseGV)
4367 continue;
4368 // For each addrec base reg, if its loop is current loop, apply the scale.
4369 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4370 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4371 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4372 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4373 if (FactorS->isZero())
4374 continue;
4375 // Divide out the factor, ignoring high bits, since we'll be
4376 // scaling the value back up in the end.
4377 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4378 if (!Quotient->isZero()) {
4379 // TODO: This could be optimized to avoid all the copying.
4380 Formula F = Base;
4381 F.ScaledReg = Quotient;
4382 F.deleteBaseReg(F.BaseRegs[i]);
4383 // The canonical representation of 1*reg is reg, which is already in
4384 // Base. In that case, do not try to insert the formula, it will be
4385 // rejected anyway.
4386 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4387 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4388 continue;
4389 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4390 // a non-canonical Formula whose ScaledReg's loop is not L.
4391 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4392 F.canonicalize(*L);
4393 (void)InsertFormula(LU, LUIdx, F);
4394 }
4395 }
4396 }
4397 }
4398}
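// For illustration (a hypothetical base register): with 4 in Factors and a
// base register {0,+,4}<%L>, getExactSDiv yields the quotient {0,+,1}<%L>, so
// the loop above can insert a formula with ScaledReg = {0,+,1}<%L> and
// Scale = 4, matching a reg*4 scaled addressing mode where the target
// supports one.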
4399
4400/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4401/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4402/// perform the extension/truncate and normalize again, as the normalized form
4403/// can result in folds that are not valid in the post-inc use contexts. The
4404/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4405static const SCEV *
4406getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4407 const SCEV *Expr, Type *ToTy,
4408 ScalarEvolution &SE) {
4409 const SCEV *Result = nullptr;
4410 for (auto &L : Loops) {
4411 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4412 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4413 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4414 if (!New || (Result && New != Result))
4415 return nullptr;
4416 Result = New;
4417 }
4418
4419 assert(Result && "failed to create expression");
4420 return Result;
4421}
4422
4423/// Generate reuse formulae from different IV types.
4424void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4425 // Don't bother truncating symbolic values.
4426 if (Base.BaseGV) return;
4427
4428 // Determine the integer type for the base formula.
4429 Type *DstTy = Base.getType();
4430 if (!DstTy) return;
4431 if (DstTy->isPointerTy())
4432 return;
4433
4434 // It is invalid to extend a pointer type so exit early if ScaledReg or
4435 // any of the BaseRegs are pointers.
4436 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4437 return;
4438 if (any_of(Base.BaseRegs,
4439 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4440 return;
4441
4442 SmallVector<PostIncLoopSet> Loops;
4443 for (auto &LF : LU.Fixups)
4444 Loops.push_back(LF.PostIncLoops);
4445
4446 for (Type *SrcTy : Types) {
4447 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4448 Formula F = Base;
4449
4450 // Sometimes SCEV is able to prove zero during ext transform. It may
4451 // happen if SCEV did not do all possible transforms while creating the
4452 // initial node (maybe due to depth limitations), but it can do them while
4453 // taking ext.
4454 if (F.ScaledReg) {
4455 const SCEV *NewScaledReg =
4456 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4457 if (!NewScaledReg || NewScaledReg->isZero())
4458 continue;
4459 F.ScaledReg = NewScaledReg;
4460 }
4461 bool HasZeroBaseReg = false;
4462 for (const SCEV *&BaseReg : F.BaseRegs) {
4463 const SCEV *NewBaseReg =
4464 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4465 if (!NewBaseReg || NewBaseReg->isZero()) {
4466 HasZeroBaseReg = true;
4467 break;
4468 }
4469 BaseReg = NewBaseReg;
4470 }
4471 if (HasZeroBaseReg)
4472 continue;
4473
4474 // TODO: This assumes we've done basic processing on all uses and
4475 // have an idea what the register usage is.
4476 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4477 continue;
4478
4479 F.canonicalize(*L);
4480 (void)InsertFormula(LU, LUIdx, F);
4481 }
4482 }
4483}
4484
4485namespace {
4486
4487/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4488/// modifications so that the search phase doesn't have to worry about the data
4489/// structures moving underneath it.
4490struct WorkItem {
4491 size_t LUIdx;
4492 Immediate Imm;
4493 const SCEV *OrigReg;
4494
4495 WorkItem(size_t LI, Immediate I, const SCEV *R)
4496 : LUIdx(LI), Imm(I), OrigReg(R) {}
4497
4498 void print(raw_ostream &OS) const;
4499 void dump() const;
4500};
4501
4502} // end anonymous namespace
4503
4504#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4505void WorkItem::print(raw_ostream &OS) const {
4506 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4507 << " , add offset " << Imm;
4508}
4509
4510LLVM_DUMP_METHOD void WorkItem::dump() const {
4511 print(errs()); errs() << '\n';
4512}
4513#endif
4514
4515/// Look for registers which are a constant distance apart and try to form reuse
4516/// opportunities between them.
4517void LSRInstance::GenerateCrossUseConstantOffsets() {
4518 // Group the registers by their value without any added constant offset.
4519 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4520
4521 DenseMap<const SCEV *, ImmMapTy> Map;
4522 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4523 SmallVector<const SCEV *, 8> Sequence;
4524 for (const SCEV *Use : RegUses) {
4525 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4526 Immediate Imm = ExtractImmediate(Reg, SE);
4527 auto Pair = Map.try_emplace(Reg);
4528 if (Pair.second)
4529 Sequence.push_back(Reg);
4530 Pair.first->second.insert(std::make_pair(Imm, Use));
4531 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4532 }
4533
4534 // Now examine each set of registers with the same base value. Build up
4535 // a list of work to do and do the work in a separate step so that we're
4536 // not adding formulae and register counts while we're searching.
4537 SmallVector<WorkItem, 32> WorkItems;
4538 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4539 UniqueItems;
4540 for (const SCEV *Reg : Sequence) {
4541 const ImmMapTy &Imms = Map.find(Reg)->second;
4542
4543 // It's not worthwhile looking for reuse if there's only one offset.
4544 if (Imms.size() == 1)
4545 continue;
4546
4547 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4548 for (const auto &Entry
4549 : Imms) dbgs()
4550 << ' ' << Entry.first;
4551 dbgs() << '\n');
4552
4553 // Examine each offset.
4554 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4555 J != JE; ++J) {
4556 const SCEV *OrigReg = J->second;
4557
4558 Immediate JImm = J->first;
4559 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4560
4561 if (!isa<SCEVConstant>(OrigReg) &&
4562 UsedByIndicesMap[Reg].count() == 1) {
4563 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4564 << '\n');
4565 continue;
4566 }
4567
4568 // Conservatively examine offsets between this orig reg and a few selected
4569 // other orig regs.
4570 Immediate First = Imms.begin()->first;
4571 Immediate Last = std::prev(Imms.end())->first;
4572 if (!First.isCompatibleImmediate(Last)) {
4573 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4574 << "\n");
4575 continue;
4576 }
4577 // Only scalable if both terms are scalable, or if one is scalable and
4578 // the other is 0.
4579 bool Scalable = First.isScalable() || Last.isScalable();
4580 int64_t FI = First.getKnownMinValue();
4581 int64_t LI = Last.getKnownMinValue();
4582 // Compute (First + Last) / 2 without overflow using the fact that
4583 // First + Last = 2 * (First & Last) + (First ^ Last).
4584 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4585 // If the result is negative and FI is odd and LI even (or vice versa),
4586 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4587 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
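      // Worked example (illustrative values only): FI = 5, LI = 9 gives
      // (5 & 9) + ((5 ^ 9) >> 1) = 1 + 6 = 7 = (5 + 9) / 2. For FI = -5,
      // LI = 2, the first step gives 2 + (-7 >> 1) = -2 (rounded toward
      // -inf); since Avg is negative and FI ^ LI is odd, the correction adds
      // 1, giving -1, which is (-5 + 2) / 2 rounded toward zero.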
4588 ImmMapTy::const_iterator OtherImms[] = {
4589 Imms.begin(), std::prev(Imms.end()),
4590 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4591 for (const auto &M : OtherImms) {
4592 if (M == J || M == JE) continue;
4593 if (!JImm.isCompatibleImmediate(M->first))
4594 continue;
4595
4596 // Compute the difference between the two.
4597 Immediate Imm = JImm.subUnsigned(M->first);
4598 for (unsigned LUIdx : UsedByIndices.set_bits())
4599 // Make a memo of this use, offset, and register tuple.
4600 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4601 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4602 }
4603 }
4604 }
4605
4606 Map.clear();
4607 Sequence.clear();
4608 UsedByIndicesMap.clear();
4609 UniqueItems.clear();
4610
4611 // Now iterate through the worklist and add new formulae.
4612 for (const WorkItem &WI : WorkItems) {
4613 size_t LUIdx = WI.LUIdx;
4614 LSRUse &LU = Uses[LUIdx];
4615 Immediate Imm = WI.Imm;
4616 const SCEV *OrigReg = WI.OrigReg;
4617
4618 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4619 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4620 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4621
4622 // TODO: Use a more targeted data structure.
4623 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4624 Formula F = LU.Formulae[L];
4625 // FIXME: The code for the scaled and unscaled registers looks
4626 // very similar but slightly different. Investigate if they
4627 // could be merged. That way, we would not have to unscale the
4628 // Formula.
4629 F.unscale();
4630 // Use the immediate in the scaled register.
4631 if (F.ScaledReg == OrigReg) {
4632 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4633 continue;
4634 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4635 // Don't create 50 + reg(-50).
4636 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4637 if (F.referencesReg(S))
4638 continue;
4639 Formula NewF = F;
4640 NewF.BaseOffset = Offset;
4641 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4642 NewF))
4643 continue;
4644 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4645
4646 // If the new scale is a constant in a register, and adding the constant
4647 // value to the immediate would produce a value closer to zero than the
4648 // immediate itself, then the formula isn't worthwhile.
4649 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4650 // FIXME: Do we need to do something for scalable immediates here?
4651 // A scalable SCEV won't be constant, but we might still have
4652 // something in the offset? Bail out for now to be safe.
4653 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4654 continue;
4655 if (C->getValue()->isNegative() !=
4656 (NewF.BaseOffset.isLessThanZero()) &&
4657 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4658 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4659 continue;
4660 }
4661
4662 // OK, looks good.
4663 NewF.canonicalize(*this->L);
4664 (void)InsertFormula(LU, LUIdx, NewF);
4665 } else {
4666 // Use the immediate in a base register.
4667 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4668 const SCEV *BaseReg = F.BaseRegs[N];
4669 if (BaseReg != OrigReg)
4670 continue;
4671 Formula NewF = F;
4672 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4673 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4674 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4675 continue;
4676 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4677 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4678 LU.Kind, LU.AccessTy, NewF)) {
4679 if (AMK == TTI::AMK_PostIndexed &&
4680 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4681 continue;
4682 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4683 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4684 continue;
4685 NewF = F;
4686 NewF.UnfoldedOffset = NewUnfoldedOffset;
4687 }
4688 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4689
4690 // If the new formula has a constant in a register, and adding the
4691 // constant value to the immediate would produce a value closer to
4692 // zero than the immediate itself, then the formula isn't worthwhile.
4693 for (const SCEV *NewReg : NewF.BaseRegs)
4694 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4695 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4696 goto skip_formula;
4697 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4698 .abs()
4699 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4700 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4701 .countr_zero() >=
 4702 (unsigned)llvm::countr_zero<uint64_t>(
 4703 NewF.BaseOffset.getFixedValue()))
4704 goto skip_formula;
4705 }
4706
4707 // Ok, looks good.
4708 NewF.canonicalize(*this->L);
4709 (void)InsertFormula(LU, LUIdx, NewF);
4710 break;
4711 skip_formula:;
4712 }
4713 }
4714 }
4715 }
4716}
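// Illustrative sketch (not from the upstream source): the kind of reuse the
// routine above enables. If one use references the register {p,+,4} and
// another references {p+8,+,4}, stripping the immediates maps both onto the
// common base {p,+,4} with offsets 0 and 8, so a WorkItem with Imm = 8 is
// queued for the second use and it gains a formula along the lines of
// reg({p,+,4}) + 8. Both uses can then share a single register, with the 8
// folded into the addressing mode where the target allows it.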
4717
4718/// Generate formulae for each use.
4719void
4720LSRInstance::GenerateAllReuseFormulae() {
4721 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4722 // queries are more precise.
4723 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4724 LSRUse &LU = Uses[LUIdx];
4725 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4726 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4727 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4728 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4729 }
4730 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4731 LSRUse &LU = Uses[LUIdx];
4732 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4733 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4734 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4735 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4736 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4737 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4738 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4739 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4740 }
4741 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4742 LSRUse &LU = Uses[LUIdx];
4743 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4744 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4745 }
4746
4747 GenerateCrossUseConstantOffsets();
4748
4749 LLVM_DEBUG(dbgs() << "\n"
4750 "After generating reuse formulae:\n";
4751 print_uses(dbgs()));
4752}
4753
4754/// If there are multiple formulae with the same set of registers used
4755/// by other uses, pick the best one and delete the others.
4756void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4757 DenseSet<const SCEV *> VisitedRegs;
4758 SmallPtrSet<const SCEV *, 16> Regs;
4759 SmallPtrSet<const SCEV *, 16> LoserRegs;
4760#ifndef NDEBUG
4761 bool ChangedFormulae = false;
4762#endif
4763
4764 // Collect the best formula for each unique set of shared registers. This
4765 // is reset for each use.
4766 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4767
4768 BestFormulaeTy BestFormulae;
4769
4770 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4771 LSRUse &LU = Uses[LUIdx];
4772 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4773 dbgs() << '\n');
4774
4775 bool Any = false;
4776 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4777 FIdx != NumForms; ++FIdx) {
4778 Formula &F = LU.Formulae[FIdx];
4779
4780 // Some formulas are instant losers. For example, they may depend on
4781 // nonexistent AddRecs from other loops. These need to be filtered
4782 // immediately, otherwise heuristics could choose them over others leading
4783 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4784 // avoids the need to recompute this information across formulae using the
4785 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4786 // the corresponding bad register from the Regs set.
4787 Cost CostF(L, SE, TTI, AMK);
4788 Regs.clear();
4789 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4790 &LoserRegs);
4791 if (CostF.isLoser()) {
4792 // During initial formula generation, undesirable formulae are generated
4793 // by uses within other loops that have some non-trivial address mode or
4794 // use the postinc form of the IV. LSR needs to provide these formulae
4795 // as the basis of rediscovering the desired formula that uses an AddRec
4796 // corresponding to the existing phi. Once all formulae have been
4797 // generated, these initial losers may be pruned.
4798 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4799 dbgs() << "\n");
4800 }
 4801 else {
 4802 SmallVector<const SCEV *, 4> Key;
4803 for (const SCEV *Reg : F.BaseRegs) {
4804 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4805 Key.push_back(Reg);
4806 }
4807 if (F.ScaledReg &&
4808 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4809 Key.push_back(F.ScaledReg);
4810 // Unstable sort by host order ok, because this is only used for
4811 // uniquifying.
4812 llvm::sort(Key);
4813
4814 std::pair<BestFormulaeTy::const_iterator, bool> P =
4815 BestFormulae.insert(std::make_pair(Key, FIdx));
4816 if (P.second)
4817 continue;
4818
4819 Formula &Best = LU.Formulae[P.first->second];
4820
4821 Cost CostBest(L, SE, TTI, AMK);
4822 Regs.clear();
4823 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4824 HardwareLoopProfitable);
4825 if (CostF.isLess(CostBest))
4826 std::swap(F, Best);
4827 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4828 dbgs() << "\n"
4829 " in favor of formula ";
4830 Best.print(dbgs()); dbgs() << '\n');
4831 }
4832#ifndef NDEBUG
4833 ChangedFormulae = true;
4834#endif
4835 LU.DeleteFormula(F);
4836 --FIdx;
4837 --NumForms;
4838 Any = true;
4839 }
4840
4841 // Now that we've filtered out some formulae, recompute the Regs set.
4842 if (Any)
4843 LU.RecomputeRegs(LUIdx, RegUses);
4844
4845 // Reset this to prepare for the next use.
4846 BestFormulae.clear();
4847 }
4848
4849 LLVM_DEBUG(if (ChangedFormulae) {
4850 dbgs() << "\n"
4851 "After filtering out undesirable candidates:\n";
4852 print_uses(dbgs());
4853 });
4854}
4855
4856/// Estimate the worst-case number of solutions the solver might have to
4857/// consider. It almost never considers this many solutions because it prunes the
4858/// search space, but the pruning isn't always sufficient.
4859size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4860 size_t Power = 1;
4861 for (const LSRUse &LU : Uses) {
4862 size_t FSize = LU.Formulae.size();
4863 if (FSize >= ComplexityLimit) {
4864 Power = ComplexityLimit;
4865 break;
4866 }
4867 Power *= FSize;
4868 if (Power >= ComplexityLimit)
4869 break;
4870 }
4871 return Power;
4872}
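// Illustrative note (not from the upstream source): with three uses holding
// 4, 5 and 6 formulae, the estimate above is 4 * 5 * 6 = 120 candidate
// assignments. The running product is clamped once it reaches
// ComplexityLimit, and a single use with >= ComplexityLimit formulae
// short-circuits the loop immediately.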
4873
4874/// When one formula uses a superset of the registers of another formula, it
4875/// won't help reduce register pressure (though it may not necessarily hurt
4876/// register pressure); remove it to simplify the system.
4877void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4878 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4879 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4880
4881 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4882 "which use a superset of registers used by other "
4883 "formulae.\n");
4884
4885 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4886 LSRUse &LU = Uses[LUIdx];
4887 bool Any = false;
4888 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4889 Formula &F = LU.Formulae[i];
4890 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4891 continue;
4892 // Look for a formula with a constant or GV in a register. If the use
4893 // also has a formula with that same value in an immediate field,
4894 // delete the one that uses a register.
 4895 for (SmallVectorImpl<const SCEV *>::const_iterator
 4896 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4897 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4898 Formula NewF = F;
4899 //FIXME: Formulas should store bitwidth to do wrapping properly.
4900 // See PR41034.
4901 NewF.BaseOffset =
4902 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4903 (uint64_t)C->getValue()->getSExtValue());
4904 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4905 (I - F.BaseRegs.begin()));
4906 if (LU.HasFormulaWithSameRegs(NewF)) {
4907 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4908 dbgs() << '\n');
4909 LU.DeleteFormula(F);
4910 --i;
4911 --e;
4912 Any = true;
4913 break;
4914 }
4915 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4916 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4917 if (!F.BaseGV) {
4918 Formula NewF = F;
4919 NewF.BaseGV = GV;
4920 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4921 (I - F.BaseRegs.begin()));
4922 if (LU.HasFormulaWithSameRegs(NewF)) {
4923 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4924 dbgs() << '\n');
4925 LU.DeleteFormula(F);
4926 --i;
4927 --e;
4928 Any = true;
4929 break;
4930 }
4931 }
4932 }
4933 }
4934 }
4935 if (Any)
4936 LU.RecomputeRegs(LUIdx, RegUses);
4937 }
4938
4939 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4940 }
4941}
4942
4943/// When there are many registers for expressions like A, A+1, A+2, etc.,
4944/// allocate a single register for them.
4945void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4946 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4947 return;
4948
4949 LLVM_DEBUG(
4950 dbgs() << "The search space is too complex.\n"
4951 "Narrowing the search space by assuming that uses separated "
4952 "by a constant offset will use the same registers.\n");
4953
4954 // This is especially useful for unrolled loops.
4955
4956 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4957 LSRUse &LU = Uses[LUIdx];
4958 for (const Formula &F : LU.Formulae) {
4959 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4960 continue;
4961
4962 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4963 if (!LUThatHas)
4964 continue;
4965
4966 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4967 LU.Kind, LU.AccessTy))
4968 continue;
4969
4970 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4971
4972 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4973 LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
4974
4975 // Transfer the fixups of LU to LUThatHas.
4976 for (LSRFixup &Fixup : LU.Fixups) {
4977 Fixup.Offset += F.BaseOffset;
4978 LUThatHas->pushFixup(Fixup);
4979 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4980 }
4981
4982 // Delete formulae from the new use which are no longer legal.
4983 bool Any = false;
4984 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4985 Formula &F = LUThatHas->Formulae[i];
4986 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4987 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4988 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4989 LUThatHas->DeleteFormula(F);
4990 --i;
4991 --e;
4992 Any = true;
4993 }
4994 }
4995
4996 if (Any)
4997 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4998
4999 // Delete the old use.
5000 DeleteUse(LU, LUIdx);
5001 --LUIdx;
5002 --NumUses;
5003 break;
5004 }
5005 }
5006
5007 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5008}
5009
5010/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5011/// we've done more filtering, as it may be able to find more formulae to
5012/// eliminate.
5013void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5014 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5015 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5016
5017 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5018 "undesirable dedicated registers.\n");
5019
5020 FilterOutUndesirableDedicatedRegisters();
5021
5022 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5023 }
5024}
5025
5026/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5027/// pick the best one and delete the others.
5028/// This narrowing heuristic is to keep as many formulae with different
5029/// Scale and ScaledReg pairs as possible while narrowing the search space.
5030/// The benefit is that a formula set with more Scale and ScaledReg
5031/// variations is more likely to yield a better solution than one where
5032/// they are all the same. The winner-reg-picking heuristic tends to keep
5033/// the formulae with the same Scale and ScaledReg and filter out the
5034/// others, and we want to avoid that if possible.
5035void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5036 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5037 return;
5038
5039 LLVM_DEBUG(
5040 dbgs() << "The search space is too complex.\n"
5041 "Narrowing the search space by choosing the best Formula "
5042 "from the Formulae with the same Scale and ScaledReg.\n");
5043
5044 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5045 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5046
5047 BestFormulaeTy BestFormulae;
5048#ifndef NDEBUG
5049 bool ChangedFormulae = false;
5050#endif
5051 DenseSet<const SCEV *> VisitedRegs;
5052 SmallPtrSet<const SCEV *, 16> Regs;
5053
5054 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5055 LSRUse &LU = Uses[LUIdx];
5056 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5057 dbgs() << '\n');
5058
5059 // Return true if Formula FA is better than Formula FB.
5060 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5061 // First we will try to choose the Formula with fewer new registers.
5062 // For a register used by current Formula, the more the register is
5063 // shared among LSRUses, the less we increase the register number
5064 // counter of the formula.
5065 size_t FARegNum = 0;
5066 for (const SCEV *Reg : FA.BaseRegs) {
5067 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5068 FARegNum += (NumUses - UsedByIndices.count() + 1);
5069 }
5070 size_t FBRegNum = 0;
5071 for (const SCEV *Reg : FB.BaseRegs) {
5072 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5073 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5074 }
5075 if (FARegNum != FBRegNum)
5076 return FARegNum < FBRegNum;
5077
5078 // If the new register numbers are the same, choose the Formula with
5079 // less Cost.
5080 Cost CostFA(L, SE, TTI, AMK);
5081 Cost CostFB(L, SE, TTI, AMK);
5082 Regs.clear();
5083 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5084 Regs.clear();
5085 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5086 return CostFA.isLess(CostFB);
5087 };
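 // Illustrative note (not from the upstream source): in the register-count
 // comparison above, sharing makes a register cheap. With NumUses == 4, a
 // base register used by 3 of the uses contributes 4 - 3 + 1 = 2 to the
 // count, while a register private to this formula contributes
 // 4 - 1 + 1 = 4, so formulae built from widely shared registers win the
 // first comparison before cost is consulted.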
5088
5089 bool Any = false;
5090 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5091 ++FIdx) {
5092 Formula &F = LU.Formulae[FIdx];
5093 if (!F.ScaledReg)
5094 continue;
5095 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5096 if (P.second)
5097 continue;
5098
5099 Formula &Best = LU.Formulae[P.first->second];
5100 if (IsBetterThan(F, Best))
5101 std::swap(F, Best);
5102 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5103 dbgs() << "\n"
5104 " in favor of formula ";
5105 Best.print(dbgs()); dbgs() << '\n');
5106#ifndef NDEBUG
5107 ChangedFormulae = true;
5108#endif
5109 LU.DeleteFormula(F);
5110 --FIdx;
5111 --NumForms;
5112 Any = true;
5113 }
5114 if (Any)
5115 LU.RecomputeRegs(LUIdx, RegUses);
5116
5117 // Reset this to prepare for the next use.
5118 BestFormulae.clear();
5119 }
5120
5121 LLVM_DEBUG(if (ChangedFormulae) {
5122 dbgs() << "\n"
5123 "After filtering out undesirable candidates:\n";
5124 print_uses(dbgs());
5125 });
5126}
5127
5128/// If we are over the complexity limit, filter the formulae of any
5129/// post-inc-preferring uses down to those with the lowest register count.
5130void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5131 if (AMK != TTI::AMK_PostIndexed)
5132 return;
5133 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5134 return;
5135
5136 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5137 "Narrowing the search space by choosing the lowest "
5138 "register Formula for PostInc Uses.\n");
5139
5140 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5141 LSRUse &LU = Uses[LUIdx];
5142
5143 if (LU.Kind != LSRUse::Address)
5144 continue;
5145 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5146 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5147 continue;
5148
5149 size_t MinRegs = std::numeric_limits<size_t>::max();
5150 for (const Formula &F : LU.Formulae)
5151 MinRegs = std::min(F.getNumRegs(), MinRegs);
5152
5153 bool Any = false;
5154 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5155 ++FIdx) {
5156 Formula &F = LU.Formulae[FIdx];
5157 if (F.getNumRegs() > MinRegs) {
5158 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5159 dbgs() << "\n");
5160 LU.DeleteFormula(F);
5161 --FIdx;
5162 --NumForms;
5163 Any = true;
5164 }
5165 }
5166 if (Any)
5167 LU.RecomputeRegs(LUIdx, RegUses);
5168
5169 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5170 break;
5171 }
5172
5173 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5174}
5175
5176/// This function deletes formulas with a high expected register count.
5177/// Assuming we don't know the value of each formula (all inefficient
5178/// ones have already been deleted), compute for each register the
5179/// probability of it not being selected.
5180/// For example,
5181/// Use1:
5182/// reg(a) + reg({0,+,1})
5183/// reg(a) + reg({-1,+,1}) + 1
5184/// reg({a,+,1})
5185/// Use2:
5186/// reg(b) + reg({0,+,1})
5187/// reg(b) + reg({-1,+,1}) + 1
5188/// reg({b,+,1})
5189/// Use3:
5190/// reg(c) + reg(b) + reg({0,+,1})
5191/// reg(c) + reg({b,+,1})
5192///
5193/// Probability of not selecting
5194/// Use1 Use2 Use3
5195/// reg(a) (1/3) * 1 * 1
5196/// reg(b) 1 * (1/3) * (1/2)
5197/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5198/// reg({-1,+,1}) (2/3) * (2/3) * 1
5199/// reg({a,+,1}) (2/3) * 1 * 1
5200/// reg({b,+,1}) 1 * (2/3) * (2/3)
5201/// reg(c) 1 * 1 * 0
5202///
5203/// Now count registers number mathematical expectation for each formula:
5204/// Note that for each use we exclude the probability of not selecting for
5205/// that use. For example, for Use1 the expectation for reg(a) would be just
5206/// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
5207/// Use1:
5208/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5209/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5210/// reg({a,+,1}) 1
5211/// Use2:
5212/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5213/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5214/// reg({b,+,1}) 2/3
5215/// Use3:
5216/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5217/// reg(c) + reg({b,+,1}) 1 + 2/3
5218void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5219 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5220 return;
 5221 // Ok, we have too many formulae on our hands to conveniently handle.
5222 // Use a rough heuristic to thin out the list.
5223
 5224 // Set of Regs which will be 100% used in the final solution.
5225 // Used in each formula of a solution (in example above this is reg(c)).
5226 // We can skip them in calculations.
5227 SmallPtrSet<const SCEV *, 4> UniqRegs;
5228 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5229
 5230 // Map each register to the probability of it not being selected.
5231 DenseMap <const SCEV *, float> RegNumMap;
5232 for (const SCEV *Reg : RegUses) {
5233 if (UniqRegs.count(Reg))
5234 continue;
5235 float PNotSel = 1;
5236 for (const LSRUse &LU : Uses) {
5237 if (!LU.Regs.count(Reg))
5238 continue;
5239 float P = LU.getNotSelectedProbability(Reg);
5240 if (P != 0.0)
5241 PNotSel *= P;
5242 else
5243 UniqRegs.insert(Reg);
5244 }
5245 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5246 }
5247
5248 LLVM_DEBUG(
5249 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5250
 5251 // Delete formulas whose expected register count is high.
5252 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5253 LSRUse &LU = Uses[LUIdx];
5254 // If nothing to delete - continue.
5255 if (LU.Formulae.size() < 2)
5256 continue;
 5257 // This is a temporary solution to test performance. Float should be
 5258 // replaced with a rounding-independent type (based on integers) to
 5259 // avoid different results across builds.
5260 float FMinRegNum = LU.Formulae[0].getNumRegs();
5261 float FMinARegNum = LU.Formulae[0].getNumRegs();
5262 size_t MinIdx = 0;
5263 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5264 Formula &F = LU.Formulae[i];
5265 float FRegNum = 0;
5266 float FARegNum = 0;
5267 for (const SCEV *BaseReg : F.BaseRegs) {
5268 if (UniqRegs.count(BaseReg))
5269 continue;
5270 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5271 if (isa<SCEVAddRecExpr>(BaseReg))
5272 FARegNum +=
5273 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5274 }
5275 if (const SCEV *ScaledReg = F.ScaledReg) {
5276 if (!UniqRegs.count(ScaledReg)) {
5277 FRegNum +=
5278 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5279 if (isa<SCEVAddRecExpr>(ScaledReg))
5280 FARegNum +=
5281 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5282 }
5283 }
5284 if (FMinRegNum > FRegNum ||
5285 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5286 FMinRegNum = FRegNum;
5287 FMinARegNum = FARegNum;
5288 MinIdx = i;
5289 }
5290 }
5291 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5292 dbgs() << " with min reg num " << FMinRegNum << '\n');
5293 if (MinIdx != 0)
5294 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5295 while (LU.Formulae.size() != 1) {
5296 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5297 dbgs() << '\n');
5298 LU.Formulae.pop_back();
5299 }
5300 LU.RecomputeRegs(LUIdx, RegUses);
5301 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5302 Formula &F = LU.Formulae[0];
5303 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5304 // When we choose the formula, the regs become unique.
5305 UniqRegs.insert_range(F.BaseRegs);
5306 if (F.ScaledReg)
5307 UniqRegs.insert(F.ScaledReg);
5308 }
5309 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5310}
5311
5312// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5313// whether the addressing offset +C would be legal where the negative offset -C is
5314// not.
5315static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5316 ScalarEvolution &SE, const SCEV *Best,
5317 const SCEV *Reg,
5318 MemAccessTy AccessType) {
5319 if (Best->getType() != Reg->getType() ||
 5320 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
 5321 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5322 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5323 return false;
5324 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5325 if (!Diff)
5326 return false;
5327
5328 return TTI.isLegalAddressingMode(
5329 AccessType.MemTy, /*BaseGV=*/nullptr,
5330 /*BaseOffset=*/Diff->getSExtValue(),
5331 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5332 !TTI.isLegalAddressingMode(
5333 AccessType.MemTy, /*BaseGV=*/nullptr,
5334 /*BaseOffset=*/-Diff->getSExtValue(),
5335 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5336}
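// Illustrative sketch (not from the upstream source), assuming a hypothetical
// target that folds [reg + imm] only for non-negative imm: with
// Best = {p+16,+,4} and Reg = {p,+,4}, computeConstantDifference returns +16;
// isLegalAddressingMode succeeds for +16 and fails for -16, so the helper
// returns true and the winner-reg heuristic below switches to the simpler
// base {p,+,4}.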
5337
5338/// Pick a register which seems likely to be profitable, and then in any use
5339/// which has any reference to that register, delete all formulae which do not
5340/// reference that register.
5341void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5342 // With all other options exhausted, loop until the system is simple
5343 // enough to handle.
5344 SmallPtrSet<const SCEV *, 4> Taken;
5345 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
 5346 // Ok, we have too many formulae on our hands to conveniently handle.
5347 // Use a rough heuristic to thin out the list.
5348 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5349
5350 // Pick the register which is used by the most LSRUses, which is likely
5351 // to be a good reuse register candidate.
5352 const SCEV *Best = nullptr;
5353 unsigned BestNum = 0;
5354 for (const SCEV *Reg : RegUses) {
5355 if (Taken.count(Reg))
5356 continue;
5357 if (!Best) {
5358 Best = Reg;
5359 BestNum = RegUses.getUsedByIndices(Reg).count();
5360 } else {
5361 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5362 if (Count > BestNum) {
5363 Best = Reg;
5364 BestNum = Count;
5365 }
5366
5367 // If the scores are the same, but the Reg is simpler for the target
5368 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5369 // handle +C but not -C), opt for the simpler formula.
5370 if (Count == BestNum) {
5371 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5372 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
 5373 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
 5374 Uses[LUIdx].AccessTy)) {
5375 Best = Reg;
5376 BestNum = Count;
5377 }
5378 }
5379 }
5380 }
5381 assert(Best && "Failed to find best LSRUse candidate");
5382
5383 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5384 << " will yield profitable reuse.\n");
5385 Taken.insert(Best);
5386
5387 // In any use with formulae which references this register, delete formulae
5388 // which don't reference it.
5389 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5390 LSRUse &LU = Uses[LUIdx];
5391 if (!LU.Regs.count(Best)) continue;
5392
5393 bool Any = false;
5394 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5395 Formula &F = LU.Formulae[i];
5396 if (!F.referencesReg(Best)) {
5397 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5398 LU.DeleteFormula(F);
5399 --e;
5400 --i;
5401 Any = true;
5402 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5403 continue;
5404 }
5405 }
5406
5407 if (Any)
5408 LU.RecomputeRegs(LUIdx, RegUses);
5409 }
5410
5411 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5412 }
5413}
5414
5415/// If there are an extraordinary number of formulae to choose from, use some
5416/// rough heuristics to prune down the number of formulae. This keeps the main
5417/// solver from taking an extraordinary amount of time in some worst-case
5418/// scenarios.
5419void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5420 NarrowSearchSpaceByDetectingSupersets();
5421 NarrowSearchSpaceByCollapsingUnrolledCode();
5422 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
 5423 if (FilterSameScaledReg)
 5424 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5425 NarrowSearchSpaceByFilterPostInc();
5426 if (LSRExpNarrow)
5427 NarrowSearchSpaceByDeletingCostlyFormulas();
5428 else
5429 NarrowSearchSpaceByPickingWinnerRegs();
5430}
5431
5432/// This is the recursive solver.
5433void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5434 Cost &SolutionCost,
5435 SmallVectorImpl<const Formula *> &Workspace,
5436 const Cost &CurCost,
5437 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5438 DenseSet<const SCEV *> &VisitedRegs) const {
5439 // Some ideas:
5440 // - prune more:
5441 // - use more aggressive filtering
5442 // - sort the formula so that the most profitable solutions are found first
5443 // - sort the uses too
5444 // - search faster:
 5445 // - don't compute a cost and then compare; compare while computing a cost
 5446 // and bail early.
5447 // - track register sets with SmallBitVector
5448
5449 const LSRUse &LU = Uses[Workspace.size()];
5450
5451 // If this use references any register that's already a part of the
5452 // in-progress solution, consider it a requirement that a formula must
5453 // reference that register in order to be considered. This prunes out
5454 // unprofitable searching.
5455 SmallSetVector<const SCEV *, 4> ReqRegs;
5456 for (const SCEV *S : CurRegs)
5457 if (LU.Regs.count(S))
5458 ReqRegs.insert(S);
5459
5460 SmallPtrSet<const SCEV *, 16> NewRegs;
5461 Cost NewCost(L, SE, TTI, AMK);
5462 for (const Formula &F : LU.Formulae) {
5463 // Ignore formulae which may not be ideal in terms of register reuse of
5464 // ReqRegs. The formula should use all required registers before
5465 // introducing new ones.
 5466 // This can sometimes (notably when trying to favour postinc) lead to
 5467 // sub-optimal decisions. In those cases it is best left to the cost
 5468 // modelling to get right.
5469 if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
5470 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5471 for (const SCEV *Reg : ReqRegs) {
5472 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5473 is_contained(F.BaseRegs, Reg)) {
5474 --NumReqRegsToFind;
5475 if (NumReqRegsToFind == 0)
5476 break;
5477 }
5478 }
5479 if (NumReqRegsToFind != 0) {
5480 // If none of the formulae satisfied the required registers, then we could
5481 // clear ReqRegs and try again. Currently, we simply give up in this case.
5482 continue;
5483 }
5484 }
5485
5486 // Evaluate the cost of the current formula. If it's already worse than
5487 // the current best, prune the search at that point.
5488 NewCost = CurCost;
5489 NewRegs = CurRegs;
5490 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5491 if (NewCost.isLess(SolutionCost)) {
5492 Workspace.push_back(&F);
5493 if (Workspace.size() != Uses.size()) {
5494 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5495 NewRegs, VisitedRegs);
5496 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5497 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5498 } else {
5499 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5500 dbgs() << ".\nRegs:\n";
5501 for (const SCEV *S : NewRegs) dbgs()
5502 << "- " << *S << "\n";
5503 dbgs() << '\n');
5504
5505 SolutionCost = NewCost;
5506 Solution = Workspace;
5507 }
5508 Workspace.pop_back();
5509 }
5510 }
5511}
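// Illustrative note (not from the upstream source): the ReqRegs pruning above
// means that if the partial solution already pays for {p,+,4}, and the next
// use offers the formulae "reg({p,+,4})" and "reg({q,+,4}) + 8", only the
// first is explored (outside the post-indexed address case), on the heuristic
// assumption that ignoring an already-required register rarely leads to a
// cheaper overall solution.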
5512
5513/// Choose one formula from each use. Return the results in the given Solution
5514/// vector.
5515void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
 5516 SmallVector<const Formula *, 8> Workspace;
 5517 Cost SolutionCost(L, SE, TTI, AMK);
5518 SolutionCost.Lose();
5519 Cost CurCost(L, SE, TTI, AMK);
5520 SmallPtrSet<const SCEV *, 16> CurRegs;
5521 DenseSet<const SCEV *> VisitedRegs;
5522 Workspace.reserve(Uses.size());
5523
5524 // SolveRecurse does all the work.
5525 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5526 CurRegs, VisitedRegs);
5527 if (Solution.empty()) {
5528 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5529 return;
5530 }
5531
5532 // Ok, we've now made all our decisions.
5533 LLVM_DEBUG(dbgs() << "\n"
5534 "The chosen solution requires ";
5535 SolutionCost.print(dbgs()); dbgs() << ":\n";
5536 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5537 dbgs() << " ";
5538 Uses[i].print(dbgs());
5539 dbgs() << "\n"
5540 " ";
5541 Solution[i]->print(dbgs());
5542 dbgs() << '\n';
5543 });
5544
5545 assert(Solution.size() == Uses.size() && "Malformed solution!");
5546
 5547 const bool EnableDropUnprofitableSolution = [&] {
 5548 switch (AllowDropSolutionIfLessProfitable) {
5549 case cl::BOU_TRUE:
5550 return true;
5551 case cl::BOU_FALSE:
5552 return false;
5553 case cl::BOU_UNSET:
 5554 return TTI.shouldDropLSRSolutionIfLessProfitable();
 5555 }
5556 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5557 }();
5558
5559 if (BaselineCost.isLess(SolutionCost)) {
5560 if (!EnableDropUnprofitableSolution)
5561 LLVM_DEBUG(
5562 dbgs() << "Baseline is more profitable than chosen solution, "
5563 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5564 else {
5565 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5566 "solution, dropping LSR solution.\n";);
5567 Solution.clear();
5568 }
5569 }
5570}
5571
5572/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5573/// as we can go while still being dominated by the input positions. This helps
5574/// canonicalize the insert position, which encourages sharing.
5575BasicBlock::iterator
5576LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5577 const SmallVectorImpl<Instruction *> &Inputs)
5578 const {
5579 Instruction *Tentative = &*IP;
5580 while (true) {
5581 bool AllDominate = true;
5582 Instruction *BetterPos = nullptr;
 5583 // Don't bother attempting to insert before a catchswitch; its basic
 5584 // block cannot have other non-PHI instructions.
5585 if (isa<CatchSwitchInst>(Tentative))
5586 return IP;
5587
5588 for (Instruction *Inst : Inputs) {
5589 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5590 AllDominate = false;
5591 break;
5592 }
5593 // Attempt to find an insert position in the middle of the block,
5594 // instead of at the end, so that it can be used for other expansions.
5595 if (Tentative->getParent() == Inst->getParent() &&
5596 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5597 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5598 }
5599 if (!AllDominate)
5600 break;
5601 if (BetterPos)
5602 IP = BetterPos->getIterator();
5603 else
5604 IP = Tentative->getIterator();
5605
5606 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5607 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5608
5609 BasicBlock *IDom;
5610 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5611 if (!Rung) return IP;
5612 Rung = Rung->getIDom();
5613 if (!Rung) return IP;
5614 IDom = Rung->getBlock();
5615
5616 // Don't climb into a loop though.
5617 const Loop *IDomLoop = LI.getLoopFor(IDom);
5618 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5619 if (IDomDepth <= IPLoopDepth &&
5620 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5621 break;
5622 }
5623
5624 Tentative = IDom->getTerminator();
5625 }
5626
5627 return IP;
5628}
5629
5630/// Determine an input position which will be dominated by the operands and
5631/// which will dominate the result.
5632BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5633 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5634 // Collect some instructions which must be dominated by the
5635 // expanding replacement. These must be dominated by any operands that
5636 // will be required in the expansion.
5637 SmallVector<Instruction *, 4> Inputs;
5638 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5639 Inputs.push_back(I);
5640 if (LU.Kind == LSRUse::ICmpZero)
5641 if (Instruction *I =
5642 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5643 Inputs.push_back(I);
5644 if (LF.PostIncLoops.count(L)) {
5645 if (LF.isUseFullyOutsideLoop(L))
5646 Inputs.push_back(L->getLoopLatch()->getTerminator());
5647 else
5648 Inputs.push_back(IVIncInsertPos);
5649 }
5650 // The expansion must also be dominated by the increment positions of any
 5651 // loops for which it is using post-inc mode.
5652 for (const Loop *PIL : LF.PostIncLoops) {
5653 if (PIL == L) continue;
5654
5655 // Be dominated by the loop exit.
5656 SmallVector<BasicBlock *, 4> ExitingBlocks;
5657 PIL->getExitingBlocks(ExitingBlocks);
5658 if (!ExitingBlocks.empty()) {
5659 BasicBlock *BB = ExitingBlocks[0];
5660 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5661 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5662 Inputs.push_back(BB->getTerminator());
5663 }
5664 }
5665
5666 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5667 "Insertion point must be a normal instruction");
5668
5669 // Then, climb up the immediate dominator tree as far as we can go while
5670 // still being dominated by the input positions.
5671 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5672
5673 // Don't insert instructions before PHI nodes.
5674 while (isa<PHINode>(IP)) ++IP;
5675
5676 // Ignore landingpad instructions.
5677 while (IP->isEHPad()) ++IP;
5678
5679 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5680 // IP consistent across expansions and allows the previously inserted
5681 // instructions to be reused by subsequent expansion.
5682 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5683 ++IP;
5684
5685 return IP;
5686}
5687
5688/// Emit instructions for the leading candidate expression for this LSRUse (this
5689/// is called "expanding").
5690Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5691 const Formula &F, BasicBlock::iterator IP,
5692 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5693 if (LU.RigidFormula)
5694 return LF.OperandValToReplace;
5695
5696 // Determine an input position which will be dominated by the operands and
5697 // which will dominate the result.
5698 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5699 Rewriter.setInsertPoint(&*IP);
5700
5701 // Inform the Rewriter if we have a post-increment use, so that it can
5702 // perform an advantageous expansion.
5703 Rewriter.setPostInc(LF.PostIncLoops);
5704
5705 // This is the type that the user actually needs.
5706 Type *OpTy = LF.OperandValToReplace->getType();
5707 // This will be the type that we'll initially expand to.
5708 Type *Ty = F.getType();
5709 if (!Ty)
5710 // No type known; just expand directly to the ultimate type.
5711 Ty = OpTy;
5712 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5713 // Expand directly to the ultimate type if it's the right size.
5714 Ty = OpTy;
5715 // This is the type to do integer arithmetic in.
5716 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5717
5718 // Build up a list of operands to add together to form the full base.
 5719 SmallVector<const SCEV *, 8> Ops;
 5720
5721 // Expand the BaseRegs portion.
5722 for (const SCEV *Reg : F.BaseRegs) {
5723 assert(!Reg->isZero() && "Zero allocated in a base register!");
5724
5725 // If we're expanding for a post-inc user, make the post-inc adjustment.
5726 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5727 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5728 }
5729
5730 // Expand the ScaledReg portion.
5731 Value *ICmpScaledV = nullptr;
5732 if (F.Scale != 0) {
5733 const SCEV *ScaledS = F.ScaledReg;
5734
5735 // If we're expanding for a post-inc user, make the post-inc adjustment.
5736 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5737 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5738
5739 if (LU.Kind == LSRUse::ICmpZero) {
5740 // Expand ScaleReg as if it was part of the base regs.
5741 if (F.Scale == 1)
5742 Ops.push_back(
5743 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5744 else {
5745 // An interesting way of "folding" with an icmp is to use a negated
5746 // scale, which we'll implement by inserting it into the other operand
5747 // of the icmp.
5748 assert(F.Scale == -1 &&
5749 "The only scale supported by ICmpZero uses is -1!");
5750 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5751 }
5752 } else {
5753 // Otherwise just expand the scaled register and an explicit scale,
5754 // which is expected to be matched as part of the address.
5755
 5756 // Flush the operand list to suppress SCEVExpander hoisting address
 5757 // modes, unless the addressing mode will not be folded.
5758 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5759 isAMCompletelyFolded(TTI, LU, F)) {
5760 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5761 Ops.clear();
5762 Ops.push_back(SE.getUnknown(FullV));
5763 }
5764 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5765 if (F.Scale != 1)
5766 ScaledS =
5767 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5768 Ops.push_back(ScaledS);
5769 }
5770 }
5771
5772 // Expand the GV portion.
5773 if (F.BaseGV) {
5774 // Flush the operand list to suppress SCEVExpander hoisting.
5775 if (!Ops.empty()) {
5776 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5777 Ops.clear();
5778 Ops.push_back(SE.getUnknown(FullV));
5779 }
5780 Ops.push_back(SE.getUnknown(F.BaseGV));
5781 }
5782
5783 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5784 // unfolded offsets. LSR assumes they both live next to their uses.
5785 if (!Ops.empty()) {
5786 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5787 Ops.clear();
5788 Ops.push_back(SE.getUnknown(FullV));
5789 }
5790
5791 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5792 // out at this point, or should we generate a SCEV adding together mixed
5793 // offsets?
5794 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5795 "Expanding mismatched offsets\n");
5796 // Expand the immediate portion.
5797 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5798 if (Offset.isNonZero()) {
5799 if (LU.Kind == LSRUse::ICmpZero) {
5800 // The other interesting way of "folding" with an ICmpZero is to use a
5801 // negated immediate.
5802 if (!ICmpScaledV) {
5803 // TODO: Avoid implicit trunc?
5804 // See https://github.com/llvm/llvm-project/issues/112510.
5805 ICmpScaledV = ConstantInt::getSigned(
5806 IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
5807 } else {
5808 Ops.push_back(SE.getUnknown(ICmpScaledV));
5809 ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
5810 /*ImplicitTrunc=*/true);
5811 }
5812 } else {
5813 // Just add the immediate values. These again are expected to be matched
5814 // as part of the address.
5815 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5816 }
5817 }
5818
5819 // Expand the unfolded offset portion.
5820 Immediate UnfoldedOffset = F.UnfoldedOffset;
5821 if (UnfoldedOffset.isNonZero()) {
5822 // Just add the immediate values.
5823 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5824 }
5825
5826 // Emit instructions summing all the operands.
5827 const SCEV *FullS = Ops.empty() ?
5828 SE.getConstant(IntTy, 0) :
5829 SE.getAddExpr(Ops);
5830 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5831
5832 // We're done expanding now, so reset the rewriter.
5833 Rewriter.clearPostInc();
5834
5835 // An ICmpZero Formula represents an ICmp which we're handling as a
5836 // comparison against zero. Now that we've expanded an expression for that
5837 // form, update the ICmp's other operand.
5838 if (LU.Kind == LSRUse::ICmpZero) {
5839 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5840 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5841 DeadInsts.emplace_back(OperandIsInstr);
5842 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5843 "a scale at the same time!");
5844 if (F.Scale == -1) {
5845 if (ICmpScaledV->getType() != OpTy) {
 5846 Instruction *Cast = CastInst::Create(
 5847 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5848 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5849 ICmpScaledV = Cast;
5850 }
5851 CI->setOperand(1, ICmpScaledV);
5852 } else {
5853 // A scale of 1 means that the scale has been expanded as part of the
5854 // base regs.
5855 assert((F.Scale == 0 || F.Scale == 1) &&
5856 "ICmp does not support folding a global value and "
5857 "a scale at the same time!");
5858 // TODO: Avoid implicit trunc?
5859 // See https://github.com/llvm/llvm-project/issues/112510.
 5860 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
 5861 -(uint64_t)Offset.getFixedValue(),
5862 /*ImplicitTrunc=*/true);
5863 if (C->getType() != OpTy) {
 5864 C = ConstantFoldCastOperand(
 5865 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5866 CI->getDataLayout());
5867 assert(C && "Cast of ConstantInt should have folded");
5868 }
5869
5870 CI->setOperand(1, C);
5871 }
5872 }
5873
5874 return FullV;
5875}
5876
5877/// Helper for Rewrite. PHI nodes are special because the use of their operands
5878/// effectively happens in their predecessor blocks, so the expression may need
5879/// to be expanded in multiple places.
5880void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5881 const LSRFixup &LF, const Formula &F,
5882 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5883 DenseMap<BasicBlock *, Value *> Inserted;
5884
5885 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5886 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5887 bool needUpdateFixups = false;
5888 BasicBlock *BB = PN->getIncomingBlock(i);
5889
5890 // If this is a critical edge, split the edge so that we do not insert
5891 // the code on all predecessor/successor paths. We do this unless this
5892 // is the canonical backedge for this loop, which complicates post-inc
5893 // users.
5894 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
 5895 !isa<IndirectBrInst>(BB->getTerminator()) &&
 5896 !isa<CatchSwitchInst>(BB->getTerminator())) {
 5897 BasicBlock *Parent = PN->getParent();
5898 Loop *PNLoop = LI.getLoopFor(Parent);
5899 if (!PNLoop || Parent != PNLoop->getHeader()) {
5900 // Split the critical edge.
5901 BasicBlock *NewBB = nullptr;
5902 if (!Parent->isLandingPad()) {
5903 NewBB =
5904 SplitCriticalEdge(BB, Parent,
5905 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5906 .setMergeIdenticalEdges()
5907 .setKeepOneInputPHIs());
5908 } else {
 5909 SmallVector<BasicBlock *, 2> NewBBs;
 5910 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5911 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5912 NewBB = NewBBs[0];
5913 }
5914 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5915 // phi predecessors are identical. The simple thing to do is skip
5916 // splitting in this case rather than complicate the API.
5917 if (NewBB) {
5918 // If PN is outside of the loop and BB is in the loop, we want to
5919 // move the block to be immediately before the PHI block, not
5920 // immediately after BB.
5921 if (L->contains(BB) && !L->contains(PN))
5922 NewBB->moveBefore(PN->getParent());
5923
5924 // Splitting the edge can reduce the number of PHI entries we have.
5925 e = PN->getNumIncomingValues();
5926 BB = NewBB;
5927 i = PN->getBasicBlockIndex(BB);
5928
5929 needUpdateFixups = true;
5930 }
5931 }
5932 }
5933
5934 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5935 Inserted.try_emplace(BB);
5936 if (!Pair.second)
5937 PN->setIncomingValue(i, Pair.first->second);
5938 else {
5939 Value *FullV =
5940 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5941
5942 // If this is reuse-by-noop-cast, insert the noop cast.
5943 Type *OpTy = LF.OperandValToReplace->getType();
5944 if (FullV->getType() != OpTy)
5945 FullV = CastInst::Create(
5946 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5947 LF.OperandValToReplace->getType(), "tmp",
5948 BB->getTerminator()->getIterator());
5949
5950 // If the incoming block for this value is not in the loop, it means the
5951 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5952 // the inserted value.
5953 if (auto *I = dyn_cast<Instruction>(FullV))
5954 if (L->contains(I) && !L->contains(BB))
5955 InsertedNonLCSSAInsts.insert(I);
5956
5957 PN->setIncomingValue(i, FullV);
5958 Pair.first->second = FullV;
5959 }
5960
 5961 // If LSR splits a critical edge and the phi node has other pending
5962 // fixup operands, we need to update those pending fixups. Otherwise
5963 // formulae will not be implemented completely and some instructions
5964 // will not be eliminated.
5965 if (needUpdateFixups) {
5966 for (LSRUse &LU : Uses)
5967 for (LSRFixup &Fixup : LU.Fixups)
5968 // If fixup is supposed to rewrite some operand in the phi
5969 // that was just updated, it may be already moved to
5970 // another phi node. Such fixup requires update.
5971 if (Fixup.UserInst == PN) {
5972 // Check if the operand we try to replace still exists in the
5973 // original phi.
5974 bool foundInOriginalPHI = false;
5975 for (const auto &val : PN->incoming_values())
5976 if (val == Fixup.OperandValToReplace) {
5977 foundInOriginalPHI = true;
5978 break;
5979 }
5980
5981 // If fixup operand found in original PHI - nothing to do.
5982 if (foundInOriginalPHI)
5983 continue;
5984
5985 // Otherwise it might be moved to another PHI and requires update.
5986 // If fixup operand not found in any of the incoming blocks that
5987 // means we have already rewritten it - nothing to do.
5988 for (const auto &Block : PN->blocks())
5989 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5990 ++I) {
5991 PHINode *NewPN = cast<PHINode>(I);
5992 for (const auto &val : NewPN->incoming_values())
5993 if (val == Fixup.OperandValToReplace)
5994 Fixup.UserInst = NewPN;
5995 }
5996 }
5997 }
5998 }
5999}
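// Illustrative note (not from the upstream source): for a PHI such as
//   %p = phi [ %v, %bb1 ], [ %v, %bb2 ]
// where %v is the operand being rewritten, the expansion above is emitted
// once per incoming block (the Inserted map reuses the value when several
// PHI entries share a block), and critical edges are split first so the new
// code does not run on unrelated paths, except along the loop's canonical
// backedge.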
6000
6001/// Emit instructions for the leading candidate expression for this LSRUse (this
6002/// is called "expanding"), and update the UserInst to reference the newly
6003/// expanded value.
6004void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6005 const Formula &F,
6006 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6007 // First, find an insertion point that dominates UserInst. For PHI nodes,
6008 // find the nearest block which dominates all the relevant uses.
6009 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6010 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6011 } else {
6012 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6013
6014 // If this is reuse-by-noop-cast, insert the noop cast.
6015 Type *OpTy = LF.OperandValToReplace->getType();
6016 if (FullV->getType() != OpTy) {
6017 Instruction *Cast =
6018 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6019 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6020 FullV = Cast;
6021 }
6022
6023 // Update the user. ICmpZero is handled specially here (for now) because
6024 // Expand may have updated one of the operands of the icmp already, and
6025 // its new value may happen to be equal to LF.OperandValToReplace, in
6026 // which case doing replaceUsesOfWith leads to replacing both operands
6027 // with the same value. TODO: Reorganize this.
6028 if (LU.Kind == LSRUse::ICmpZero)
6029 LF.UserInst->setOperand(0, FullV);
6030 else
6031 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6032 }
6033
6034 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6035 DeadInsts.emplace_back(OperandIsInstr);
6036}
6037
6038// Determine where to insert the transformed IV increment instruction for this
6039// fixup. By default this is the default insert position, but if this is a
6040// postincrement opportunity then we try to insert it in the same block as the
6041// fixup user instruction, as this is needed for a postincrement instruction to
6042// be generated.
6044 const LSRFixup &Fixup, const LSRUse &LU,
6045 Instruction *IVIncInsertPos,
6046 DominatorTree &DT) {
6047 // Only address uses can be postincremented
6048 if (LU.Kind != LSRUse::Address)
6049 return IVIncInsertPos;
6050
6051 // Don't try to postincrement if it's not legal
6052 Instruction *I = Fixup.UserInst;
6053 Type *Ty = I->getType();
6054 if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
6055 !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
6056 return IVIncInsertPos;
6057
6058 // It's only legal to hoist to the user block if it dominates the default
6059 // insert position.
6060 BasicBlock *HoistBlock = I->getParent();
6061 BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
6062 if (!DT.dominates(I, IVIncBlock))
6063 return IVIncInsertPos;
6064
6065 return HoistBlock->getTerminator();
6066}
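// Illustrative note (not from the upstream source): if the fixup user is a
// load in a block that dominates the default IV-increment position, the
// increment is emitted next to that load instead. On a target with
// post-indexed addressing (for example AArch64), instruction selection can
// then fold the increment into the access, roughly "ldr w0, [x1], #4" rather
// than a separate add in the latch.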
6067
6068/// Rewrite all the fixup locations with new values, following the chosen
6069/// solution.
6070void LSRInstance::ImplementSolution(
6071 const SmallVectorImpl<const Formula *> &Solution) {
6072 // Keep track of instructions we may have made dead, so that
6073 // we can remove them after we are done working.
 6074 SmallVector<WeakTrackingVH, 16> DeadInsts;
 6075
6076 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6077 for (const IVChain &Chain : IVChainVec) {
6078 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6079 Rewriter.setChainedPhi(PN);
6080 }
6081
6082 // Expand the new value definitions and update the users.
6083 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6084 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6085 Instruction *InsertPos =
6086 getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
6087 Rewriter.setIVIncInsertPos(L, InsertPos);
6088 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6089 Changed = true;
6090 }
6091
6092 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6093 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6094
6095 for (const IVChain &Chain : IVChainVec) {
6096 GenerateIVChain(Chain, DeadInsts);
6097 Changed = true;
6098 }
6099
6100 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6101 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6102 ScalarEvolutionIVs.push_back(IV);
6103
6104 // Clean up after ourselves. This must be done before deleting any
6105 // instructions.
6106 Rewriter.clear();
6107
 6108 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
 6109 &TLI, MSSAU);
6110
6111 // In our cost analysis above, we assume that each addrec consumes exactly
6112 // one register, and arrange to have increments inserted just before the
 6113 // latch to maximize the chance this is true. However, if we reused
6114 // existing IVs, we now need to move the increments to match our
 6115 // expectations. Otherwise, our cost modeling results in us having
6116 // chosen a non-optimal result for the actual schedule. (And yes, this
6117 // scheduling decision does impact later codegen.)
6118 for (PHINode &PN : L->getHeader()->phis()) {
6119 BinaryOperator *BO = nullptr;
6120 Value *Start = nullptr, *Step = nullptr;
6121 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6122 continue;
6123
6124 switch (BO->getOpcode()) {
6125 case Instruction::Sub:
6126 if (BO->getOperand(0) != &PN)
6127 // sub is non-commutative - match handling elsewhere in LSR
6128 continue;
6129 break;
6130 case Instruction::Add:
6131 break;
6132 default:
6133 continue;
6134 };
6135
6136 if (!isa<Constant>(Step))
6137 // If not a constant step, might increase register pressure
6138 // (We assume constants have been canonicalized to RHS)
6139 continue;
6140
6141 if (BO->getParent() == IVIncInsertPos->getParent())
6142 // Only bother moving across blocks. Isel can handle block local case.
6143 continue;
6144
6145 // Can we legally schedule inc at the desired point?
6146 if (!llvm::all_of(BO->uses(),
6147 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6148 continue;
6149 BO->moveBefore(IVIncInsertPos->getIterator());
6150 Changed = true;
6151 }
6152
6153
6154}
6155
6156LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6157 DominatorTree &DT, LoopInfo &LI,
6158 const TargetTransformInfo &TTI, AssumptionCache &AC,
6159 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6160 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6161 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6162 ? PreferredAddresingMode
6163 : TTI.getPreferredAddressingMode(L, &SE)),
6164 Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
6165 // If LoopSimplify form is not available, stay out of trouble.
6166 if (!L->isLoopSimplifyForm())
6167 return;
6168
6169 // If there's no interesting work to be done, bail early.
6170 if (IU.empty()) return;
6171
6172 // If there's too much analysis to be done, bail early. We won't be able to
6173 // model the problem anyway.
6174 unsigned NumUsers = 0;
6175 for (const IVStrideUse &U : IU) {
6176 if (++NumUsers > MaxIVUsers) {
6177 (void)U;
6178 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6179 << "\n");
6180 return;
6181 }
6182 // Bail out if we have a PHI on an EHPad that gets a value from a
6183 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6184 // no good place to stick any instructions.
6185 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6186 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6187 if (isa<FuncletPadInst>(FirstNonPHI) ||
6188 isa<CatchSwitchInst>(FirstNonPHI))
6189 for (BasicBlock *PredBB : PN->blocks())
6190 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6191 return;
6192 }
6193 }
6194
6195 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6196 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6197 dbgs() << ":\n");
6198
6199 // Check if we expect this loop to use a hardware loop instruction, which will
6200 // be used when calculating the costs of formulas.
6201 HardwareLoopInfo HWLoopInfo(L);
6202 HardwareLoopProfitable =
6203 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6204
6205 // Configure SCEVExpander already now, so the correct mode is used for
6206 // isSafeToExpand() checks.
6207#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6208 Rewriter.setDebugType(DEBUG_TYPE);
6209#endif
6210 Rewriter.disableCanonicalMode();
6211 Rewriter.enableLSRMode();
6212
6213 // First, perform some low-level loop optimizations.
6214 OptimizeShadowIV();
6215 OptimizeLoopTermCond();
6216
6217 // If loop preparation eliminates all interesting IV users, bail.
6218 if (IU.empty()) return;
6219
6220 // Skip nested loops until we can model them better with formulae.
6221 if (!L->isInnermost()) {
6222 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6223 return;
6224 }
6225
6226 // Start collecting data and preparing for the solver.
6227 // If number of registers is not the major cost, we cannot benefit from the
6228 // current profitable chain optimization which is based on number of
6229 // registers.
6230 // FIXME: add profitable chain optimization for other kinds major cost, for
6231 // example number of instructions.
6232 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6233 CollectChains();
6234 CollectInterestingTypesAndFactors();
6235 CollectFixupsAndInitialFormulae();
6236 CollectLoopInvariantFixupsAndFormulae();
6237
6238 if (Uses.empty())
6239 return;
6240
6241 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6242 print_uses(dbgs()));
6243 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6244 BaselineCost.print(dbgs()); dbgs() << "\n");
6245
6246 // Now use the reuse data to generate a bunch of interesting ways
6247 // to formulate the values needed for the uses.
6248 GenerateAllReuseFormulae();
6249
6250 FilterOutUndesirableDedicatedRegisters();
6251 NarrowSearchSpaceUsingHeuristics();
6252
6253 SmallVector<const Formula *, 8> Solution;
6254 Solve(Solution);
6255
6256 // Release memory that is no longer needed.
6257 Factors.clear();
6258 Types.clear();
6259 RegUses.clear();
6260
6261 if (Solution.empty())
6262 return;
6263
6264#ifndef NDEBUG
6265 // Formulae should be legal.
6266 for (const LSRUse &LU : Uses) {
6267 for (const Formula &F : LU.Formulae)
6268 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6269 F) && "Illegal formula generated!");
6270 };
6271#endif
6272
6273 // Now that we've decided what we want, make it so.
6274 ImplementSolution(Solution);
6275}
6276
6277#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6278void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6279 if (Factors.empty() && Types.empty()) return;
6280
6281 OS << "LSR has identified the following interesting factors and types: ";
6282 ListSeparator LS;
6283
6284 for (int64_t Factor : Factors)
6285 OS << LS << '*' << Factor;
6286
6287 for (Type *Ty : Types)
6288 OS << LS << '(' << *Ty << ')';
6289 OS << '\n';
6290}
6291
6292void LSRInstance::print_fixups(raw_ostream &OS) const {
6293 OS << "LSR is examining the following fixup sites:\n";
6294 for (const LSRUse &LU : Uses)
6295 for (const LSRFixup &LF : LU.Fixups) {
6296 dbgs() << " ";
6297 LF.print(OS);
6298 OS << '\n';
6299 }
6300}
6301
6302void LSRInstance::print_uses(raw_ostream &OS) const {
6303 OS << "LSR is examining the following uses:\n";
6304 for (const LSRUse &LU : Uses) {
6305 dbgs() << " ";
6306 LU.print(OS);
6307 OS << '\n';
6308 for (const Formula &F : LU.Formulae) {
6309 OS << " ";
6310 F.print(OS);
6311 OS << '\n';
6312 }
6313 }
6314}
6315
6316void LSRInstance::print(raw_ostream &OS) const {
6317 print_factors_and_types(OS);
6318 print_fixups(OS);
6319 print_uses(OS);
6320}
6321
6322LLVM_DUMP_METHOD void LSRInstance::dump() const {
6323 print(errs()); errs() << '\n';
6324}
6325#endif
6326
6327namespace {
6328
6329class LoopStrengthReduce : public LoopPass {
6330public:
6331 static char ID; // Pass ID, replacement for typeid
6332
6333 LoopStrengthReduce();
6334
6335private:
6336 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6337 void getAnalysisUsage(AnalysisUsage &AU) const override;
6338};
6339
6340} // end anonymous namespace
6341
6342LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6343 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6344}
6345
6346void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6347 // We split critical edges, so we change the CFG. However, we do update
6348 // many analyses if they are around.
6349 AU.addPreservedID(LoopSimplifyID);
6350
6351 AU.addRequired<LoopInfoWrapperPass>();
6352 AU.addPreserved<LoopInfoWrapperPass>();
6353 AU.addRequiredID(LoopSimplifyID);
6354 AU.addRequired<DominatorTreeWrapperPass>();
6355 AU.addPreserved<DominatorTreeWrapperPass>();
6356 AU.addRequired<ScalarEvolutionWrapperPass>();
6357 AU.addPreserved<ScalarEvolutionWrapperPass>();
6358 AU.addRequired<AssumptionCacheTracker>();
6359 AU.addRequired<TargetLibraryInfoWrapperPass>();
6360 // Requiring LoopSimplify a second time here prevents IVUsers from running
6361 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6362 AU.addRequiredID(LoopSimplifyID);
6363 AU.addRequired<IVUsersWrapperPass>();
6364 AU.addPreserved<IVUsersWrapperPass>();
6365 AU.addRequired<TargetTransformInfoWrapperPass>();
6366 AU.addPreserved<MemorySSAWrapperPass>();
6367}
6368
6369namespace {
6370
6371/// Enables more convenient iteration over a DWARF expression vector.
6372static llvm::iterator_range<llvm::DIExpression::expr_op_iterator>
6373ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6374 llvm::DIExpression::expr_op_iterator Begin =
6375 llvm::DIExpression::expr_op_iterator(Expr.begin());
6376 llvm::DIExpression::expr_op_iterator End =
6377 llvm::DIExpression::expr_op_iterator(Expr.end());
6378 return {Begin, End};
6379}
6380
6381struct SCEVDbgValueBuilder {
6382 SCEVDbgValueBuilder() = default;
6383 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6384
6385 void clone(const SCEVDbgValueBuilder &Base) {
6386 LocationOps = Base.LocationOps;
6387 Expr = Base.Expr;
6388 }
6389
6390 void clear() {
6391 LocationOps.clear();
6392 Expr.clear();
6393 }
6394
6395 /// The DIExpression as we translate the SCEV.
6396 SmallVector<uint64_t, 6> Expr;
6397 /// The location ops of the DIExpression.
6398 SmallVector<Value *, 2> LocationOps;
6399
6400 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6401 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6402
6403 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6404 /// in the set of values referenced by the expression.
6405 void pushLocation(llvm::Value *V) {
6406 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6407 auto *It = llvm::find(LocationOps, V);
6408 unsigned ArgIndex = 0;
6409 if (It != LocationOps.end()) {
6410 ArgIndex = std::distance(LocationOps.begin(), It);
6411 } else {
6412 ArgIndex = LocationOps.size();
6413 LocationOps.push_back(V);
6414 }
6415 Expr.push_back(ArgIndex);
6416 }
6417
6418 void pushValue(const SCEVUnknown *U) {
6419 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6420 pushLocation(V);
6421 }
6422
6423 bool pushConst(const SCEVConstant *C) {
6424 if (C->getAPInt().getSignificantBits() > 64)
6425 return false;
6426 Expr.push_back(llvm::dwarf::DW_OP_consts);
6427 Expr.push_back(C->getAPInt().getSExtValue());
6428 return true;
6429 }
6430
6431 // Iterating the expression as DWARF ops is convenient when updating
6432 // DWARF_OP_LLVM_args.
6433 llvm::iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6434 return ToDwarfOpIter(Expr);
6435 }
6436
6437 /// Several SCEV types are sequences of the same arithmetic operator applied
6438 /// to constants and values that may be extended or truncated.
6439 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6440 uint64_t DwarfOp) {
6441 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6442 "Expected arithmetic SCEV type");
6443 bool Success = true;
6444 unsigned EmitOperator = 0;
6445 for (const auto &Op : CommExpr->operands()) {
6446 Success &= pushSCEV(Op);
6447
6448 if (EmitOperator >= 1)
6449 pushOperator(DwarfOp);
6450 ++EmitOperator;
6451 }
6452 return Success;
6453 }
6454
6455 // TODO: Identify and omit noop casts.
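// For example, a sign extension to i64 appends DW_OP_LLVM_convert, 64,
// DW_ATE_signed after the inner expression has been pushed.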
6456 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6457 const llvm::SCEV *Inner = C->getOperand(0);
6458 const llvm::Type *Type = C->getType();
6459 uint64_t ToWidth = Type->getIntegerBitWidth();
6460 bool Success = pushSCEV(Inner);
6461 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6462 IsSigned ? llvm::dwarf::DW_ATE_signed
6463 : llvm::dwarf::DW_ATE_unsigned};
6464 for (const auto &Op : CastOps)
6465 pushOperator(Op);
6466 return Success;
6467 }
6468
6469 // TODO: MinMax - although these haven't been encountered in the test suite.
6470 bool pushSCEV(const llvm::SCEV *S) {
6471 bool Success = true;
6472 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6473 Success &= pushConst(StartInt);
6474
6475 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6476 if (!U->getValue())
6477 return false;
6478 pushLocation(U->getValue());
6479
6480 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6481 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6482
6483 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6484 Success &= pushSCEV(UDiv->getLHS());
6485 Success &= pushSCEV(UDiv->getRHS());
6486 pushOperator(llvm::dwarf::DW_OP_div);
6487
6488 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6489 // Assert if a new and unknown SCEVCastExpr type is encountered.
6490 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6491 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6492 "Unexpected cast type in SCEV.");
6493 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6494
6495 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6496 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6497
6498 } else if (isa<SCEVAddRecExpr>(S)) {
6499 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6500 // unsupported.
6501 return false;
6502
6503 } else {
6504 return false;
6505 }
6506 return Success;
6507 }
6508
6509 /// Return true if the combination of arithmetic operator and underlying
6510 /// SCEV constant value is an identity function.
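/// For example, adding or subtracting 0 (DW_OP_plus, DW_OP_minus) or
/// multiplying or dividing by 1 (DW_OP_mul, DW_OP_div) leaves the value
/// unchanged.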
6511 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6512 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6513 if (C->getAPInt().getSignificantBits() > 64)
6514 return false;
6515 int64_t I = C->getAPInt().getSExtValue();
6516 switch (Op) {
6517 case llvm::dwarf::DW_OP_plus:
6518 case llvm::dwarf::DW_OP_minus:
6519 return I == 0;
6520 case llvm::dwarf::DW_OP_mul:
6521 case llvm::dwarf::DW_OP_div:
6522 return I == 1;
6523 }
6524 }
6525 return false;
6526 }
6527
6528 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6529 /// builder's expression stack. The stack should already contain an
6530 /// expression for the iteration count, so that it can be multiplied by
6531 /// the stride and added to the start.
6532 /// Components of the expression are omitted if they are an identity function.
6533 /// Chain (non-affine) SCEVs are not supported.
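/// For example, for a location whose SCEV is {%a,+,%b}, with the iteration
/// count already on the expression stack this appends %b, DW_OP_mul, %a,
/// DW_OP_plus, recovering the value as %a + (iteration count * %b).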
6534 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6535 assert(SAR.isAffine() && "Expected affine SCEV");
6536 const SCEV *Start = SAR.getStart();
6537 const SCEV *Stride = SAR.getStepRecurrence(SE);
6538
6539 // Skip pushing arithmetic noops.
6540 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6541 if (!pushSCEV(Stride))
6542 return false;
6543 pushOperator(llvm::dwarf::DW_OP_mul);
6544 }
6545 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6546 if (!pushSCEV(Start))
6547 return false;
6548 pushOperator(llvm::dwarf::DW_OP_plus);
6549 }
6550 return true;
6551 }
6552
6553 /// Create an expression that is an offset from a value (usually the IV).
6554 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6555 pushLocation(OffsetValue);
6556 DIExpression::appendOffset(Expr, Offset);
6557 LLVM_DEBUG(
6558 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6559 << std::to_string(Offset) << "\n");
6560 }
6561
6562 /// Combine a translation of the SCEV and the IV to create an expression that
6563 /// recovers a location's value.
6564 /// returns true if an expression was created.
6565 bool createIterCountExpr(const SCEV *S,
6566 const SCEVDbgValueBuilder &IterationCount,
6567 ScalarEvolution &SE) {
6568 // SCEVs for SSA values are most frequently of the form
6569 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6570 // This is because %a is a PHI node that is not the IV. However, these
6571 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6572 // so it's not expected this point will be reached.
6573 if (!isa<SCEVAddRecExpr>(S))
6574 return false;
6575
6576 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6577 << '\n');
6578
6579 const auto *Rec = cast<SCEVAddRecExpr>(S);
6580 if (!Rec->isAffine())
6581 return false;
6582
6583 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6584 return false;
6585
6586 // Initialise a new builder with the iteration count expression. In
6587 // combination with the value's SCEV this enables recovery.
6588 clone(IterationCount);
6589 if (!SCEVToValueExpr(*Rec, SE))
6590 return false;
6591
6592 return true;
6593 }
6594
6595 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6596 /// builder's expression stack. The stack should already contain an
6597 /// expression for the iteration count, so that it can be multiplied by
6598 /// the stride and added to the start.
6599 /// Components of the expression are omitted if they are an identity function.
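/// For example, for the post-LSR IV with SCEV {%start,+,%step} and the IV
/// location already on the stack, this appends %start, DW_OP_minus, %step,
/// DW_OP_div, yielding the iteration count (IV - start) / step.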
6600 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6601 ScalarEvolution &SE) {
6602 assert(SAR.isAffine() && "Expected affine SCEV");
6603 const SCEV *Start = SAR.getStart();
6604 const SCEV *Stride = SAR.getStepRecurrence(SE);
6605
6606 // Skip pushing arithmetic noops.
6607 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6608 if (!pushSCEV(Start))
6609 return false;
6610 pushOperator(llvm::dwarf::DW_OP_minus);
6611 }
6612 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6613 if (!pushSCEV(Stride))
6614 return false;
6615 pushOperator(llvm::dwarf::DW_OP_div);
6616 }
6617 return true;
6618 }
6619
6620 // Append the current expression and locations to a location list and an
6621 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6622 // the locations already present in the destination list.
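// For example, if DestLocations is [%iv, %a] and this builder's LocationOps
// is [%iv, %n], then %n is appended to DestLocations at index 2 and a
// DW_OP_LLVM_arg 1 in this expression is rewritten as DW_OP_LLVM_arg 2,
// while DW_OP_LLVM_arg 0 (the IV) keeps index 0.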
6623 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6624 SmallVectorImpl<Value *> &DestLocations) {
6625 assert(!DestLocations.empty() &&
6626 "Expected the locations vector to contain the IV");
6627 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6628 // modified to account for the locations already in the destination vector.
6629 // All builders contain the IV as the first location op.
6630 assert(!LocationOps.empty() &&
6631 "Expected the location ops to contain the IV.");
6632 // DestIndexMap[n] contains the index in DestLocations for the nth
6633 // location in this SCEVDbgValueBuilder.
6634 SmallVector<uint64_t, 2> DestIndexMap;
6635 for (const auto &Op : LocationOps) {
6636 auto It = find(DestLocations, Op);
6637 if (It != DestLocations.end()) {
6638 // Location already exists in DestLocations, reuse existing ArgIndex.
6639 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6640 continue;
6641 }
6642 // Location is not in DestLocations, add it.
6643 DestIndexMap.push_back(DestLocations.size());
6644 DestLocations.push_back(Op);
6645 }
6646
6647 for (const auto &Op : expr_ops()) {
6648 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6649 Op.appendToVector(DestExpr);
6650 continue;
6651 }
6652
6653 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6654 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6655 // DestIndexMap[n] contains its new index in DestLocations.
6656 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6657 DestExpr.push_back(NewIndex);
6658 }
6659 }
6660};
6661
6662/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6663/// and DIExpression.
6664struct DVIRecoveryRec {
6665 DVIRecoveryRec(DbgVariableRecord *DVR)
6666 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6667
6668 DbgVariableRecord *DbgRef;
6669 DIExpression *Expr;
6670 bool HadLocationArgList;
6671 SmallVector<WeakVH, 2> LocationOps;
6672 SmallVector<const llvm::SCEV *, 2> SCEVs;
6673 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6674
6675 void clear() {
6676 for (auto &RE : RecoveryExprs)
6677 RE.reset();
6678 RecoveryExprs.clear();
6679 }
6680
6681 ~DVIRecoveryRec() { clear(); }
6682};
6683} // namespace
6684
6685/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6686/// This helps in determining if a DIArglist is necessary or can be omitted from
6687/// the dbg.value.
6688static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6689 auto expr_ops = ToDwarfOpIter(Expr);
6690 unsigned Count = 0;
6691 for (auto Op : expr_ops)
6692 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6693 Count++;
6694 return Count;
6695}
6696
6697/// Overwrites DVI with the location and Ops as the DIExpression. This will
6698/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6699/// because a DIArglist is not created for the first argument of the dbg.value.
6700template <typename T>
6701static void updateDVIWithLocation(T &DbgVal, Value *Location,
6702 SmallVectorImpl<uint64_t> &Ops) {
6703 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6704 "contain any DW_OP_llvm_arg operands.");
6705 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6706 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6707 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6708}
6709
6710/// Overwrite DVI with locations placed into a DIArglist.
6711template <typename T>
6712static void updateDVIWithLocations(T &DbgVal,
6713 SmallVectorImpl<Value *> &Locations,
6714 SmallVectorImpl<uint64_t> &Ops) {
6715 assert(numLLVMArgOps(Ops) != 0 &&
6716 "Expected expression that references DIArglist locations using "
6717 "DW_OP_llvm_arg operands.");
6718 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6719 for (Value *V : Locations)
6720 MetadataLocs.push_back(ValueAsMetadata::get(V));
6721 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6722 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6723 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6724}
6725
6726/// Write the new expression and new location ops for the dbg.value. If possible
6727 /// reduce the size of the dbg.value by omitting DIArglist. This
6728/// can be omitted if:
6729 /// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6730/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6731static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6732 SmallVectorImpl<Value *> &NewLocationOps,
6733 SmallVectorImpl<uint64_t> &NewExpr) {
6734 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6735 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6736 if (NumLLVMArgs == 0) {
6737 // Location assumed to be on the stack.
6738 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6739 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6740 // There is only a single DW_OP_llvm_arg at the start of the expression,
6741 // so it can be omitted along with DIArglist.
6742 assert(NewExpr[1] == 0 &&
6743 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6744 llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6745 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6746 } else {
6747 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6748 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6749 }
6750
6751 // If the DIExpression was previously empty then add the stack terminator.
6752 // Non-empty expressions have only had elements inserted into them and so
6753 // the terminator should already be present e.g. stack_value or fragment.
6754 DIExpression *SalvageExpr = DbgVal->getExpression();
6755 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6756 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6757 DbgVal->setExpression(SalvageExpr);
6758 }
6759}
6760
6761/// Cached location ops may be erased during LSR, in which case a poison is
6762/// required when restoring from the cache. The type of that location is no
6763/// longer available, so just use int8. The poison will be replaced by one or
6764/// more locations later when a SCEVDbgValueBuilder selects alternative
6765/// locations to use for the salvage.
6766static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6767 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6768}
6769
6770/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6771static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6772 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6773 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6774 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6775 assert(DVIRec.Expr && "Expected an expression");
6776 DbgVal->setExpression(DVIRec.Expr);
6777
6778 // Even a single location-op may be inside a DIArgList and referenced with
6779 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6780 if (!DVIRec.HadLocationArgList) {
6781 assert(DVIRec.LocationOps.size() == 1 &&
6782 "Unexpected number of location ops.");
6783 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6784 // this case was not present before, so force the location back to a
6785 // single uncontained Value.
6786 Value *CachedValue =
6787 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6788 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6789 } else {
6790 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6791 for (WeakVH VH : DVIRec.LocationOps) {
6792 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6793 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6794 }
6795 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6796 DbgVal->setRawLocation(
6797 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6798 }
6799 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6800}
6801
6802static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6803 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6804 const SCEV *SCEVInductionVar,
6805 SCEVDbgValueBuilder IterCountExpr) {
6806
6807 if (!DVIRec.DbgRef->isKillLocation())
6808 return false;
6809
6810 // LSR may have caused several changes to the dbg.value in the failed salvage
6811 // attempt. So restore the DIExpression, the location ops and also the
6812 // location ops format, which is always DIArglist for multiple ops, but only
6813 // sometimes for a single op.
6814 restorePreTransformState(DVIRec);
6815
6816 // LocationOpIndexMap[i] will store the post-LSR location index of
6817 // the non-optimised out location at pre-LSR index i.
6818 SmallVector<int64_t, 2> LocationOpIndexMap;
6819 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6820 SmallVector<Value *, 2> NewLocationOps;
6821 NewLocationOps.push_back(LSRInductionVar);
6822
6823 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6824 WeakVH VH = DVIRec.LocationOps[i];
6825 // Place the locations not optimised out in the list first, avoiding
6826 // inserts later. The map is used to update the DIExpression's
6827 // DW_OP_LLVM_arg arguments as the expression is updated.
6828 if (VH && !isa<UndefValue>(VH)) {
6829 NewLocationOps.push_back(VH);
6830 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6831 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6832 << " now at index " << LocationOpIndexMap[i] << "\n");
6833 continue;
6834 }
6835
6836 // It's possible that a value referred to in the SCEV may have been
6837 // optimised out by LSR.
6838 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6839 SE.containsUndefs(DVIRec.SCEVs[i])) {
6840 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6841 << " refers to a location that is now undef or erased. "
6842 "Salvage abandoned.\n");
6843 return false;
6844 }
6845
6846 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6847 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6848
6849 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6850 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6851
6852 // Create an offset-based salvage expression if possible, as it requires
6853 // less DWARF ops than an iteration count-based expression.
6854 if (std::optional<APInt> Offset =
6855 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6856 if (Offset->getSignificantBits() <= 64)
6857 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6858 else
6859 return false;
6860 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6861 SE))
6862 return false;
6863 }
6864
6865 // Merge the DbgValueBuilder generated expressions and the original
6866 // DIExpression, place the result into a new vector.
6867 SmallVector<uint64_t, 3> NewExpr;
6868 if (DVIRec.Expr->getNumElements() == 0) {
6869 assert(DVIRec.RecoveryExprs.size() == 1 &&
6870 "Expected only a single recovery expression for an empty "
6871 "DIExpression.");
6872 assert(DVIRec.RecoveryExprs[0] &&
6873 "Expected a SCEVDbgSalvageBuilder for location 0");
6874 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6875 B->appendToVectors(NewExpr, NewLocationOps);
6876 }
6877 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6878 // Most Ops needn't be updated.
6879 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6880 Op.appendToVector(NewExpr);
6881 continue;
6882 }
6883
6884 uint64_t LocationArgIndex = Op.getArg(0);
6885 SCEVDbgValueBuilder *DbgBuilder =
6886 DVIRec.RecoveryExprs[LocationArgIndex].get();
6887 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6888 // optimise it away. So just translate the argument to the updated
6889 // location index.
6890 if (!DbgBuilder) {
6891 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6892 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6893 "Expected a positive index for the location-op position.");
6894 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6895 continue;
6896 }
6897 // The location has a recovery expression.
6898 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6899 }
6900
6901 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
6902 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
6903 return true;
6904}
6905
6906/// Obtain an expression for the iteration count, then attempt to salvage the
6907/// dbg.value intrinsics.
6908static void DbgRewriteSalvageableDVIs(
6909 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6910 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6911 if (DVIToUpdate.empty())
6912 return;
6913
6914 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6915 assert(SCEVInductionVar &&
6916 "Anticipated a SCEV for the post-LSR induction variable");
6917
6918 if (const SCEVAddRecExpr *IVAddRec =
6919 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6920 if (!IVAddRec->isAffine())
6921 return;
6922
6923 // Prevent translation using excessive resources.
6924 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6925 return;
6926
6927 // The iteration count is required to recover location values.
6928 SCEVDbgValueBuilder IterCountExpr;
6929 IterCountExpr.pushLocation(LSRInductionVar);
6930 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6931 return;
6932
6933 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6934 << '\n');
6935
6936 for (auto &DVIRec : DVIToUpdate) {
6937 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6938 IterCountExpr);
6939 }
6940 }
6941}
6942
6943/// Identify and cache salvageable DVI locations and expressions along with the
6944/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6945 /// caching and salvaging.
6946static void DbgGatherSalvagableDVI(
6947 Loop *L, ScalarEvolution &SE,
6948 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6949 for (const auto &B : L->getBlocks()) {
6950 for (auto &I : *B) {
6951 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6952 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6953 continue;
6954
6955 // Ensure that if any location op is undef the dbg.value is not
6956 // cached.
6957 if (DbgVal.isKillLocation())
6958 continue;
6959
6960 // Check that the location op SCEVs are suitable for translation to
6961 // DIExpression.
6962 const auto &HasTranslatableLocationOps =
6963 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6964 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6965 if (!LocOp)
6966 return false;
6967
6968 if (!SE.isSCEVable(LocOp->getType()))
6969 return false;
6970
6971 const SCEV *S = SE.getSCEV(LocOp);
6972 if (SE.containsUndefs(S))
6973 return false;
6974 }
6975 return true;
6976 };
6977
6978 if (!HasTranslatableLocationOps(DbgVal))
6979 continue;
6980
6981 std::unique_ptr<DVIRecoveryRec> NewRec =
6982 std::make_unique<DVIRecoveryRec>(&DbgVal);
6983 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6984 // it. Pre-allocating a vector will enable quick lookups of the builder
6985 // later during the salvage.
6986 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6987 for (const auto LocOp : DbgVal.location_ops()) {
6988 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6989 NewRec->LocationOps.push_back(LocOp);
6990 NewRec->HadLocationArgList = DbgVal.hasArgList();
6991 }
6992 SalvageableDVISCEVs.push_back(std::move(NewRec));
6993 }
6994 }
6995 }
6996}
6997
6998/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6999 /// any PHI from the loop header is usable, but may have less chance of
7000/// surviving subsequent transforms.
7001static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
7002 const LSRInstance &LSR) {
7003
7004 auto IsSuitableIV = [&](PHINode *P) {
7005 if (!SE.isSCEVable(P->getType()))
7006 return false;
7007 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7008 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7009 return false;
7010 };
7011
7012 // For now, just pick the first IV that was generated and inserted by
7013 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7014 // by subsequent transforms.
7015 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7016 if (!IV)
7017 continue;
7018
7019 // There should only be PHI node IVs.
7020 PHINode *P = cast<PHINode>(&*IV);
7021
7022 if (IsSuitableIV(P))
7023 return P;
7024 }
7025
7026 for (PHINode &P : L.getHeader()->phis()) {
7027 if (IsSuitableIV(&P))
7028 return &P;
7029 }
7030 return nullptr;
7031}
7032
7033static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7034 DominatorTree &DT, LoopInfo &LI,
7035 const TargetTransformInfo &TTI,
7036 AssumptionCache &AC, TargetLibraryInfo &TLI,
7037 MemorySSA *MSSA) {
7038
7039 // Debug preservation - before we start removing anything identify which DVI
7040 // meet the salvageable criteria and store their DIExpression and SCEVs.
7041 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7042 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7043
7044 bool Changed = false;
7045 std::unique_ptr<MemorySSAUpdater> MSSAU;
7046 if (MSSA)
7047 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7048
7049 // Run the main LSR transformation.
7050 const LSRInstance &Reducer =
7051 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7052 Changed |= Reducer.getChanged();
7053
7054 // Remove any extra phis created by processing inner loops.
7055 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7056 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7057 SmallVector<WeakTrackingVH, 16> DeadInsts;
7058 SCEVExpander Rewriter(SE, "lsr", false);
7059#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7060 Rewriter.setDebugType(DEBUG_TYPE);
7061#endif
7062 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7063 Rewriter.clear();
7064 if (numFolded) {
7065 Changed = true;
7066 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7067 MSSAU.get());
7068 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7069 }
7070 }
7071 // LSR may at times remove all uses of an induction variable from a loop.
7072 // The only remaining use is the PHI in the exit block.
7073 // When this is the case, if the exit value of the IV can be calculated using
7074 // SCEV, we can replace the exit block PHI with the final value of the IV and
7075 // skip the updates in each loop iteration.
7076 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7077 SmallVector<WeakTrackingVH, 16> DeadInsts;
7078 SCEVExpander Rewriter(SE, "lsr", true);
7079 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7080 UnusedIndVarInLoop, DeadInsts);
7081 Rewriter.clear();
7082 if (Rewrites) {
7083 Changed = true;
7084 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7085 MSSAU.get());
7086 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7087 }
7088 }
7089
7090 if (SalvageableDVIRecords.empty())
7091 return Changed;
7092
7093 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7094 // expressions composed using the derived iteration count.
7095 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7096 for (const auto &L : LI) {
7097 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7098 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7099 else {
7100 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7101 "could not be identified.\n");
7102 }
7103 }
7104
7105 for (auto &Rec : SalvageableDVIRecords)
7106 Rec->clear();
7107 SalvageableDVIRecords.clear();
7108 return Changed;
7109}
7110
7111bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7112 if (skipLoop(L))
7113 return false;
7114
7115 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7116 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7117 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7118 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7119 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7120 *L->getHeader()->getParent());
7121 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7122 *L->getHeader()->getParent());
7123 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7124 *L->getHeader()->getParent());
7125 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7126 MemorySSA *MSSA = nullptr;
7127 if (MSSAAnalysis)
7128 MSSA = &MSSAAnalysis->getMSSA();
7129 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7130}
7131
7132PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7133 LoopStandardAnalysisResults &AR,
7134 LPMUpdater &) {
7135 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7136 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7137 return PreservedAnalyses::all();
7138
7139 auto PA = getLoopPassPreservedAnalyses();
7140 if (AR.MSSA)
7141 PA.preserve<MemorySSAAnalysis>();
7142 return PA;
7143}
7144
7145char LoopStrengthReduce::ID = 0;
7146
7147INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7148 "Loop Strength Reduction", false, false)
7149INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7150INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7151INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7152INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
7153INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7154INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7155INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7156 "Loop Strength Reduction", false, false)
7157
7158Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1655
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1540
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1747
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1571
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:539
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:397
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
bool isUnconditional() const
Value * getCondition() const
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:321
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
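A brief sketch of how the move/erase primitives above are typically combined when an IV increment is rehomed; Inc, InsertPt, and DeadCopy are placeholder instructions.
Inc->moveBefore(InsertPt->getIterator()); // unlink and reinsert ahead of InsertPt
DeadCopy->eraseFromParent();              // the now-unused duplicate goes away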
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop nest.
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will have (use 0 if you really have no idea).
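For illustration, the PHI-building calls above compose as follows when materializing a fresh induction variable; Ctx, Header, Preheader, Latch, and NextIV are assumed to exist and are not names from this file.
Type *I64 = Type::getInt64Ty(Ctx);
PHINode *IV = PHINode::Create(I64, /*NumReservedValues=*/2, "iv", Header->begin());
IV->addIncoming(ConstantInt::get(I64, 0), Preheader); // initial value on loop entry
IV->addIncoming(NextIV, Latch);                       // incremented value on the backedge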
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application-load time.
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information from scalar evolution analysis to rewrite expressions into canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< const SCEV * > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM Value.
This class represents an analyzed expression in the program.
LLVM_ABI ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCouldNotCompute object.
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the given type.
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a single pointer operand.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and return the result as an APInt if it is a constant, and std::nullopt if it isn't.
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
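A condensed sketch of the SCEV queries this kind of analysis relies on; SE is a ScalarEvolution instance and PN a loop-header PHI, both assumed.
const SCEV *S = SE.getSCEV(PN);
if (const auto *AR = dyn_cast<SCEVAddRecExpr>(S))
  if (AR->isAffine()) // S has the {Start,+,Step}<L> shape that can be strength-reduced
    dbgs() << "step: " << *AR->getStepRecurrence(SE) << "\n";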
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
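The bit-vector operations above are the usual idiom for tracking which registers a candidate formula uses; a self-contained sketch, with NumRegs as a placeholder count.
SmallBitVector UsedRegs(NumRegs);
UsedRegs.set(3);                         // mark register index 3 as used
for (unsigned Idx : UsedRegs.set_bits()) // visit only the set indices
  dbgs() << "uses register " << Idx << "\n";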
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the baseline.
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of the specified type.
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instructions which can add a register with the immediate without having to materialize the immediate into a register.
LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compare.
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
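A hedged sketch of the per-formula legality query described above; AccessTy, Offset, Scale, and AS are placeholders for the address computation being costed, and TTI is an assumed TargetTransformInfo reference.
bool Legal = TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr,
                                       /*BaseOffset=*/Offset,
                                       /*HasBaseReg=*/true,
                                       /*Scale=*/Scale,
                                       /*AddrSpace=*/AS);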
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:235
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Use * op_iterator
Definition User.h:280
op_range operands()
Definition User.h:293
op_iterator op_begin()
Definition User.h:285
void setOperand(unsigned i, Value *Val)
Definition User.h:238
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:233
op_iterator op_end()
Definition User.h:287
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:503
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1106
iterator_range< use_iterator > uses()
Definition Value.h:380
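For reference, the use-list accessors above compose into the usual replace-all-uses pattern once a cheaper equivalent value has been materialized; OldVal and NewVal are illustrative and must share a type.
for (User *U : OldVal->users())
  dbgs() << "will rewrite: " << *U << "\n"; // purely diagnostic
OldVal->replaceAllUsesWith(NewVal);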
A nullable Value handle.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only write forward; it does not support seeking.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
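A small sketch of how these SCEV matchers combine, under the assumption that they follow the same conventions as the IR-level PatternMatch helpers and live in namespace llvm::SCEVPatternMatch; S is some const SCEV *.
using namespace llvm::SCEVPatternMatch;
// True when S is an affine add recurrence whose step is the constant 1.
bool UnitStride = match(S, m_scev_AffineAddRec(m_SCEV(), m_scev_SpecificInt(1)));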
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actually needed.
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing the effect of MI in a DIExpression.
Definition Utils.cpp:1731
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2122
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2198
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function.
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
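The normalize/denormalize pair above is the round-trip that post-increment users go through; a minimal sketch, with S, L, and SE assumed to be in scope.
PostIncLoopSet Loops;
Loops.insert(L);
// Rewrite S as if every use in L saw the pre-increment value...
if (const SCEV *Norm = normalizeForPostIncUse(S, Loops, SE)) {
  // ...then map it back to post-increment form when a post-inc user needs it.
  const SCEV *PostInc = denormalizeForPostIncUse(Norm, Loops, SE);
  (void)PostInc; // round-trips back to S for invertible expressions
}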
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition STLExtras.h:2002
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not trivially dead; such instructions are simply skipped.
Definition Local.cpp:548
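The permissive deletion helper above is the standard cleanup idiom after rewriting induction-variable users; a sketch with OldInc as a placeholder instruction and TLI/MSSAU assumed to be available (MSSAU may be null).
SmallVector<WeakTrackingVH, 16> DeadInsts;
DeadInsts.push_back(OldInc); // may or may not be dead after the rewrite
RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, MSSAU);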
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed, substitute the exit values from the loop into any instructions outside of the loop that use the final values of the current expressions.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1945
@ UnusedIndVarInLoop
Definition LoopUtils.h:551
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to the loop passes 'below' them.
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.