LLVM 23.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
// NOTE(review): throughout this option block, the `static cl::opt<...> Name(`
// declaration head of each option was elided by the extraction (original
// lines 147, 152, 157, 164, 169, 180, 185, 189, 193, 197, 203). Only the
// constructor argument lines survive below — verify the option variable
// names and types against upstream LoopStrengthReduce.cpp.
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
// Option overriding the target's preferred addressing mode (TTI::AMK_*).
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
// Cap on search-space size before LSR starts pruning (defaults to 65535).
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
185
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
189
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
193
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
197
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
// In release builds the stress flag collapses to a compile-time constant.
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
244 void print(raw_ostream &OS) const;
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
// Map from register (SCEV) to the set of LSRUse indices using it.
389 RegUsesTy RegUsesMap;
// NOTE(review): the RegSequence member declaration (original line 390) was
// elided by the extraction. Judging by begin()/end() below and by
// countRegister/clear, it is presumably a SmallVector<const SCEV *, N>
// recording registers in first-seen order for deterministic iteration —
// verify against upstream.
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
// NOTE(review): the iterator/const_iterator typedefs (original lines
// 403-404) were elided here; they presumably alias RegSequence's iterator
// types — verify against upstream.
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing
474/// satisfying a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty. The
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing recurrent expr related with currect loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
// NOTE(review): the BaseRegs member declaration (original line 502) was
// elided by the extraction. From its uses elsewhere in the file (push_back,
// pop_back_val, front, empty) it is presumably a SmallVector of
// const SCEV * — verify against upstream.
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
// Build the initial formula for expression S used in loop L.
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch.
// Splits S into "Good" parts (loop-invariant, dominate the header) and "Bad"
// parts (everything else), recursing through adds, addrecs, and negations.
543static void DoInitialMatch(const SCEV *S, Loop *L,
// NOTE(review): the remaining parameters (original lines 544-545) were
// elided by the extraction. From the recursive calls below they are the
// output vectors Good and Bad plus ScalarEvolution &SE — verify upstream.
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
// Split {Start,+,Step} into Start plus a zero-based {0,+,Step} addrec and
// classify each piece independently.
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
// NOTE(review): the declaration of Ops (original line 576) was elided;
// from the getMulExpr call below it presumably collects Mul's operands
// after the leading -1 — verify upstream.
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
// NOTE(review): the declarations of MyGood/MyBad (original lines 579-580)
// were elided; they are the local classification vectors used below.
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
// Re-apply the negation to each classified piece.
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
// NOTE(review): the declarations of the Good/Bad collection vectors
// (original lines 599-600) were elided by the extraction — verify upstream.
601 DoInitialMatch(S, L, Good, Bad, SE);
// All loop-invariant pieces are folded into one base register; a zero sum
// still counts as (a free) base register, hence HasBaseReg is set anyway.
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
// Likewise all loop-variant pieces fold into a second base register.
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
// Re-establish the canonical-form invariant (see Formula::BaseRegs).
614 canonicalize(*L);
615}
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
// Rule #1: without a scaled register, at most one base register is allowed.
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
// A non-unit scale is always canonical (1*reg is the only degenerate form).
633 if (Scale != 1)
634 return true;
635
// Rule #2: 1*reg with no base registers should have been unscaled to reg.
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
// Rule #3: a ScaledReg that recurs in the current loop is already canonical.
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
643 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
644 // loop, we want to swap the reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
// NOTE(review): the lambda body (original line 646) was elided by the
// extraction; presumably `return containsAddRecDependentOnLoop(S, L);`
// mirroring the find_if in canonicalize() — verify upstream.
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
656void Formula::canonicalize(const Loop &L) {
// Fast path: nothing to do when the invariant already holds.
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
// NOTE(review): the lambda body (original line 681) was elided by the
// extraction; presumably `return containsAddRecDependentOnLoop(S, L);`
// matching the comment above — verify upstream.
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
// Post-condition: the formula now satisfies isCanonical.
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
736void Formula::deleteBaseReg(const SCEV *&S) {
737 if (&S != &BaseRegs.back())
738 std::swap(S, BaseRegs.back());
739 BaseRegs.pop_back();
740}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
761void Formula::print(raw_ostream &OS) const {
762 ListSeparator Plus(" + ");
763 if (BaseGV) {
764 OS << Plus;
765 BaseGV->printAsOperand(OS, /*PrintType=*/false);
766 }
767 if (BaseOffset.isNonZero())
768 OS << Plus << BaseOffset;
769
770 for (const SCEV *BaseReg : BaseRegs)
771 OS << Plus << "reg(" << *BaseReg << ')';
772
773 if (HasBaseReg && BaseRegs.empty())
774 OS << Plus << "**error: HasBaseReg**";
775 else if (!HasBaseReg && !BaseRegs.empty())
776 OS << Plus << "**error: !HasBaseReg**";
777
778 if (Scale != 0) {
779 OS << Plus << Scale << "*reg(";
780 if (ScaledReg)
781 OS << *ScaledReg;
782 else
783 OS << "<unknown>";
784 OS << ')';
785 }
786 if (UnfoldedOffset.isNonZero())
787 OS << Plus << "imm(" << UnfoldedOffset << ')';
788}
789
790LLVM_DUMP_METHOD void Formula::dump() const {
791 print(errs()); errs() << '\n';
792}
793#endif
794
795/// Return true if the given addrec can be sign-extended without changing its
796/// value.
// NOTE(review): the function signature (original line 797, presumably
// `static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution
// &SE) {`) and the IntegerType::get(...) continuation (line 799) were elided
// by the extraction — verify against upstream. The check works by widening
// by one bit and testing whether the sign-extension folds back to an addrec.
798 Type *WideTy =
800 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
801}
802
803/// Return true if the given add can be sign-extended without changing its
804/// value.
805static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
808 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
809}
810
811/// Return true if the given mul can be sign-extended without changing its
812/// value.
813static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
// NOTE(review): the `IntegerType::get(SE.getContext(),` continuation
// (original line 815) was elided by the extraction — verify upstream.
// The width is multiplied by the operand count so the product cannot wrap.
814 Type *WideTy =
816 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
817 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
818}
819
820/// Return an expression for LHS /s RHS, if it can be determined and if the
821/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
822/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
823/// the multiplication may overflow, which is useful when the result will be
824/// used in a context where the most significant bits are ignored.
// NOTE(review): several `dyn_cast`/declaration lines were elided from this
// function by the extraction (original lines 833, 849, 860, 877, 879, 891,
// 900, 908); each elision is flagged inline below — verify upstream.
825static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
826 ScalarEvolution &SE,
827 bool IgnoreSignificantBits = false) {
828 // Handle the trivial case, which works for any SCEV type.
829 if (LHS == RHS)
830 return SE.getConstant(LHS->getType(), 1);
831
832 // Handle a few RHS special cases.
// NOTE(review): elided line 833, presumably
// `const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);`.
834 if (RC) {
835 const APInt &RA = RC->getAPInt();
836 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
837 // some folding.
838 if (RA.isAllOnes()) {
839 if (LHS->getType()->isPointerTy())
840 return nullptr;
841 return SE.getMulExpr(LHS, RC);
842 }
843 // Handle x /s 1 as x.
844 if (RA == 1)
845 return LHS;
846 }
847
848 // Check for a division of a constant by a constant.
// NOTE(review): elided line 849, presumably
// `if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {`.
850 if (!RC)
851 return nullptr;
852 const APInt &LA = C->getAPInt();
853 const APInt &RA = RC->getAPInt();
// Only an exact (zero-remainder) signed division is representable.
854 if (LA.srem(RA) != 0)
855 return nullptr;
856 return SE.getConstant(LA.sdiv(RA));
857 }
858
859 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
// NOTE(review): elided line 860, presumably
// `if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {`.
861 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
862 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
863 IgnoreSignificantBits);
864 if (!Step) return nullptr;
865 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
866 IgnoreSignificantBits);
867 if (!Start) return nullptr;
868 // FlagNW is independent of the start value, step direction, and is
869 // preserved with smaller magnitude steps.
870 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
871 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
872 }
873 return nullptr;
874 }
875
876 // Distribute the sdiv over add operands, if the add doesn't overflow.
// NOTE(review): elided line 877, presumably
// `if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {`, and elided
// line 879, presumably the local `SmallVector<const SCEV *, 4> Ops;`.
878 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
880 for (const SCEV *S : Add->operands()) {
881 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
882 if (!Op) return nullptr;
883 Ops.push_back(Op);
884 }
885 return SE.getAddExpr(Ops);
886 }
887 return nullptr;
888 }
889
890 // Check for a multiply operand that we can pull RHS out of.
// NOTE(review): elided line 891, presumably
// `if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {`.
892 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
893 // Handle special case C1*X*Y /s C2*X*Y.
894 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
895 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
896 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
897 const SCEVConstant *RC =
898 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
899 if (LC && RC) {
// NOTE(review): elided line 900, presumably the LOps declaration mirroring
// ROps below: `SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));`.
901 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
// If the non-constant factors match exactly, divide the leading constants.
902 if (LOps == ROps)
903 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
904 }
905 }
906 }
907
// NOTE(review): elided line 908, presumably the local
// `SmallVector<const SCEV *, 4> Ops;` used to rebuild the product.
// Divide exactly one factor by RHS; the remaining factors pass through.
909 bool Found = false;
910 for (const SCEV *S : Mul->operands()) {
911 if (!Found)
912 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
913 IgnoreSignificantBits)) {
914 S = Q;
915 Found = true;
916 }
917 Ops.push_back(S);
918 }
919 return Found ? SE.getMulExpr(Ops) : nullptr;
920 }
921 return nullptr;
922 }
923
924 // Otherwise we don't know.
925 return nullptr;
926}
927
928/// If S involves the addition of a constant integer value, return that integer
929/// value, and mutate S to point to a new SCEV with that value excluded.
930static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE) {
931 const APInt *C;
// Pure constant that fits in 64 bits: extract it entirely, leaving zero.
932 if (match(S, m_scev_APInt(C))) {
933 if (C->getSignificantBits() <= 64) {
934 S = SE.getConstant(S->getType(), 0);
935 return Immediate::getFixed(C->getSExtValue());
936 }
// Add: the constant, when present, is the first operand (SCEV ordering).
937 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
938 SmallVector<SCEVUse, 8> NewOps(Add->operands());
939 Immediate Result = ExtractImmediate(NewOps.front(), SE);
940 if (Result.isNonZero())
941 S = SE.getAddExpr(NewOps);
942 return Result;
// AddRec: recurse into the start value (the first operand).
943 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
944 SmallVector<SCEVUse, 8> NewOps(AR->operands());
945 Immediate Result = ExtractImmediate(NewOps.front(), SE);
946 if (Result.isNonZero())
947 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
948 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
// NOTE(review): the closing argument line (original line 949, presumably
// `SCEV::FlagAnyWrap);`) was elided by the extraction — verify upstream.
950 return Result;
// C * vscale: extracted as a scalable immediate when the option allows it.
951 } else if (EnableVScaleImmediates &&
// NOTE(review): the match condition (original line 952, presumably a
// `match(S, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale()))`-style pattern)
// was elided by the extraction — verify upstream.
953 S = SE.getConstant(S->getType(), 0);
954 return Immediate::getScalable(C->getSExtValue());
955 }
// Nothing extractable: report zero and leave S untouched.
956 return Immediate::getZero();
957}
958
959/// If S involves the addition of a GlobalValue address, return that symbol, and
960/// mutate S to point to a new SCEV with that value excluded.
// NOTE(review): the function signature (original line 961, presumably
// `static GlobalValue *ExtractSymbol(SCEVUse &S, ScalarEvolution &SE) {`)
// was elided by the extraction — verify upstream.
// Bare global: extract it entirely, leaving a zero constant behind.
962 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
963 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
964 S = SE.getConstant(GV->getType(), 0);
965 return GV;
966 }
// Add: the symbol, when present, is the last operand (SCEV ordering).
967 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
968 SmallVector<SCEVUse, 8> NewOps(Add->operands());
969 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
970 if (Result)
971 S = SE.getAddExpr(NewOps);
972 return Result;
// AddRec: recurse into the start value (the first operand).
973 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
974 SmallVector<SCEVUse, 8> NewOps(AR->operands());
975 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
976 if (Result)
977 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
978 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
// NOTE(review): the closing argument line (original line 979, presumably
// `SCEV::FlagAnyWrap);`) was elided by the extraction — verify upstream.
980 return Result;
981 }
// No symbol found anywhere in S.
982 return nullptr;
983}
984
985/// Returns true if the specified instruction is using the specified value as an
986/// address.
988 Instruction *Inst, Value *OperandVal) {
989 bool isAddress = isa<LoadInst>(Inst);
990 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
991 if (SI->getPointerOperand() == OperandVal)
992 isAddress = true;
993 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
994 // Addressing modes can also be folded into prefetches and a variety
995 // of intrinsics.
996 switch (II->getIntrinsicID()) {
997 case Intrinsic::memset:
998 case Intrinsic::prefetch:
999 case Intrinsic::masked_load:
1000 if (II->getArgOperand(0) == OperandVal)
1001 isAddress = true;
1002 break;
1003 case Intrinsic::masked_store:
1004 if (II->getArgOperand(1) == OperandVal)
1005 isAddress = true;
1006 break;
1007 case Intrinsic::memmove:
1008 case Intrinsic::memcpy:
1009 if (II->getArgOperand(0) == OperandVal ||
1010 II->getArgOperand(1) == OperandVal)
1011 isAddress = true;
1012 break;
1013 default: {
1014 MemIntrinsicInfo IntrInfo;
1015 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1016 if (IntrInfo.PtrVal == OperandVal)
1017 isAddress = true;
1018 }
1019 }
1020 }
1021 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1022 if (RMW->getPointerOperand() == OperandVal)
1023 isAddress = true;
1024 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1025 if (CmpX->getPointerOperand() == OperandVal)
1026 isAddress = true;
1027 }
1028 return isAddress;
1029}
1030
1031/// Return the type of the memory being accessed.
1032static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1033 Instruction *Inst, Value *OperandVal) {
1034 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1035
1036 // First get the type of memory being accessed.
1037 if (Type *Ty = Inst->getAccessType())
1038 AccessTy.MemTy = Ty;
1039
1040 // Then get the pointer address space.
1041 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1042 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1043 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1044 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1045 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1046 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1047 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1048 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1049 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1050 switch (II->getIntrinsicID()) {
1051 case Intrinsic::prefetch:
1052 case Intrinsic::memset:
1053 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1054 AccessTy.MemTy = OperandVal->getType();
1055 break;
1056 case Intrinsic::memmove:
1057 case Intrinsic::memcpy:
1058 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1059 AccessTy.MemTy = OperandVal->getType();
1060 break;
1061 case Intrinsic::masked_load:
1062 AccessTy.AddrSpace =
1063 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1064 break;
1065 case Intrinsic::masked_store:
1066 AccessTy.AddrSpace =
1067 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1068 break;
1069 default: {
1070 MemIntrinsicInfo IntrInfo;
1071 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1072 AccessTy.AddrSpace
1073 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1074 }
1075
1076 break;
1077 }
1078 }
1079 }
1080
1081 return AccessTy;
1082}
1083
1084/// Return true if this AddRec is already a phi in its loop.
1085static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1086 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1087 if (SE.isSCEVable(PN.getType()) &&
1088 (SE.getEffectiveSCEVType(PN.getType()) ==
1089 SE.getEffectiveSCEVType(AR->getType())) &&
1090 SE.getSCEV(&PN) == AR)
1091 return true;
1092 }
1093 return false;
1094}
1095
1096/// Check if expanding this expression is likely to incur significant cost. This
1097/// is tricky because SCEV doesn't track which expressions are actually computed
1098/// by the current IR.
1099///
1100/// We currently allow expansion of IV increments that involve adds,
1101/// multiplication by constants, and AddRecs from existing phis.
1102///
1103/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1104/// obvious multiple of the UDivExpr.
1105static bool isHighCostExpansion(const SCEV *S,
1107 ScalarEvolution &SE) {
1108 // Zero/One operand expressions
1109 switch (S->getSCEVType()) {
1110 case scUnknown:
1111 case scConstant:
1112 case scVScale:
1113 return false;
1114 case scTruncate:
1115 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1116 Processed, SE);
1117 case scZeroExtend:
1118 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1119 Processed, SE);
1120 case scSignExtend:
1121 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1122 Processed, SE);
1123 default:
1124 break;
1125 }
1126
1127 if (!Processed.insert(S).second)
1128 return false;
1129
1130 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1131 for (const SCEV *S : Add->operands()) {
1132 if (isHighCostExpansion(S, Processed, SE))
1133 return true;
1134 }
1135 return false;
1136 }
1137
1138 const SCEV *Op0, *Op1;
1139 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1140 // Multiplication by a constant is ok
1141 if (isa<SCEVConstant>(Op0))
1142 return isHighCostExpansion(Op1, Processed, SE);
1143
1144 // If we have the value of one operand, check if an existing
1145 // multiplication already generates this expression.
1146 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1147 Value *UVal = U->getValue();
1148 for (User *UR : UVal->users()) {
1149 // If U is a constant, it may be used by a ConstantExpr.
1151 if (UI && UI->getOpcode() == Instruction::Mul &&
1152 SE.isSCEVable(UI->getType())) {
1153 return SE.getSCEV(UI) == S;
1154 }
1155 }
1156 }
1157 }
1158
1159 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1160 if (isExistingPhi(AR, SE))
1161 return false;
1162 }
1163
1164 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1165 return true;
1166}
1167
1168namespace {
1169
1170class LSRUse;
1171
1172} // end anonymous namespace
1173
1174/// Check if the addressing mode defined by \p F is completely
1175/// folded in \p LU at isel time.
1176/// This includes address-mode folding and special icmp tricks.
1177/// This function returns true if \p LU can accommodate what \p F
1178/// defines and up to 1 base + 1 scaled + offset.
1179/// In other words, if \p F has several base registers, this function may
1180/// still return true. Therefore, users still need to account for
1181/// additional base registers and/or unfolded offsets to derive an
1182/// accurate cost model.
1183static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1184 const LSRUse &LU, const Formula &F);
1185
1186// Get the cost of the scaling factor used in F for LU.
1187static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1188 const LSRUse &LU, const Formula &F,
1189 const Loop &L);
1190
1191namespace {
1192
1193/// This class is used to measure and compare candidate formulae.
1194class Cost {
1195 const Loop *L = nullptr;
1196 ScalarEvolution *SE = nullptr;
1197 const TargetTransformInfo *TTI = nullptr;
1198 TargetTransformInfo::LSRCost C;
1200
1201public:
1202 Cost() = delete;
1203 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1205 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1206 C.Insns = 0;
1207 C.NumRegs = 0;
1208 C.AddRecCost = 0;
1209 C.NumIVMuls = 0;
1210 C.NumBaseAdds = 0;
1211 C.ImmCost = 0;
1212 C.SetupCost = 0;
1213 C.ScaleCost = 0;
1214 }
1215
1216 bool isLess(const Cost &Other) const;
1217
1218 void Lose();
1219
1220#ifndef NDEBUG
1221 // Once any of the metrics loses, they must all remain losers.
1222 bool isValid() {
1223 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1224 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1225 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1226 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1227 }
1228#endif
1229
1230 bool isLoser() {
1231 assert(isValid() && "invalid cost");
1232 return C.NumRegs == ~0u;
1233 }
1234
1235 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1236 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1237 bool HardwareLoopProfitable,
1238 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1239
1240 void print(raw_ostream &OS) const;
1241 void dump() const;
1242
1243private:
1244 void RateRegister(const Formula &F, const SCEV *Reg,
1245 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1246 bool HardwareLoopProfitable);
1247 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1248 SmallPtrSetImpl<const SCEV *> &Regs,
1249 const LSRUse &LU, bool HardwareLoopProfitable,
1250 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1251};
1252
/// An operand value in an instruction which is to be replaced with some
/// equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// The instruction which will be updated.
  Instruction *UserInst = nullptr;

  /// The operand of the instruction which will be replaced. The operand may be
  /// used more than once; every instance will be replaced.
  Value *OperandValToReplace = nullptr;

  /// If this user is to use the post-incremented value of an induction
  /// variable, this set is non-empty and holds the loops associated with the
  /// induction variable.
  PostIncLoopSet PostIncLoops;

  /// A constant offset to be added to the LSRUse expression. This allows
  /// multiple fixups to share the same LSRUse with different offsets, for
  /// example in an unrolled loop.
  Immediate Offset = Immediate::getZero();

  LSRFixup() = default;

  /// Return true if this fixup's value is only used outside \p L (for a PHI,
  /// only in incoming blocks that are outside the loop).
  bool isUseFullyOutsideLoop(const Loop *L) const;

  /// Debug printing support.
  void print(raw_ostream &OS) const;
  void dump() const;
};
1280
1281/// This class holds the state that LSR keeps for each use in IVUsers, as well
1282/// as uses invented by LSR itself. It includes information about what kinds of
1283/// things can be folded into the user, information about the user itself, and
1284/// information about how the use may be satisfied. TODO: Represent multiple
1285/// users of the same expression in common?
1286class LSRUse {
1287 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1288
1289public:
1290 /// An enum for a kind of use, indicating what types of scaled and immediate
1291 /// operands it might support.
1292 enum KindType {
1293 Basic, ///< A normal use, with no folding.
1294 Special, ///< A special case of basic, allowing -1 scales.
1295 Address, ///< An address use; folding according to TargetLowering
1296 ICmpZero ///< An equality icmp with both operands folded into one.
1297 // TODO: Add a generic icmp too?
1298 };
1299
1300 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1301
1302 KindType Kind;
1303 MemAccessTy AccessTy;
1304
1305 /// The list of operands which are to be replaced.
1307
1308 /// Keep track of the min and max offsets of the fixups.
1309 Immediate MinOffset = Immediate::getFixedMax();
1310 Immediate MaxOffset = Immediate::getFixedMin();
1311
1312 /// This records whether all of the fixups using this LSRUse are outside of
1313 /// the loop, in which case some special-case heuristics may be used.
1314 bool AllFixupsOutsideLoop = true;
1315
1316 /// This records whether all of the fixups using this LSRUse are unconditional
1317 /// within the loop, meaning they will be executed on every path to the loop
1318 /// latch. This includes fixups before early exits.
1319 bool AllFixupsUnconditional = true;
1320
1321 /// RigidFormula is set to true to guarantee that this use will be associated
1322 /// with a single formula--the one that initially matched. Some SCEV
1323 /// expressions cannot be expanded. This allows LSR to consider the registers
1324 /// used by those expressions without the need to expand them later after
1325 /// changing the formula.
1326 bool RigidFormula = false;
1327
1328 /// A list of ways to build a value that can satisfy this user. After the
1329 /// list is populated, one of these is selected heuristically and used to
1330 /// formulate a replacement for OperandValToReplace in UserInst.
1331 SmallVector<Formula, 12> Formulae;
1332
1333 /// The set of register candidates used by all formulae in this LSRUse.
1334 SmallPtrSet<const SCEV *, 4> Regs;
1335
1336 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1337
1338 LSRFixup &getNewFixup() {
1339 Fixups.push_back(LSRFixup());
1340 return Fixups.back();
1341 }
1342
1343 void pushFixup(LSRFixup &f) {
1344 Fixups.push_back(f);
1345 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1346 MaxOffset = f.Offset;
1347 if (Immediate::isKnownLT(f.Offset, MinOffset))
1348 MinOffset = f.Offset;
1349 }
1350
1351 bool HasFormulaWithSameRegs(const Formula &F) const;
1352 float getNotSelectedProbability(const SCEV *Reg) const;
1353 bool InsertFormula(const Formula &F, const Loop &L);
1354 void DeleteFormula(Formula &F);
1355 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1356
1357 void print(raw_ostream &OS) const;
1358 void dump() const;
1359};
1360
1361} // end anonymous namespace
1362
1363static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1364 LSRUse::KindType Kind, MemAccessTy AccessTy,
1365 GlobalValue *BaseGV, Immediate BaseOffset,
1366 bool HasBaseReg, int64_t Scale,
1367 Instruction *Fixup = nullptr);
1368
1369static unsigned getSetupCost(const SCEV *Reg, unsigned Depth,
1370 const TargetTransformInfo &TTI) {
1371 if (isa<SCEVUnknown>(Reg))
1372 return 1;
1373 if (const auto *C = dyn_cast<SCEVConstant>(Reg)) {
1374 if (TTI.getIntImmCost(C->getAPInt(), C->getType(),
1377 return 0;
1378 return 1;
1379 }
1380 if (Depth == 0)
1381 return 0;
1382 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1383 return getSetupCost(S->getStart(), Depth - 1, TTI);
1384 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1385 return getSetupCost(S->getOperand(), Depth - 1, TTI);
1386 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1387 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1388 [&](unsigned i, const SCEV *Reg) {
1389 return i + getSetupCost(Reg, Depth - 1, TTI);
1390 });
1391 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1392 return getSetupCost(S->getLHS(), Depth - 1, TTI) +
1393 getSetupCost(S->getRHS(), Depth - 1, TTI);
1394 return 0;
1395}
1396
1397/// Tally up interesting quantities from the given register.
1398void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1399 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1400 bool HardwareLoopProfitable) {
1401 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1402 // If this is an addrec for another loop, it should be an invariant
1403 // with respect to L since L is the innermost loop (at least
1404 // for now LSR only handles innermost loops).
1405 if (AR->getLoop() != L) {
1406 // If the AddRec exists, consider it's register free and leave it alone.
1407 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1408 return;
1409
1410 // It is bad to allow LSR for current loop to add induction variables
1411 // for its sibling loops.
1412 if (!AR->getLoop()->contains(L)) {
1413 Lose();
1414 return;
1415 }
1416
1417 // Otherwise, it will be an invariant with respect to Loop L.
1418 ++C.NumRegs;
1419 return;
1420 }
1421
1422 unsigned LoopCost = 1;
1423 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1424 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1425 const SCEV *Start;
1426 const APInt *Step;
1427 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1428 // If the step size matches the base offset, we could use pre-indexed
1429 // addressing.
1430 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1431 F.BaseOffset.isFixed() &&
1432 *Step == F.BaseOffset.getFixedValue();
1433 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1434 !isa<SCEVConstant>(Start) &&
1435 SE->isLoopInvariant(Start, L);
1436 // We can only pre or post index when the load/store is unconditional.
1437 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1438 LoopCost = 0;
1439 }
1440 }
1441
1442 // If the loop counts down to zero and we'll be using a hardware loop then
1443 // the addrec will be combined into the hardware loop instruction.
1444 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1445 HardwareLoopProfitable)
1446 LoopCost = 0;
1447 C.AddRecCost += LoopCost;
1448
1449 // Add the step value register, if it needs one.
1450 // TODO: The non-affine case isn't precisely modeled here.
1451 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1452 if (!Regs.count(AR->getOperand(1))) {
1453 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1454 if (isLoser())
1455 return;
1456 }
1457 }
1458 }
1459 ++C.NumRegs;
1460
1461 // Rough heuristic; favor registers which don't require extra setup
1462 // instructions in the preheader.
1463 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit, *TTI);
1464 // Ensure we don't, even with the recusion limit, produce invalid costs.
1465 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1466
1467 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1469}
1470
1471/// Record this register in the set. If we haven't seen it before, rate
1472/// it. Optional LoserRegs provides a way to declare any formula that refers to
1473/// one of those regs an instant loser.
1474void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1475 SmallPtrSetImpl<const SCEV *> &Regs,
1476 const LSRUse &LU, bool HardwareLoopProfitable,
1477 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1478 if (LoserRegs && LoserRegs->count(Reg)) {
1479 Lose();
1480 return;
1481 }
1482 if (Regs.insert(Reg).second) {
1483 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1484 if (LoserRegs && isLoser())
1485 LoserRegs->insert(Reg);
1486 }
1487}
1488
/// Rate the given (canonical) formula for use LU: accumulate register counts,
/// addrec/scale/immediate costs, and an instruction-count estimate into C.
/// The cost becomes a "loser" as soon as the formula is known to be non-viable.
void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                       const DenseSet<const SCEV *> &VisitedRegs,
                       const LSRUse &LU, bool HardwareLoopProfitable,
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  if (isLoser())
    return;
  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
  // Tally up the registers.
  unsigned PrevAddRecCost = C.AddRecCost;
  unsigned PrevNumRegs = C.NumRegs;
  unsigned PrevNumBaseAdds = C.NumBaseAdds;
  if (const SCEV *ScaledReg = F.ScaledReg) {
    // A register already charged to a previously-visited formula loses.
    if (VisitedRegs.count(ScaledReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }
  for (const SCEV *BaseReg : F.BaseRegs) {
    if (VisitedRegs.count(BaseReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }

  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.getNumRegs();
  if (NumBaseParts > 1)
    // Do not count the base and a possible second register if the target
    // allows to fold 2 registers.
    C.NumBaseAdds +=
        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
  C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());

  // Accumulate non-free scaling amounts.
  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();

  // Tally up the non-zero immediates.
  for (const LSRFixup &Fixup : LU.Fixups) {
    if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
      Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
      if (F.BaseGV)
        C.ImmCost += 64; // Handle symbolic values conservatively.
                         // TODO: This should probably be the pointer size.
      else if (Offset.isNonZero())
        C.ImmCost +=
            APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();

      // Check with target if this offset with this instruction is
      // specifically not supported.
      if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
          !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                                Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
        C.NumBaseAdds++;
    } else {
      // Incompatible immediate type, increase cost to avoid using
      C.ImmCost += 2048;
    }
  }

  // If we don't count instruction cost exit here.
  if (!InsnsCost) {
    assert(isValid() && "invalid cost");
    return;
  }

  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
  // additional instruction (at least fill).
  // TODO: Need distinguish register class?
  unsigned TTIRegNum = TTI->getNumberOfRegisters(
      TTI->getRegisterClassForType(false, F.getType())) - 1;
  if (C.NumRegs > TTIRegNum) {
    // Cost already exceeded TTIRegNum, then only newly added register can add
    // new instructions.
    if (PrevNumRegs > TTIRegNum)
      C.Insns += (C.NumRegs - PrevNumRegs);
    else
      C.Insns += (C.NumRegs - TTIRegNum);
  }

  // If ICmpZero formula ends with not 0, it could not be replaced by
  // just add or sub. We'll need to compare final result of AddRec.
  // That means we'll need an additional instruction. But if the target can
  // macro-fuse a compare with a branch, don't count this extra instruction.
  // For -10 + {0, +, 1}:
  // i = i + 1;
  // cmp i, 10
  //
  // For {-10, +, 1}:
  // i = i + 1;
  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
      !TTI->canMacroFuseCmp())
    C.Insns++;
  // Each new AddRec adds 1 instruction to calculation.
  C.Insns += (C.AddRecCost - PrevAddRecCost);

  // BaseAdds adds instructions for unfolded registers.
  if (LU.Kind != LSRUse::ICmpZero)
    C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
  assert(isValid() && "invalid cost");
}
1597
1598/// Set this cost to a losing value.
1599void Cost::Lose() {
1600 C.Insns = std::numeric_limits<unsigned>::max();
1601 C.NumRegs = std::numeric_limits<unsigned>::max();
1602 C.AddRecCost = std::numeric_limits<unsigned>::max();
1603 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1604 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1605 C.ImmCost = std::numeric_limits<unsigned>::max();
1606 C.SetupCost = std::numeric_limits<unsigned>::max();
1607 C.ScaleCost = std::numeric_limits<unsigned>::max();
1608}
1609
1610/// Choose the lower cost.
1611bool Cost::isLess(const Cost &Other) const {
1612 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1613 C.Insns != Other.C.Insns)
1614 return C.Insns < Other.C.Insns;
1615 return TTI->isLSRCostLess(C, Other.C);
1616}
1617
1618#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print a human-readable summary of the accumulated cost metrics.
void Cost::print(raw_ostream &OS) const {
  if (InsnsCost)
    OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
  if (C.AddRecCost != 0)
    OS << ", with addrec cost " << C.AddRecCost;
  if (C.NumIVMuls != 0)
    OS << ", plus " << C.NumIVMuls << " IV mul"
       << (C.NumIVMuls == 1 ? "" : "s");
  if (C.NumBaseAdds != 0)
    OS << ", plus " << C.NumBaseAdds << " base add"
       << (C.NumBaseAdds == 1 ? "" : "s");
  if (C.ScaleCost != 0)
    OS << ", plus " << C.ScaleCost << " scale cost";
  if (C.ImmCost != 0)
    OS << ", plus " << C.ImmCost << " imm cost";
  if (C.SetupCost != 0)
    OS << ", plus " << C.SetupCost << " setup cost";
}
1638
1639LLVM_DUMP_METHOD void Cost::dump() const {
1640 print(errs()); errs() << '\n';
1641}
1642#endif
1643
1644/// Test whether this fixup always uses its value outside of the given loop.
1645bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1646 // PHI nodes use their value in their incoming blocks.
1647 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1648 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1649 if (PN->getIncomingValue(i) == OperandValToReplace &&
1650 L->contains(PN->getIncomingBlock(i)))
1651 return false;
1652 return true;
1653 }
1654
1655 return !L->contains(UserInst);
1656}
1657
1658#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this fixup (user instruction, replaced operand, post-inc loops, and
/// offset) for debug output.
void LSRFixup::print(raw_ostream &OS) const {
  OS << "UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    UserInst->printAsOperand(OS, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);

  for (const Loop *PIL : PostIncLoops) {
    OS << ", PostIncLoop=";
    PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
  }

  if (Offset.isNonZero())
    OS << ", Offset=" << Offset;
}
1681
1682LLVM_DUMP_METHOD void LSRFixup::dump() const {
1683 print(errs()); errs() << '\n';
1684}
1685#endif
1686
1687/// Test whether this use as a formula which has the same registers as the given
1688/// formula.
1689bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1691 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1692 // Unstable sort by host order ok, because this is only used for uniquifying.
1693 llvm::sort(Key);
1694 return Uniquifier.count(Key);
1695}
1696
1697/// The function returns a probability of selecting formula without Reg.
1698float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1699 unsigned FNum = 0;
1700 for (const Formula &F : Formulae)
1701 if (F.referencesReg(Reg))
1702 FNum++;
1703 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1704}
1705
1706/// If the given formula has not yet been inserted, add it to the list, and
1707/// return true. Return false otherwise. The formula must be in canonical form.
1708bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1709 assert(F.isCanonical(L) && "Invalid canonical representation");
1710
1711 if (!Formulae.empty() && RigidFormula)
1712 return false;
1713
1715 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1716 // Unstable sort by host order ok, because this is only used for uniquifying.
1717 llvm::sort(Key);
1718
1719 if (!Uniquifier.insert(Key).second)
1720 return false;
1721
1722 // Using a register to hold the value of 0 is not profitable.
1723 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1724 "Zero allocated in a scaled register!");
1725#ifndef NDEBUG
1726 for (const SCEV *BaseReg : F.BaseRegs)
1727 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1728#endif
1729
1730 // Add the formula to the list.
1731 Formulae.push_back(F);
1732
1733 // Record registers now being used by this use.
1734 Regs.insert_range(F.BaseRegs);
1735 if (F.ScaledReg)
1736 Regs.insert(F.ScaledReg);
1737
1738 return true;
1739}
1740
1741/// Remove the given formula from this use's list.
1742void LSRUse::DeleteFormula(Formula &F) {
1743 if (&F != &Formulae.back())
1744 std::swap(F, Formulae.back());
1745 Formulae.pop_back();
1746}
1747
1748/// Recompute the Regs field, and update RegUses.
1749void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1750 // Now that we've filtered out some formulae, recompute the Regs set.
1751 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1752 Regs.clear();
1753 for (const Formula &F : Formulae) {
1754 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1755 Regs.insert_range(F.BaseRegs);
1756 }
1757
1758 // Update the RegTracker.
1759 for (const SCEV *S : OldRegs)
1760 if (!Regs.count(S))
1761 RegUses.dropRegister(S, LUIdx);
1762}
1763
1764#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this use (kind, access type, fixup offsets, and flags) for debug
/// output.
void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic: OS << "Basic"; break;
  case Special: OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy.MemTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else {
      OS << *AccessTy.MemTy;
    }

    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
  }

  OS << ", Offsets={";
  bool NeedComma = false;
  for (const LSRFixup &Fixup : Fixups) {
    if (NeedComma) OS << ',';
    OS << Fixup.Offset;
    NeedComma = true;
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (AllFixupsUnconditional)
    OS << ", all-fixups-unconditional";
}
1797
1798LLVM_DUMP_METHOD void LSRUse::dump() const {
1799 print(errs()); errs() << '\n';
1800}
1801#endif
1802
1804 LSRUse::KindType Kind, MemAccessTy AccessTy,
1805 GlobalValue *BaseGV, Immediate BaseOffset,
1806 bool HasBaseReg, int64_t Scale,
1807 Instruction *Fixup /* = nullptr */) {
1808 switch (Kind) {
1809 case LSRUse::Address: {
1810 int64_t FixedOffset =
1811 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1812 int64_t ScalableOffset =
1813 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1814 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1815 HasBaseReg, Scale, AccessTy.AddrSpace,
1816 Fixup, ScalableOffset);
1817 }
1818 case LSRUse::ICmpZero:
1819 // There's not even a target hook for querying whether it would be legal to
1820 // fold a GV into an ICmp.
1821 if (BaseGV)
1822 return false;
1823
1824 // ICmp only has two operands; don't allow more than two non-trivial parts.
1825 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1826 return false;
1827
1828 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1829 // putting the scaled register in the other operand of the icmp.
1830 if (Scale != 0 && Scale != -1)
1831 return false;
1832
1833 // If we have low-level target information, ask the target if it can fold an
1834 // integer immediate on an icmp.
1835 if (BaseOffset.isNonZero()) {
1836 // We don't have an interface to query whether the target supports
1837 // icmpzero against scalable quantities yet.
1838 if (BaseOffset.isScalable())
1839 return false;
1840
1841 // We have one of:
1842 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1843 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1844 // Offs is the ICmp immediate.
1845 if (Scale == 0)
1846 // The cast does the right thing with
1847 // std::numeric_limits<int64_t>::min().
1848 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1849 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1850 }
1851
1852 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1853 return true;
1854
1855 case LSRUse::Basic:
1856 // Only handle single-register values.
1857 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1858
1859 case LSRUse::Special:
1860 // Special case Basic to handle -1 scales.
1861 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1862 }
1863
1864 llvm_unreachable("Invalid LSRUse Kind!");
1865}
1866
                                 Immediate MinOffset, Immediate MaxOffset,
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                 bool HasBaseReg, int64_t Scale) {
  // A scalable BaseOffset can only be combined with scalable Min/Max offsets;
  // mixing scalable and fixed quantities is not foldable.
  if (BaseOffset.isNonZero() &&
      (BaseOffset.isScalable() != MinOffset.isScalable() ||
       BaseOffset.isScalable() != MaxOffset.isScalable()))
    return false;
  // Check for overflow.
  // The additions are done in uint64_t and then compared back against the
  // signed operands: if adding a positive Min/Max did not increase the sum
  // (or adding a non-positive one did), signed overflow occurred.
  int64_t Base = BaseOffset.getKnownMinValue();
  int64_t Min = MinOffset.getKnownMinValue();
  int64_t Max = MaxOffset.getKnownMinValue();
  if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
    return false;
  MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
  if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
    return false;
  MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());

  // Foldable only if both extremes of the offset range fold.
  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
                              HasBaseReg, Scale) &&
         isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
                              HasBaseReg, Scale);
}
1892
                                 Immediate MinOffset, Immediate MaxOffset,
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 const Formula &F, const Loop &L) {
  // For the purpose of isAMCompletelyFolded either having a canonical formula
  // or a scale not equal to zero is correct.
  // Problems may arise from non canonical formulae having a scale == 0.
  // Strictly speaking it would best to just rely on canonical formulae.
  // However, when we generate the scaled formulae, we first check that the
  // scaling factor is profitable before computing the actual ScaledReg for
  // compile time sake.
  assert((F.isCanonical(L) || F.Scale != 0));
  // Decompose the formula into its components and defer to the
  // component-wise overload.
  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
                              F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
1908
1909/// Test whether we know how to expand the current formula.
1910static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1911 Immediate MaxOffset, LSRUse::KindType Kind,
1912 MemAccessTy AccessTy, GlobalValue *BaseGV,
1913 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1914 // We know how to expand completely foldable formulae.
1915 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1916 BaseOffset, HasBaseReg, Scale) ||
1917 // Or formulae that use a base register produced by a sum of base
1918 // registers.
1919 (Scale == 1 &&
1920 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1921 BaseGV, BaseOffset, true, 0));
1922}
1923
1924static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1925 Immediate MaxOffset, LSRUse::KindType Kind,
1926 MemAccessTy AccessTy, const Formula &F) {
1927 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1928 F.BaseOffset, F.HasBaseReg, F.Scale);
1929}
1930
                                Immediate Offset) {
  // Scalable (vscale-relative) offsets have a dedicated target hook; fixed
  // offsets go through the ordinary add-immediate legality query.
  if (Offset.isScalable())
    return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());

  return TTI.isLegalAddImmediate(Offset.getFixedValue());
}
1938
                                 const LSRUse &LU, const Formula &F) {
  // Target may want to look at the user instructions.
  if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
    // Every fixup carries its own offset; each (formula offset + fixup
    // offset) pair must fold for that fixup's particular user instruction.
    for (const LSRFixup &Fixup : LU.Fixups)
      if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                                (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
                                F.Scale, Fixup.UserInst))
        return false;
    return true;
  }

  // Otherwise a single range check over [MinOffset, MaxOffset] suffices.
  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
                              LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
                              F.Scale);
}
1955
                                             const LSRUse &LU, const Formula &F,
                                             const Loop &L) {
  // No scaled register, no scaling cost.
  if (!F.Scale)
    return 0;

  // If the use is not completely folded in that instruction, we will have to
  // pay an extra cost only for scale != 1.
  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
                            LU.AccessTy, F, L))
    return F.Scale != 1;

  switch (LU.Kind) {
  case LSRUse::Address: {
    // Check the scaling factor cost with both the min and max offsets.
    // Fixed and scalable offsets are mutually exclusive here, so only one
    // pair of the four locals is ever nonzero.
    int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
    if (F.BaseOffset.isScalable()) {
      ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
      ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
    } else {
      FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
      FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
    }
    InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
        LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
        F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
    InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
        LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
        F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);

    assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
           "Legal addressing mode has an illegal cost!");
    // Be conservative: charge the worse of the two endpoint costs.
    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
  }
  case LSRUse::ICmpZero:
  case LSRUse::Basic:
  case LSRUse::Special:
    // The use is completely folded, i.e., everything is folded into the
    // instruction.
    return 0;
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}
2000
                             LSRUse::KindType Kind, MemAccessTy AccessTy,
                             GlobalValue *BaseGV, Immediate BaseOffset,
                             bool HasBaseReg) {
  // Fast-path: zero is always foldable.
  if (BaseOffset.isZero() && !BaseGV)
    return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  // Canonicalize a scale of 1 to a base register if the formula doesn't
  // already have a base register.
  if (!HasBaseReg && Scale == 1) {
    Scale = 0;
    HasBaseReg = true;
  }

  // FIXME: Try with + without a scale? Maybe based on TTI?
  // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
  // default for many architectures, not just AArch64 SVE. More investigation
  // needed later to determine if this should be used more widely than just
  // on scalable types.
  if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
      AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
    Scale = 0;

  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
                              HasBaseReg, Scale);
}
2032
                             ScalarEvolution &SE, Immediate MinOffset,
                             Immediate MaxOffset, LSRUse::KindType Kind,
                             MemAccessTy AccessTy, const SCEV *S,
                             bool HasBaseReg) {
  // Fast-path: zero is always foldable.
  if (S->isZero()) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  // Peel the constant immediate and any global symbol off of S; whatever
  // remains must be nothing for S to be foldable as a plain offset.
  SCEVUse SCopy = S;
  Immediate BaseOffset = ExtractImmediate(SCopy, SE);
  GlobalValue *BaseGV = ExtractSymbol(SCopy, SE);

  // If there's anything else involved, it's not foldable.
  if (!SCopy->isZero())
    return false;

  // Fast-path: zero is always foldable.
  if (BaseOffset.isZero() && !BaseGV)
    return true;

  // Scalable immediates are not handled by this conservative query.
  if (BaseOffset.isScalable())
    return false;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
                              BaseOffset, HasBaseReg, Scale);
}
2065
2066namespace {
2067
2068/// An individual increment in a Chain of IV increments. Relate an IV user to
2069/// an expression that computes the IV it uses from the IV used by the previous
2070/// link in the Chain.
2071///
2072/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2073/// original IVOperand. The head of the chain's IVOperand is only valid during
2074/// chain collection, before LSR replaces IV users. During chain generation,
2075/// IncExpr can be used to find the new IVOperand that computes the same
2076/// expression.
struct IVInc {
  // The instruction that consumes the IV value produced by this link.
  Instruction *UserInst;
  // The IV value that UserInst consumes (only valid for the chain head
  // during collection; see the struct-level comment).
  Value* IVOperand;
  // SCEV relating this link's IV to the previous link's IV (absolute SCEV
  // for the head of a chain).
  const SCEV *IncExpr;

  IVInc(Instruction *U, Value *O, const SCEV *E)
      : UserInst(U), IVOperand(O), IncExpr(E) {}
};
2085
2086// The list of IV increments in program order. We typically add the head of a
2087// chain without finding subsequent links.
struct IVChain {
  // SCEV expression the whole chain is based on.
  const SCEV *ExprBase = nullptr;

  IVChain() = default;
  // Start a chain from a head increment and the expression it is based on.
  IVChain(const IVInc &Head, const SCEV *Base)
      : Incs(1, Head), ExprBase(Base) {}

  using const_iterator = SmallVectorImpl<IVInc>::const_iterator;

  // Return the first increment in the chain.
  // Note: skips the head element, which is the chain's base, not an
  // increment.
  const_iterator begin() const {
    assert(!Incs.empty());
    return std::next(Incs.begin());
  }
  const_iterator end() const {
    return Incs.end();
  }

  // Returns true if this chain contains any increments.
  bool hasIncs() const { return Incs.size() >= 2; }

  // Add an IVInc to the end of this chain.
  void add(const IVInc &X) { Incs.push_back(X); }

  // Returns the last UserInst in the chain.
  Instruction *tailUserInst() const { return Incs.back().UserInst; }

  // Returns true if IncExpr can be profitably added to this chain.
  bool isProfitableIncrement(const SCEV *OperExpr,
                             const SCEV *IncExpr,
                             ScalarEvolution&);
};
2121
2122/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2123/// between FarUsers that definitely cross IV increments and NearUsers that may
2124/// be used between IV increments.
struct ChainUsers {
  // Users known to be separated from this chain by at least one increment.
  SmallPtrSet<Instruction*, 4> FarUsers;
  // Users that may fall between increments of this chain.
  SmallPtrSet<Instruction*, 4> NearUsers;
};
2129
2130/// This class holds state for the main loop strength reduction logic.
class LSRInstance {
  // Analyses and loop context this LSR instance operates over.
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  AssumptionCache &AC;
  TargetLibraryInfo &TLI;
  const TargetTransformInfo &TTI;
  Loop *const L;
  MemorySSAUpdater *MSSAU;
  // Expander used to materialize SCEVs as IR; mutable so const query
  // methods can still expand.
  mutable SCEVExpander Rewriter;
  // Whether any IR change has been made (reported via getChanged()).
  bool Changed = false;
  // NOTE(review): set outside this excerpt; presumably records whether a
  // hardware loop was deemed profitable — confirm at the definition site.
  bool HardwareLoopProfitable = false;

  /// This is the insert position that the current loop's induction variable
  /// increment should be placed. In simple loops, this is the latch block's
  /// terminator. But in more complicated cases, this is a position which will
  /// dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos = nullptr;

  /// Interesting factors between use strides.
  ///
  /// We explicitly use a SetVector which contains a SmallSet, instead of the
  /// default, a SmallDenseSet, because we need to use the full range of
  /// int64_ts, and there's currently no good way of doing that with
  /// SmallDenseSet.
  SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;

  /// The cost of the current SCEV, the best solution by LSR will be dropped if
  /// the solution is not profitable.
  Cost BaselineCost;

  /// Interesting use types, to facilitate truncation reuse.
  SmallSetVector<Type *, 4> Types;

  /// The list of interesting uses.

  /// Track which uses use which register candidates.
  RegUseTracker RegUses;

  // Limit the number of chains to avoid quadratic behavior. We don't expect to
  // have more than a few IV increment chains in a loop. Missing a Chain falls
  // back to normal LSR behavior for those uses.
  static const unsigned MaxChains = 8;

  /// IV users can form a chain of IV increments.

  /// IV users that belong to profitable IVChains.
  SmallPtrSet<Use*, MaxChains> IVIncSet;

  /// Induction variables that were generated and inserted by the SCEV Expander.
  SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;

  // Inserting instructions in the loop and using them as PHI's input could
  // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
  // corresponding incoming block is not loop exiting). So collect all such
  // instructions to form LCSSA for them later.
  SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;

  // Early IV optimizations performed before the main LSR algorithm.
  void OptimizeShadowIV();
  bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
  Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
  void OptimizeLoopTermCond();

  // IV-increment chain discovery and code generation.
  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                        SmallVectorImpl<ChainUsers> &ChainUsersVec);
  void FinalizeChain(IVChain &Chain);
  void CollectChains();
  void GenerateIVChain(const IVChain &Chain,
                       SmallVectorImpl<WeakTrackingVH> &DeadInsts);

  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  // Support for sharing of LSRUses between LSRFixups.
  using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, MemAccessTy AccessTy);

  std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
                                      MemAccessTy AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
  bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;

  void CollectLoopInvariantFixupsAndFormulae();

  // Formula generation: each Generate* routine derives alternative formulae
  // for a use from an existing base formula.
  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);

  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
                                  const Formula &Base, unsigned Depth,
                                  size_t Idx, bool IsScaledReg = false);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                   const Formula &Base, size_t Idx,
                                   bool IsScaledReg = false);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
                                   const Formula &Base,
                                   const SmallVectorImpl<Immediate> &Worklist,
                                   size_t Idx, bool IsScaledReg = false);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  // Search-space pruning heuristics, applied before solving.
  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  void NarrowSearchSpaceByFilterPostInc();
  void NarrowSearchSpaceByDeletingCostlyFormulas();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  // Solution search over the remaining formulae.
  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  // Expansion / rewriting of the chosen solution back into IR.
  HoistInsertPosition(BasicBlock::iterator IP,
                      const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
                SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
                     const Formula &F,
                     SmallVectorImpl<WeakTrackingVH> &DeadInsts);
  void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
               SmallVectorImpl<WeakTrackingVH> &DeadInsts);
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);

public:
  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
              TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);

  bool getChanged() const { return Changed; }
  const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
    return ScalarEvolutionIVs;
  }

  // Debug printing helpers.
  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};
2305
2306} // end anonymous namespace
2307
2308/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2309/// the cast operation.
void LSRInstance::OptimizeShadowIV() {
  // A computable trip count is required so the transformed FP IV is known
  // not to wrap within the loop.
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return;

  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
       UI != E; /* empty */) {
    IVUsers::const_iterator CandidateUI = UI;
    // Advance before any mutation so the iterator stays valid.
    ++UI;
    Instruction *ShadowUse = CandidateUI->getUser();
    Type *DestTy = nullptr;
    bool IsSigned = false;

    /* If shadow use is a int->float cast then insert a second IV
       to eliminate this cast.

         for (unsigned i = 0; i < n; ++i)
           foo((double)i);

       is transformed into

         double d = 0.0;
         for (unsigned i = 0; i < n; ++i, ++d)
           foo(d);
    */
    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
      IsSigned = false;
      DestTy = UCast->getDestTy();
    }
    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
      IsSigned = true;
      DestTy = SCast->getDestTy();
    }
    if (!DestTy) continue;

    // If target does not support DestTy natively then do not apply
    // this transformation.
    if (!TTI.isTypeLegal(DestTy)) continue;

    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
    if (!PH) continue;
    if (PH->getNumIncomingValues() != 2) continue;

    // If the calculation in integers overflows, the result in FP type will
    // differ. So we only can do this transformation if we are guaranteed to not
    // deal with overflowing values
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
    if (!AR) continue;
    if (IsSigned && !AR->hasNoSignedWrap()) continue;
    if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;

    Type *SrcTy = PH->getType();
    // The integer IV must be exactly representable in the FP mantissa,
    // otherwise the FP IV would diverge from the integer one.
    int Mantissa = DestTy->getFPMantissaWidth();
    if (Mantissa == -1) continue;
    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
      continue;

    // Identify which PHI operand comes from the preheader (Entry) and which
    // from the backedge (Latch).
    unsigned Entry, Latch;
    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
      Entry = 0;
      Latch = 1;
    } else {
      Entry = 1;
      Latch = 0;
    }

    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
    if (!Init) continue;
    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
                                        (double)Init->getSExtValue() :
                                        (double)Init->getZExtValue());

    BinaryOperator *Incr =
    if (!Incr) continue;
    if (Incr->getOpcode() != Instruction::Add
        && Incr->getOpcode() != Instruction::Sub)
      continue;

    /* Initialize new IV, double d = 0.0 in above example. */
    ConstantInt *C = nullptr;
    if (Incr->getOperand(0) == PH)
    else if (Incr->getOperand(1) == PH)
    else
      continue;

    if (!C) continue;

    // Ignore negative constants, as the code below doesn't handle them
    // correctly. TODO: Remove this restriction.
    if (!C->getValue().isStrictlyPositive())
      continue;

    /* Add new PHINode. */
    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
    NewPH->setDebugLoc(PH->getDebugLoc());

    /* create new increment. '++d' in above example. */
    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
    BinaryOperator *NewIncr = BinaryOperator::Create(
        Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
                                              : Instruction::FSub,
        NewPH, CFP, "IV.S.next.", Incr->getIterator());
    NewIncr->setDebugLoc(Incr->getDebugLoc());

    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));

    /* Remove cast operation */
    ShadowUse->replaceAllUsesWith(NewPH);
    ShadowUse->eraseFromParent();
    Changed = true;
    // Only one shadow IV is created per invocation; bail after the first.
    break;
  }
}
2427
2428/// If Cond has an operand that is an expression of an IV, set the IV user and
2429/// stride information and return true, otherwise return false.
2430bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2431 for (IVStrideUse &U : IU)
2432 if (U.getUser() == Cond) {
2433 // NOTE: we could handle setcc instructions with multiple uses here, but
2434 // InstCombine does it as well for simple uses, it's not clear that it
2435 // occurs enough in real life to handle.
2436 CondUse = &U;
2437 return true;
2438 }
2439 return false;
2440}
2441
2442/// Rewrite the loop's terminating condition if it uses a max computation.
2443///
2444/// This is a narrow solution to a specific, but acute, problem. For loops
2445/// like this:
2446///
2447/// i = 0;
2448/// do {
2449/// p[i] = 0.0;
2450/// } while (++i < n);
2451///
2452/// the trip count isn't just 'n', because 'n' might not be positive. And
2453/// unfortunately this can come up even for loops where the user didn't use
2454/// a C do-while loop. For example, seemingly well-behaved top-test loops
2455/// will commonly be lowered like this:
2456///
2457/// if (n > 0) {
2458/// i = 0;
2459/// do {
2460/// p[i] = 0.0;
2461/// } while (++i < n);
2462/// }
2463///
2464/// and then it's possible for subsequent optimization to obscure the if
2465/// test in such a way that indvars can't find it.
2466///
2467/// When indvars can't find the if test in loops like this, it creates a
2468/// max expression, which allows it to give the loop a canonical
2469/// induction variable:
2470///
2471/// i = 0;
2472/// max = n < 1 ? 1 : n;
2473/// do {
2474/// p[i] = 0.0;
2475/// } while (++i != max);
2476///
2477/// Canonical induction variables are necessary because the loop passes
2478/// are designed around them. The most obvious example of this is the
2479/// LoopInfo analysis, which doesn't remember trip count values. It
2480/// expects to be able to rediscover the trip count each time it is
2481/// needed, and it does this using a simple analysis that only succeeds if
2482/// the loop has a canonical induction variable.
2483///
2484/// However, when it comes time to generate code, the maximum operation
2485/// can be quite costly, especially if it's inside of an outer loop.
2486///
2487/// This function solves this problem by detecting this type of loop and
2488/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2489/// the instructions for the maximum computation.
Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  // The max is expected to appear as a select feeding the comparison.
  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
  if (IterationCount != SE.getSCEV(Sel)) return Cond;

  // Check for a max calculation that matches the pattern. There's no check
  // for ICMP_ULE here because the comparison would be with zero, which
  // isn't interesting.
  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
  const SCEVNAryExpr *Max = nullptr;
  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
    Pred = ICmpInst::ICMP_SLE;
    Max = S;
  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_SLT;
    Max = S;
  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_ULT;
    Max = U;
  } else {
    // No match; bail.
    return Cond;
  }

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);

  // ScalarEvolution canonicalizes constants to the left. For < and >, look
  // for a comparison with 1. For <= and >=, a comparison with zero.
  if (!MaxLHS ||
      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
    return Cond;

  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  if (!match(IV,
      return Cond;

  assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = nullptr;
  if (ICmpInst::isTrueWhenEqual(Pred)) {
    // Look for n+1, and grab n.
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
          NewRHS = BO->getOperand(0);
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
      if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
        if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
          NewRHS = BO->getOperand(0);
    if (!NewRHS)
      return Cond;
  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
    NewRHS = SU->getValue();
  else
    // Max doesn't match expected pattern.
    return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
                                   Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  NewCond->setDebugLoc(Cond->getDebugLoc());
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty()) {
    // Preserve debug values attached to the now-dead compare before erasing.
    salvageDebugInfo(*Cmp);
    Cmp->eraseFromParent();
  }
  return NewCond;
}
2599
2600/// Change loop terminating condition to use the postinc iv when possible.
2601void
2602LSRInstance::OptimizeLoopTermCond() {
2603 SmallPtrSet<Instruction *, 4> PostIncs;
2604
2605 // We need a different set of heuristics for rotated and non-rotated loops.
2606 // If a loop is rotated then the latch is also the backedge, so inserting
2607 // post-inc expressions just before the latch is ideal. To reduce live ranges
2608 // it also makes sense to rewrite terminating conditions to use post-inc
2609 // expressions.
2610 //
2611 // If the loop is not rotated then the latch is not a backedge; the latch
2612 // check is done in the loop head. Adding post-inc expressions before the
2613 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2614 // in the loop body. In this case we do *not* want to use post-inc expressions
2615 // in the latch check, and we want to insert post-inc expressions before
2616 // the backedge.
2617 BasicBlock *LatchBlock = L->getLoopLatch();
2618 SmallVector<BasicBlock*, 8> ExitingBlocks;
2619 L->getExitingBlocks(ExitingBlocks);
2620 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2621 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2622 IVIncInsertPos = LatchBlock->getTerminator();
2623 return;
2624 }
2625
2626 // Otherwise treat this as a rotated loop.
2627 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2628 // Get the terminating condition for the loop if possible. If we
2629 // can, we want to change it to use a post-incremented version of its
2630 // induction variable, to allow coalescing the live ranges for the IV into
2631 // one register value.
2632
2633 CondBrInst *TermBr = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
2634 if (!TermBr)
2635 continue;
2636
2638 // If the argument to TermBr is an extractelement, then the source of that
2639 // instruction is what's generated the condition.
2641 if (Extract)
2642 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2643 // FIXME: We could do more here, like handling logical operations where one
2644 // side is a cmp that uses an induction variable.
2645 if (!Cond)
2646 continue;
2647
2648 // Search IVUsesByStride to find Cond's IVUse if there is one.
2649 IVStrideUse *CondUse = nullptr;
2650 if (!FindIVUserForCond(Cond, CondUse))
2651 continue;
2652
2653 // If the trip count is computed in terms of a max (due to ScalarEvolution
2654 // being unable to find a sufficient guard, for example), change the loop
2655 // comparison to use SLT or ULT instead of NE.
2656 // One consequence of doing this now is that it disrupts the count-down
2657 // optimization. That's not always a bad thing though, because in such
2658 // cases it may still be worthwhile to avoid a max.
2659 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2660 Cond = OptimizeMax(Cmp, CondUse);
2661
2662 // If this exiting block dominates the latch block, it may also use
2663 // the post-inc value if it won't be shared with other uses.
2664 // Check for dominance.
2665 if (!DT.dominates(ExitingBlock, LatchBlock))
2666 continue;
2667
2668 // Conservatively avoid trying to use the post-inc value in non-latch
2669 // exits if there may be pre-inc users in intervening blocks.
2670 if (LatchBlock != ExitingBlock)
2671 for (const IVStrideUse &UI : IU)
2672 // Test if the use is reachable from the exiting block. This dominator
2673 // query is a conservative approximation of reachability.
2674 if (&UI != CondUse &&
2675 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2676 // Conservatively assume there may be reuse if the quotient of their
2677 // strides could be a legal scale.
2678 const SCEV *A = IU.getStride(*CondUse, L);
2679 const SCEV *B = IU.getStride(UI, L);
2680 if (!A || !B) continue;
2681 if (SE.getTypeSizeInBits(A->getType()) !=
2682 SE.getTypeSizeInBits(B->getType())) {
2683 if (SE.getTypeSizeInBits(A->getType()) >
2684 SE.getTypeSizeInBits(B->getType()))
2685 B = SE.getSignExtendExpr(B, A->getType());
2686 else
2687 A = SE.getSignExtendExpr(A, B->getType());
2688 }
2689 if (const SCEVConstant *D =
2691 const ConstantInt *C = D->getValue();
2692 // Stride of one or negative one can have reuse with non-addresses.
2693 if (C->isOne() || C->isMinusOne())
2694 goto decline_post_inc;
2695 // Avoid weird situations.
2696 if (C->getValue().getSignificantBits() >= 64 ||
2697 C->getValue().isMinSignedValue())
2698 goto decline_post_inc;
2699 // Check for possible scaled-address reuse.
2700 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2701 MemAccessTy AccessTy =
2702 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2703 int64_t Scale = C->getSExtValue();
2704 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2705 /*BaseOffset=*/0,
2706 /*HasBaseReg=*/true, Scale,
2707 AccessTy.AddrSpace))
2708 goto decline_post_inc;
2709 Scale = -Scale;
2710 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2711 /*BaseOffset=*/0,
2712 /*HasBaseReg=*/true, Scale,
2713 AccessTy.AddrSpace))
2714 goto decline_post_inc;
2715 }
2716 }
2717 }
2718
2719 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2720 << *Cond << '\n');
2721
2722 // It's possible for the setcc instruction to be anywhere in the loop, and
2723 // possible for it to have multiple users. If it is not immediately before
2724 // the exiting block branch, move it.
2725 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2726 !Extract) {
2727 if (Cond->hasOneUse()) {
2728 Cond->moveBefore(TermBr->getIterator());
2729 } else {
2730 // Clone the terminating condition and insert into the loopend.
2731 Instruction *OldCond = Cond;
2732 Cond = Cond->clone();
2733 Cond->setName(L->getHeader()->getName() + ".termcond");
2734 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2735
2736 // Clone the IVUse, as the old use still exists!
2737 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2738 TermBr->replaceUsesOfWith(OldCond, Cond);
2739 }
2740 }
2741
2742 // If we get to here, we know that we can transform the setcc instruction to
2743 // use the post-incremented version of the IV, allowing us to coalesce the
2744 // live ranges for the IV correctly.
2745 CondUse->transformToPostInc(L);
2746 Changed = true;
2747
2748 PostIncs.insert(Cond);
2749 decline_post_inc:;
2750 }
2751
2752 // Determine an insertion point for the loop induction variable increment. It
2753 // must dominate all the post-inc comparisons we just set up, and it must
2754 // dominate the loop latch edge.
2755 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2756 for (Instruction *Inst : PostIncs)
2757 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2758}
2759
2760/// Determine if the given use can accommodate a fixup at the given offset and
2761/// other details. If so, update the use and return true.
2762bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2763 bool HasBaseReg, LSRUse::KindType Kind,
2764 MemAccessTy AccessTy) {
2765 Immediate NewMinOffset = LU.MinOffset;
2766 Immediate NewMaxOffset = LU.MaxOffset;
2767 MemAccessTy NewAccessTy = AccessTy;
2768
2769 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2770 // something conservative, however this can pessimize in the case that one of
2771 // the uses will have all its uses outside the loop, for example.
2772 if (LU.Kind != Kind)
2773 return false;
2774
2775 // Check for a mismatched access type, and fall back conservatively as needed.
2776 // TODO: Be less conservative when the type is similar and can use the same
2777 // addressing modes.
2778 if (Kind == LSRUse::Address) {
2779 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2780 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2781 AccessTy.AddrSpace);
2782 }
2783 }
2784
2785 // Conservatively assume HasBaseReg is true for now.
2786 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2787 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2788 LU.MaxOffset - NewOffset, HasBaseReg))
2789 return false;
2790 NewMinOffset = NewOffset;
2791 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2792 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2793 NewOffset - LU.MinOffset, HasBaseReg))
2794 return false;
2795 NewMaxOffset = NewOffset;
2796 }
2797
2798 // FIXME: We should be able to handle some level of scalable offset support
2799 // for 'void', but in order to get basic support up and running this is
2800 // being left out.
2801 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2802 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2803 return false;
2804
2805 // Update the use.
2806 LU.MinOffset = NewMinOffset;
2807 LU.MaxOffset = NewMaxOffset;
2808 LU.AccessTy = NewAccessTy;
2809 return true;
2810}
2811
2812/// Return an LSRUse index and an offset value for a fixup which needs the given
2813/// expression, with the given kind and optional access type. Either reuse an
2814/// existing use or create a new one, as needed.
2815std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2816 LSRUse::KindType Kind,
2817 MemAccessTy AccessTy) {
2818 const SCEV *Copy = Expr;
2819 SCEVUse ExprUse = Expr;
2820 Immediate Offset = ExtractImmediate(ExprUse, SE);
2821 Expr = ExprUse;
2822
2823 // Basic uses can't accept any offset, for example.
2824 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2825 Offset, /*HasBaseReg=*/ true)) {
2826 Expr = Copy;
2827 Offset = Immediate::getFixed(0);
2828 }
2829
2830 std::pair<UseMapTy::iterator, bool> P =
2831 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2832 if (!P.second) {
2833 // A use already existed with this base.
2834 size_t LUIdx = P.first->second;
2835 LSRUse &LU = Uses[LUIdx];
2836 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2837 // Reuse this use.
2838 return std::make_pair(LUIdx, Offset);
2839 }
2840
2841 // Create a new use.
2842 size_t LUIdx = Uses.size();
2843 P.first->second = LUIdx;
2844 Uses.push_back(LSRUse(Kind, AccessTy));
2845 LSRUse &LU = Uses[LUIdx];
2846
2847 LU.MinOffset = Offset;
2848 LU.MaxOffset = Offset;
2849 return std::make_pair(LUIdx, Offset);
2850}
2851
2852/// Delete the given use from the Uses list.
2853void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2854 if (&LU != &Uses.back())
2855 std::swap(LU, Uses.back());
2856 Uses.pop_back();
2857
2858 // Update RegUses.
2859 RegUses.swapAndDropUse(LUIdx, Uses.size());
2860}
2861
2862/// Look for a use distinct from OrigLU which is has a formula that has the same
2863/// registers as the given formula.
2864LSRUse *
2865LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2866 const LSRUse &OrigLU) {
2867 // Search all uses for the formula. This could be more clever.
2868 for (LSRUse &LU : Uses) {
2869 // Check whether this use is close enough to OrigLU, to see whether it's
2870 // worthwhile looking through its formulae.
2871 // Ignore ICmpZero uses because they may contain formulae generated by
2872 // GenerateICmpZeroScales, in which case adding fixup offsets may
2873 // be invalid.
2874 if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero &&
2875 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2876 LU.HasFormulaWithSameRegs(OrigF)) {
2877 // Scan through this use's formulae.
2878 for (const Formula &F : LU.Formulae) {
2879 // Check to see if this formula has the same registers and symbols
2880 // as OrigF.
2881 if (F.BaseRegs == OrigF.BaseRegs &&
2882 F.ScaledReg == OrigF.ScaledReg &&
2883 F.BaseGV == OrigF.BaseGV &&
2884 F.Scale == OrigF.Scale &&
2885 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2886 if (F.BaseOffset.isZero())
2887 return &LU;
2888 // This is the formula where all the registers and symbols matched;
2889 // there aren't going to be any others. Since we declined it, we
2890 // can skip the rest of the formulae and proceed to the next LSRUse.
2891 break;
2892 }
2893 }
2894 }
2895 }
2896
2897 // Nothing looked good.
2898 return nullptr;
2899}
2900
2901void LSRInstance::CollectInterestingTypesAndFactors() {
2902 SmallSetVector<const SCEV *, 4> Strides;
2903
2904 // Collect interesting types and strides.
2906 for (const IVStrideUse &U : IU) {
2907 const SCEV *Expr = IU.getExpr(U);
2908 if (!Expr)
2909 continue;
2910
2911 // Collect interesting types.
2912 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2913
2914 // Add strides for mentioned loops.
2915 Worklist.push_back(Expr);
2916 do {
2917 const SCEV *S = Worklist.pop_back_val();
2918 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2919 if (AR->getLoop() == L)
2920 Strides.insert(AR->getStepRecurrence(SE));
2921 Worklist.push_back(AR->getStart());
2922 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2923 append_range(Worklist, Add->operands());
2924 }
2925 } while (!Worklist.empty());
2926 }
2927
2928 // Compute interesting factors from the set of interesting strides.
2929 for (SmallSetVector<const SCEV *, 4>::const_iterator
2930 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2931 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2932 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2933 const SCEV *OldStride = *I;
2934 const SCEV *NewStride = *NewStrideIter;
2935
2936 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2937 SE.getTypeSizeInBits(NewStride->getType())) {
2938 if (SE.getTypeSizeInBits(OldStride->getType()) >
2939 SE.getTypeSizeInBits(NewStride->getType()))
2940 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2941 else
2942 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2943 }
2944 if (const SCEVConstant *Factor =
2945 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2946 SE, true))) {
2947 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2948 Factors.insert(Factor->getAPInt().getSExtValue());
2949 } else if (const SCEVConstant *Factor =
2951 NewStride,
2952 SE, true))) {
2953 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2954 Factors.insert(Factor->getAPInt().getSExtValue());
2955 }
2956 }
2957
2958 // If all uses use the same type, don't bother looking for truncation-based
2959 // reuse.
2960 if (Types.size() == 1)
2961 Types.clear();
2962
2963 LLVM_DEBUG(print_factors_and_types(dbgs()));
2964}
2965
2966/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2967/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2968/// IVStrideUses, we could partially skip this.
2969static User::op_iterator
2971 Loop *L, ScalarEvolution &SE) {
2972 for(; OI != OE; ++OI) {
2973 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2974 if (!SE.isSCEVable(Oper->getType()))
2975 continue;
2976
2977 if (const SCEVAddRecExpr *AR =
2979 if (AR->getLoop() == L)
2980 break;
2981 }
2982 }
2983 }
2984 return OI;
2985}
2986
2987/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2988/// a convenient helper.
2990 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2991 return Trunc->getOperand(0);
2992 return Oper;
2993}
2994
2995/// Return an approximation of this SCEV expression's "base", or NULL for any
2996/// constant. Returning the expression itself is conservative. Returning a
2997/// deeper subexpression is more precise and valid as long as it isn't less
2998/// complex than another subexpression. For expressions involving multiple
2999/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3000/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3001/// IVInc==b-a.
3002///
3003/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3004/// SCEVUnknown, we simply return the rightmost SCEV operand.
3005static const SCEV *getExprBase(const SCEV *S) {
3006 switch (S->getSCEVType()) {
3007 default: // including scUnknown.
3008 return S;
3009 case scConstant:
3010 case scVScale:
3011 return nullptr;
3012 case scTruncate:
3013 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3014 case scZeroExtend:
3015 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3016 case scSignExtend:
3017 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3018 case scAddExpr: {
3019 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3020 // there's nothing more complex.
3021 // FIXME: not sure if we want to recognize negation.
3022 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3023 for (const SCEV *SubExpr : reverse(Add->operands())) {
3024 if (SubExpr->getSCEVType() == scAddExpr)
3025 return getExprBase(SubExpr);
3026
3027 if (SubExpr->getSCEVType() != scMulExpr)
3028 return SubExpr;
3029 }
3030 return S; // all operands are scaled, be conservative.
3031 }
3032 case scAddRecExpr:
3033 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3034 }
3035 llvm_unreachable("Unknown SCEV kind!");
3036}
3037
3038/// Return true if the chain increment is profitable to expand into a loop
3039/// invariant value, which may require its own register. A profitable chain
3040/// increment will be an offset relative to the same base. We allow such offsets
3041/// to potentially be used as chain increment as long as it's not obviously
3042/// expensive to expand using real instructions.
3043bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3044 const SCEV *IncExpr,
3045 ScalarEvolution &SE) {
3046 // Aggressively form chains when -stress-ivchain.
3047 if (StressIVChain)
3048 return true;
3049
3050 // Do not replace a constant offset from IV head with a nonconstant IV
3051 // increment.
3052 if (!isa<SCEVConstant>(IncExpr)) {
3053 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3054 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3055 return false;
3056 }
3057
3058 SmallPtrSet<const SCEV*, 8> Processed;
3059 return !isHighCostExpansion(IncExpr, Processed, SE);
3060}
3061
3062/// Return true if the number of registers needed for the chain is estimated to
3063/// be less than the number required for the individual IV users. First prohibit
3064/// any IV users that keep the IV live across increments (the Users set should
3065/// be empty). Next count the number and type of increments in the chain.
3066///
3067/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3068/// effectively use postinc addressing modes. Only consider it profitable it the
3069/// increments can be computed in fewer registers when chained.
3070///
3071/// TODO: Consider IVInc free if it's already used in another chains.
3072static bool isProfitableChain(IVChain &Chain,
3074 ScalarEvolution &SE,
3075 const TargetTransformInfo &TTI) {
3076 if (StressIVChain)
3077 return true;
3078
3079 if (!Chain.hasIncs())
3080 return false;
3081
3082 if (!Users.empty()) {
3083 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3084 for (Instruction *Inst
3085 : Users) { dbgs() << " " << *Inst << "\n"; });
3086 return false;
3087 }
3088 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3089
3090 // The chain itself may require a register, so initialize cost to 1.
3091 int cost = 1;
3092
3093 // A complete chain likely eliminates the need for keeping the original IV in
3094 // a register. LSR does not currently know how to form a complete chain unless
3095 // the header phi already exists.
3096 if (isa<PHINode>(Chain.tailUserInst())
3097 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3098 --cost;
3099 }
3100 const SCEV *LastIncExpr = nullptr;
3101 unsigned NumConstIncrements = 0;
3102 unsigned NumVarIncrements = 0;
3103 unsigned NumReusedIncrements = 0;
3104
3105 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3106 return true;
3107
3108 for (const IVInc &Inc : Chain) {
3109 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3110 return true;
3111 if (Inc.IncExpr->isZero())
3112 continue;
3113
3114 // Incrementing by zero or some constant is neutral. We assume constants can
3115 // be folded into an addressing mode or an add's immediate operand.
3116 if (isa<SCEVConstant>(Inc.IncExpr)) {
3117 ++NumConstIncrements;
3118 continue;
3119 }
3120
3121 if (Inc.IncExpr == LastIncExpr)
3122 ++NumReusedIncrements;
3123 else
3124 ++NumVarIncrements;
3125
3126 LastIncExpr = Inc.IncExpr;
3127 }
3128 // An IV chain with a single increment is handled by LSR's postinc
3129 // uses. However, a chain with multiple increments requires keeping the IV's
3130 // value live longer than it needs to be if chained.
3131 if (NumConstIncrements > 1)
3132 --cost;
3133
3134 // Materializing increment expressions in the preheader that didn't exist in
3135 // the original code may cost a register. For example, sign-extended array
3136 // indices can produce ridiculous increments like this:
3137 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3138 cost += NumVarIncrements;
3139
3140 // Reusing variable increments likely saves a register to hold the multiple of
3141 // the stride.
3142 cost -= NumReusedIncrements;
3143
3144 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3145 << "\n");
3146
3147 return cost < 0;
3148}
3149
/// Add this IV user to an existing chain or make it the head of a new chain.
/// ChainUsersVec is kept parallel to IVChainVec: entry ChainIdx holds the
/// Near/Far user bookkeeping for chain ChainIdx.
void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
                                   SmallVectorImpl<ChainUsers> &ChainUsersVec) {
  // When IVs are used as types of varying widths, they are generally converted
  // to a wider type with some uses remaining narrow under a (free) trunc.
  Value *const NextIV = getWideOperand(IVOper);
  const SCEV *const OperExpr = SE.getSCEV(NextIV);
  const SCEV *const OperExprBase = getExprBase(OperExpr);

  // Visit all existing chains. Check if its IVOper can be computed as a
  // profitable loop invariant increment from the last link in the Chain.
  // On loop exit, ChainIdx == NChains means no existing chain accepted this
  // user; otherwise ChainIdx is the accepting chain and LastIncExpr its
  // increment.
  unsigned ChainIdx = 0, NChains = IVChainVec.size();
  const SCEV *LastIncExpr = nullptr;
  for (; ChainIdx < NChains; ++ChainIdx) {
    IVChain &Chain = IVChainVec[ChainIdx];

    // Prune the solution space aggressively by checking that both IV operands
    // are expressions that operate on the same unscaled SCEVUnknown. This
    // "base" will be canceled by the subsequent getMinusSCEV call. Checking
    // first avoids creating extra SCEV expressions.
    if (!StressIVChain && Chain.ExprBase != OperExprBase)
      continue;

    Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
    if (PrevIV->getType() != NextIV->getType())
      continue;

    // A phi node terminates a chain.
    if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
      continue;

    // The increment must be loop-invariant so it can be kept in a register.
    const SCEV *PrevExpr = SE.getSCEV(PrevIV);
    const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
    if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
      continue;

    // First profitable candidate wins; stop searching.
    if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
      LastIncExpr = IncExpr;
      break;
    }
  }
  // If we haven't found a chain, create a new one, unless we hit the max. Don't
  // bother for phi nodes, because they must be last in the chain.
  if (ChainIdx == NChains) {
    if (isa<PHINode>(UserInst))
      return;
    if (NChains >= MaxChains && !StressIVChain) {
      LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
      return;
    }
    LastIncExpr = OperExpr;
    // IVUsers may have skipped over sign/zero extensions. We don't currently
    // attempt to form chains involving extensions unless they can be hoisted
    // into this loop's AddRec.
    if (!isa<SCEVAddRecExpr>(LastIncExpr))
      return;
    ++NChains;
    // Start a fresh chain headed by this user, and grow the parallel
    // ChainUsersVec to match.
    IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
                                 OperExprBase));
    ChainUsersVec.resize(NChains);
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
                      << ") IV=" << *LastIncExpr << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
                      << ") IV+" << *LastIncExpr << "\n");
    // Add this IV user to the end of the chain.
    IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
  }
  IVChain &Chain = IVChainVec[ChainIdx];

  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
  // This chain's NearUsers become FarUsers.
  // A nonzero increment advances the chain's IV value, so users collected at
  // the previous link now see a stale ("far") value.
  if (!LastIncExpr->isZero()) {
    ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
    NearUsers.clear();
  }

  // All other uses of IVOperand become near uses of the chain.
  // We currently ignore intermediate values within SCEV expressions, assuming
  // they will eventually be used be the current chain, or can be computed
  // from one of the chain increments. To be more precise we could
  // transitively follow its user and only add leaf IV users to the set.
  for (User *U : IVOper->users()) {
    Instruction *OtherUse = dyn_cast<Instruction>(U);
    if (!OtherUse)
      continue;
    // Uses in the chain will no longer be uses if the chain is formed.
    // Include the head of the chain in this iteration (not Chain.begin()).
    IVChain::const_iterator IncIter = Chain.Incs.begin();
    IVChain::const_iterator IncEnd = Chain.Incs.end();
    for( ; IncIter != IncEnd; ++IncIter) {
      if (IncIter->UserInst == OtherUse)
        break;
    }
    if (IncIter != IncEnd)
      continue;

    // Skip non-leaf IV users: their value is part of a larger SCEV expression
    // that IVUsers already tracks.
    if (SE.isSCEVable(OtherUse->getType())
        && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
        && IU.isIVUserOrOperand(OtherUse)) {
      continue;
    }
    NearUsers.insert(OtherUse);
  }

  // Since this user is part of the chain, it's no longer considered a use
  // of the chain.
  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
}
3260
/// Populate the vector of Chains.
///
/// This decreases ILP at the architecture level. Targets with ample registers,
/// multiple memory ports, and no register renaming probably don't want
/// this. However, such targets should probably disable LSR altogether.
///
/// The job of LSR is to make a reasonable choice of induction variables across
/// the loop. Subsequent passes can easily "unchain" computation exposing more
/// ILP *within the loop* if the target wants it.
///
/// Finding the best IV chain is potentially a scheduling problem. Since LSR
/// will not reorder memory operations, it will recognize this as a chain, but
/// will generate redundant IV increments. Ideally this would be corrected later
/// by a smart scheduler:
///        = A[i]
///        = A[i+x]
/// A[i]   =
/// A[i+x] =
///
/// TODO: Walk the entire domtree within this loop, not just the path to the
/// loop latch. This will discover chains on side paths, but requires
/// maintaining multiple copies of the Chains state.
void LSRInstance::CollectChains() {
  LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
  // Parallel to IVChainVec: per-chain Near/Far user bookkeeping.
  SmallVector<ChainUsers, 8> ChainUsersVec;

  // Build the dominator path from the latch up to (and including) the header.
  // It is collected latch-first and traversed in reverse below.
  SmallVector<BasicBlock *,8> LatchPath;
  BasicBlock *LoopHeader = L->getHeader();
  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
       Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
    LatchPath.push_back(Rung->getBlock());
  }
  LatchPath.push_back(LoopHeader);

  // Walk the instruction stream from the loop header to the loop latch.
  for (BasicBlock *BB : reverse(LatchPath)) {
    for (Instruction &I : *BB) {
      // Skip instructions that weren't seen by IVUsers analysis.
      if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
        continue;

      // Skip ephemeral values, as they don't produce real code.
      if (IU.isEphemeral(&I))
        continue;

      // Ignore users that are part of a SCEV expression. This way we only
      // consider leaf IV Users. This effectively rediscovers a portion of
      // IVUsers analysis but in program order this time.
      if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
        continue;

      // Remove this instruction from any NearUsers set it may be in.
      for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
           ChainIdx < NChains; ++ChainIdx) {
        ChainUsersVec[ChainIdx].NearUsers.erase(&I);
      }
      // Search for operands that can be chained. Each distinct IV operand of
      // this instruction is offered to ChainInstruction exactly once.
      SmallPtrSet<Instruction*, 4> UniqueOperands;
      User::op_iterator IVOpEnd = I.op_end();
      User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
      while (IVOpIter != IVOpEnd) {
        Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
        if (UniqueOperands.insert(IVOpInst).second)
          ChainInstruction(&I, IVOpInst, ChainUsersVec);
        IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
      }
    } // Continue walking down the instructions.
  } // Continue walking down the domtree.
  // Visit phi backedges to determine if the chain can generate the IV postinc.
  for (PHINode &PN : L->getHeader()->phis()) {
    if (!SE.isSCEVable(PN.getType()))
      continue;

    Instruction *IncV =
        dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
    if (IncV)
      ChainInstruction(&PN, IncV, ChainUsersVec);
  }
  // Remove any unprofitable chains, compacting the profitable ones to the
  // front of IVChainVec.
  unsigned ChainIdx = 0;
  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
       UsersIdx < NChains; ++UsersIdx) {
    if (!isProfitableChain(IVChainVec[UsersIdx],
                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
      continue;
    // Preserve the chain at UsesIdx.
    if (ChainIdx != UsersIdx)
      IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
    FinalizeChain(IVChainVec[ChainIdx]);
    ++ChainIdx;
  }
  IVChainVec.resize(ChainIdx);
}
3354
3355void LSRInstance::FinalizeChain(IVChain &Chain) {
3356 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3357 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3358
3359 for (const IVInc &Inc : Chain) {
3360 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3361 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3362 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3363 IVIncSet.insert(UseI);
3364 }
3365}
3366
3367/// Return true if the IVInc can be folded into an addressing mode.
3368static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3369 Value *Operand, const TargetTransformInfo &TTI) {
3370 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3371 Immediate IncOffset = Immediate::getZero();
3372 if (IncConst) {
3373 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3374 return false;
3375 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3376 } else {
3377 // Look for mul(vscale, constant), to detect a scalable offset.
3378 const APInt *C;
3379 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3380 C->getSignificantBits() > 64)
3381 return false;
3382 IncOffset = Immediate::getScalable(C->getSExtValue());
3383 }
3384
3385 if (!isAddressUse(TTI, UserInst, Operand))
3386 return false;
3387
3388 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3389 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3390 IncOffset, /*HasBaseReg=*/false))
3391 return false;
3392
3393 return true;
3394}
3395
3396/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3397/// user's operand from the previous IV user's operand.
3398void LSRInstance::GenerateIVChain(const IVChain &Chain,
3399 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3400 // Find the new IVOperand for the head of the chain. It may have been replaced
3401 // by LSR.
3402 const IVInc &Head = Chain.Incs[0];
3403 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3404 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3405 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3406 IVOpEnd, L, SE);
3407 Value *IVSrc = nullptr;
3408 while (IVOpIter != IVOpEnd) {
3409 IVSrc = getWideOperand(*IVOpIter);
3410
3411 // If this operand computes the expression that the chain needs, we may use
3412 // it. (Check this after setting IVSrc which is used below.)
3413 //
3414 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3415 // narrow for the chain, so we can no longer use it. We do allow using a
3416 // wider phi, assuming the LSR checked for free truncation. In that case we
3417 // should already have a truncate on this operand such that
3418 // getSCEV(IVSrc) == IncExpr.
3419 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3420 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3421 break;
3422 }
3423 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3424 }
3425 if (IVOpIter == IVOpEnd) {
3426 // Gracefully give up on this chain.
3427 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3428 return;
3429 }
3430 assert(IVSrc && "Failed to find IV chain source");
3431
3432 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3433 Type *IVTy = IVSrc->getType();
3434 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3435 const SCEV *LeftOverExpr = nullptr;
3436 const SCEV *Accum = SE.getZero(IntTy);
3438 Bases.emplace_back(Accum, IVSrc);
3439
3440 for (const IVInc &Inc : Chain) {
3441 Instruction *InsertPt = Inc.UserInst;
3442 if (isa<PHINode>(InsertPt))
3443 InsertPt = L->getLoopLatch()->getTerminator();
3444
3445 // IVOper will replace the current IV User's operand. IVSrc is the IV
3446 // value currently held in a register.
3447 Value *IVOper = IVSrc;
3448 if (!Inc.IncExpr->isZero()) {
3449 // IncExpr was the result of subtraction of two narrow values, so must
3450 // be signed.
3451 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3452 Accum = SE.getAddExpr(Accum, IncExpr);
3453 LeftOverExpr = LeftOverExpr ?
3454 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3455 }
3456
3457 // Look through each base to see if any can produce a nice addressing mode.
3458 bool FoundBase = false;
3459 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3460 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3461 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3462 if (!Remainder->isZero()) {
3463 Rewriter.clearPostInc();
3464 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3465 const SCEV *IVOperExpr =
3466 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3467 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3468 } else {
3469 IVOper = MapIVOper;
3470 }
3471
3472 FoundBase = true;
3473 break;
3474 }
3475 }
3476 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3477 // Expand the IV increment.
3478 Rewriter.clearPostInc();
3479 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3480 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3481 SE.getUnknown(IncV));
3482 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3483
3484 // If an IV increment can't be folded, use it as the next IV value.
3485 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3486 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3487 Bases.emplace_back(Accum, IVOper);
3488 IVSrc = IVOper;
3489 LeftOverExpr = nullptr;
3490 }
3491 }
3492 Type *OperTy = Inc.IVOperand->getType();
3493 if (IVTy != OperTy) {
3494 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3495 "cannot extend a chained IV");
3496 IRBuilder<> Builder(InsertPt);
3497 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3498 }
3499 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3500 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3501 DeadInsts.emplace_back(OperandIsInstr);
3502 }
3503 // If LSR created a new, wider phi, we may also replace its postinc. We only
3504 // do this if we also found a wide value for the head of the chain.
3505 if (isa<PHINode>(Chain.tailUserInst())) {
3506 for (PHINode &Phi : L->getHeader()->phis()) {
3507 if (Phi.getType() != IVSrc->getType())
3508 continue;
3510 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3511 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3512 continue;
3513 Value *IVOper = IVSrc;
3514 Type *PostIncTy = PostIncV->getType();
3515 if (IVTy != PostIncTy) {
3516 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3517 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3518 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3519 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3520 }
3521 Phi.replaceUsesOfWith(PostIncV, IVOper);
3522 DeadInsts.emplace_back(PostIncV);
3523 }
3524 }
3525}
3526
3527void LSRInstance::CollectFixupsAndInitialFormulae() {
3528 CondBrInst *ExitBranch = nullptr;
3529 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3530
3531 // For calculating baseline cost
3532 SmallPtrSet<const SCEV *, 16> Regs;
3533 DenseSet<const SCEV *> VisitedRegs;
3534 DenseSet<size_t> VisitedLSRUse;
3535
3536 for (const IVStrideUse &U : IU) {
3537 Instruction *UserInst = U.getUser();
3538 // Skip IV users that are part of profitable IV Chains.
3539 User::op_iterator UseI =
3540 find(UserInst->operands(), U.getOperandValToReplace());
3541 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3542 if (IVIncSet.count(UseI)) {
3543 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3544 continue;
3545 }
3546
3547 LSRUse::KindType Kind = LSRUse::Basic;
3548 MemAccessTy AccessTy;
3549 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3550 Kind = LSRUse::Address;
3551 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3552 }
3553
3554 const SCEV *S = IU.getExpr(U);
3555 if (!S)
3556 continue;
3557 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3558
3559 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3560 // (N - i == 0), and this allows (N - i) to be the expression that we work
3561 // with rather than just N or i, so we can consider the register
3562 // requirements for both N and i at the same time. Limiting this code to
3563 // equality icmps is not a problem because all interesting loops use
3564 // equality icmps, thanks to IndVarSimplify.
3565 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3566 // If CI can be saved in some target, like replaced inside hardware loop
3567 // in PowerPC, no need to generate initial formulae for it.
3568 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3569 continue;
3570 if (CI->isEquality()) {
3571 // Swap the operands if needed to put the OperandValToReplace on the
3572 // left, for consistency.
3573 Value *NV = CI->getOperand(1);
3574 if (NV == U.getOperandValToReplace()) {
3575 CI->setOperand(1, CI->getOperand(0));
3576 CI->setOperand(0, NV);
3577 NV = CI->getOperand(1);
3578 Changed = true;
3579 }
3580
3581 // x == y --> x - y == 0
3582 const SCEV *N = SE.getSCEV(NV);
3583 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3584 (!NV->getType()->isPointerTy() ||
3585 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3586 // S is normalized, so normalize N before folding it into S
3587 // to keep the result normalized.
3588 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3589 if (!N)
3590 continue;
3591 Kind = LSRUse::ICmpZero;
3592 S = SE.getMinusSCEV(N, S);
3593 } else if (L->isLoopInvariant(NV) &&
3594 (!isa<Instruction>(NV) ||
3595 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3596 !NV->getType()->isPointerTy()) {
3597 // If we can't generally expand the expression (e.g. it contains
3598 // a divide), but it is already at a loop invariant point before the
3599 // loop, wrap it in an unknown (to prevent the expander from trying
3600 // to re-expand in a potentially unsafe way.) The restriction to
3601 // integer types is required because the unknown hides the base, and
3602 // SCEV can't compute the difference of two unknown pointers.
3603 N = SE.getUnknown(NV);
3604 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3605 if (!N)
3606 continue;
3607 Kind = LSRUse::ICmpZero;
3608 S = SE.getMinusSCEV(N, S);
3610 }
3611
3612 // -1 and the negations of all interesting strides (except the negation
3613 // of -1) are now also interesting.
3614 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3615 if (Factors[i] != -1)
3616 Factors.insert(-(uint64_t)Factors[i]);
3617 Factors.insert(-1);
3618 }
3619 }
3620
3621 // Get or create an LSRUse.
3622 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3623 size_t LUIdx = P.first;
3624 Immediate Offset = P.second;
3625 LSRUse &LU = Uses[LUIdx];
3626
3627 // Record the fixup.
3628 LSRFixup &LF = LU.getNewFixup();
3629 LF.UserInst = UserInst;
3630 LF.OperandValToReplace = U.getOperandValToReplace();
3631 LF.PostIncLoops = TmpPostIncLoops;
3632 LF.Offset = Offset;
3633 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3634 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3635
3636 // Create SCEV as Formula for calculating baseline cost
3637 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3638 Formula F;
3639 F.initialMatch(S, L, SE);
3640 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3641 HardwareLoopProfitable);
3642 VisitedLSRUse.insert(LUIdx);
3643 }
3644
3645 // If this is the first use of this LSRUse, give it a formula.
3646 if (LU.Formulae.empty()) {
3647 InsertInitialFormula(S, LU, LUIdx);
3648 CountRegisters(LU.Formulae.back(), LUIdx);
3649 }
3650 }
3651
3652 LLVM_DEBUG(print_fixups(dbgs()));
3653}
3654
3655/// Insert a formula for the given expression into the given use, separating out
3656/// loop-variant portions from loop-invariant and loop-computable portions.
3657void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3658 size_t LUIdx) {
3659 // Mark uses whose expressions cannot be expanded.
3660 if (!Rewriter.isSafeToExpand(S))
3661 LU.RigidFormula = true;
3662
3663 Formula F;
3664 F.initialMatch(S, L, SE);
3665 bool Inserted = InsertFormula(LU, LUIdx, F);
3666 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3667}
3668
3669/// Insert a simple single-register formula for the given expression into the
3670/// given use.
3671void
3672LSRInstance::InsertSupplementalFormula(const SCEV *S,
3673 LSRUse &LU, size_t LUIdx) {
3674 Formula F;
3675 F.BaseRegs.push_back(S);
3676 F.HasBaseReg = true;
3677 bool Inserted = InsertFormula(LU, LUIdx, F);
3678 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3679}
3680
3681/// Note which registers are used by the given formula, updating RegUses.
3682void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3683 if (F.ScaledReg)
3684 RegUses.countRegister(F.ScaledReg, LUIdx);
3685 for (const SCEV *BaseReg : F.BaseRegs)
3686 RegUses.countRegister(BaseReg, LUIdx);
3687}
3688
3689/// If the given formula has not yet been inserted, add it to the list, and
3690/// return true. Return false otherwise.
3691bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3692 // Do not insert formula that we will not be able to expand.
3693 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3694 "Formula is illegal");
3695
3696 if (!LU.InsertFormula(F, *L))
3697 return false;
3698
3699 CountRegisters(F, LUIdx);
3700 return true;
3701}
3702
3703/// Test whether this fixup will be executed each time the corresponding IV
3704/// increment instruction is executed.
3705bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3706 // If the fixup block dominates the IV increment block then there is no path
3707 // through the loop to the increment that doesn't pass through the fixup.
3708 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3709}
3710
/// Check for other uses of loop-invariant values which we're tracking. These
/// other uses will pin these values in registers, making them less profitable
/// for elimination.
/// TODO: This currently misses non-constant addrec step registers.
/// TODO: Should this give more weight to users inside the loop?
void
LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
  // Seed the worklist with every register already referenced by some formula.
  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
  SmallPtrSet<const SCEV *, 32> Visited;

  // Don't collect outside uses if we are favoring postinc - the instructions in
  // the loop are more important than the ones outside of it.
  if (AMK == TTI::AMK_PostIndexed)
    return;

  while (!Worklist.empty()) {
    const SCEV *S = Worklist.pop_back_val();

    // Don't process the same SCEV twice
    if (!Visited.insert(S).second)
      continue;

    // Recurse into composite expressions; only SCEVUnknowns (plain Values)
    // are examined for outside uses below.
    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
      append_range(Worklist, N->operands());
    else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
      Worklist.push_back(C->getOperand());
    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
      Worklist.push_back(D->getLHS());
      Worklist.push_back(D->getRHS());
    } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
      const Value *V = US->getValue();
      if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
        // Look for instructions defined outside the loop.
        if (L->contains(Inst)) continue;
      } else if (isa<Constant>(V))
        // Constants can be re-materialized.
        continue;
      for (const Use &U : V->uses()) {
        const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
        // Ignore non-instructions.
        if (!UserInst)
          continue;
        // Don't bother if the instruction is an EHPad.
        if (UserInst->isEHPad())
          continue;
        // Ignore instructions in other functions (as can happen with
        // Constants).
        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
          continue;
        // Ignore instructions not dominated by the loop.
        // NOTE(review): the getIncomingBlock(...) call below appears to have
        // lost its argument line in this copy — presumably the use's operand
        // number; confirm against the upstream sources.
        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
          UserInst->getParent() :
          cast<PHINode>(UserInst)->getIncomingBlock(
        if (!DT.dominates(L->getHeader(), UseBB))
          continue;
        // Don't bother if the instruction is in a BB which ends in an EHPad.
        if (UseBB->getTerminator()->isEHPad())
          continue;

        // Ignore cases in which the currently-examined value could come from
        // a basic block terminated with an EHPad. This checks all incoming
        // blocks of the phi node since it is possible that the same incoming
        // value comes from multiple basic blocks, only some of which may end
        // in an EHPad. If any of them do, a subsequent rewrite attempt by this
        // pass would try to insert instructions into an EHPad, hitting an
        // assertion.
        if (isa<PHINode>(UserInst)) {
          const auto *PhiNode = cast<PHINode>(UserInst);
          bool HasIncompatibleEHPTerminatedBlock = false;
          llvm::Value *ExpectedValue = U;
          for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
            if (PhiNode->getIncomingValue(I) == ExpectedValue) {
              if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
                HasIncompatibleEHPTerminatedBlock = true;
                break;
              }
            }
          }
          if (HasIncompatibleEHPTerminatedBlock) {
            continue;
          }
        }

        // Don't bother rewriting PHIs in catchswitch blocks.
        if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
          continue;
        // Ignore uses which are part of other SCEV expressions, to avoid
        // analyzing them multiple times.
        if (SE.isSCEVable(UserInst->getType())) {
          const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
          // If the user is a no-op, look through to its uses.
          if (!isa<SCEVUnknown>(UserS))
            continue;
          if (UserS == US) {
            Worklist.push_back(
                SE.getUnknown(const_cast<Instruction *>(UserInst)));
            continue;
          }
        }
        // Ignore icmp instructions which are already being analyzed.
        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
          unsigned OtherIdx = !U.getOperandNo();
          Value *OtherOp = ICI->getOperand(OtherIdx);
          if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
            continue;
        }

        // Do not consider uses inside lifetime intrinsics. These are not
        // actually materialized.
        if (UserInst->isLifetimeStartOrEnd())
          continue;

        // Record a fixup + use for this outside user, with a simple
        // single-register formula.
        std::pair<size_t, Immediate> P =
          getUse(S, LSRUse::Basic, MemAccessTy());
        size_t LUIdx = P.first;
        Immediate Offset = P.second;
        LSRUse &LU = Uses[LUIdx];
        LSRFixup &LF = LU.getNewFixup();
        LF.UserInst = const_cast<Instruction *>(UserInst);
        LF.OperandValToReplace = U;
        LF.Offset = Offset;
        LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
        LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
        InsertSupplementalFormula(US, LU, LUIdx);
        // NOTE(review): registers are attributed to Uses.size() - 1 rather
        // than LUIdx, which differs when getUse returned an existing use —
        // confirm this is intentional.
        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
        // Stop after the first suitable user of this value.
        break;
      }
    }
  }
}
3842
/// Split S into subexpressions which can be pulled out into separate
/// registers. If C is non-null, multiply each subexpression by C.
///
/// Return remainder expression after factoring the subexpressions captured by
/// Ops. If Ops is complete, return NULL.
///
/// NOTE(review): the parameter list appears to have lost a line in this copy
/// — the body pushes into an `Ops` collection declared by no visible
/// parameter; presumably a `SmallVectorImpl<...> &Ops` parameter belongs
/// between C and L. Confirm against the upstream sources.
static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
                                   const Loop *L,
                                   ScalarEvolution &SE,
                                   unsigned Depth = 0) {
  // Arbitrarily cap recursion to protect compile time.
  if (Depth >= 3)
    return S;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // Break out add operands.
    for (const SCEV *S : Add->operands()) {
      const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
      if (Remainder)
        Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
    }
    return nullptr;
  }
  const SCEV *Start, *Step;
  const SCEVConstant *Op0;
  const SCEV *Op1;
  if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
    // Split a non-zero base out of an addrec.
    if (Start->isZero())
      return S;

    const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
    // Split the non-zero AddRec unless it is part of a nested recurrence that
    // does not pertain to this loop.
    if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
                      !isa<SCEVAddRecExpr>(Remainder))) {
      Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
      Remainder = nullptr;
    }
    // Rebuild the addrec over whatever part of the start was not factored
    // out (zero if everything was captured in Ops).
    if (Remainder != Start) {
      if (!Remainder)
        Remainder = SE.getConstant(S->getType(), 0);
      return SE.getAddRecExpr(Remainder, Step,
                              cast<SCEVAddRecExpr>(S)->getLoop(),
                              // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                              // NOTE(review): the final wrap-flags argument
                              // of this call is missing from this copy —
                              // confirm against the upstream sources.
    }
  } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
    // Break (C * (a + b + c)) into C*a + C*b + C*c.
    C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
    const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
    if (Remainder)
      Ops.push_back(SE.getMulExpr(C, Remainder));
    return nullptr;
  }
  // Nothing could be split out; the whole expression is the remainder.
  return S;
}
3900
/// Return true if the SCEV represents a value that may end up as a
/// post-increment operation.
/// NOTE(review): the first line of this function's declaration (return type,
/// name, and leading TTI parameter) is missing from this copy — presumably
/// `static bool mayUsePostIncMode(const TargetTransformInfo &TTI,`; confirm
/// against the upstream sources.
                              LSRUse &LU, const SCEV *S, const Loop *L,
                              ScalarEvolution &SE) {
  // Only address uses of integer (or integer-vector) type are candidates.
  if (LU.Kind != LSRUse::Address ||
      !LU.AccessTy.getType()->isIntOrIntVectorTy())
    return false;
  const SCEV *Start;
  // The address must be an affine addrec with a constant step.
  if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
    return false;
  // Check if a post-indexed load/store can be used.
  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
      TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
    // Only non-constant, loop-invariant start values qualify.
    if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
      return true;
  }
  return false;
}
3920
/// Helper function for LSRInstance::GenerateReassociations. Splits the
/// register at \p Idx (or the scaled register) into add-operands and, for
/// each operand, emits a formula carrying that operand as its own register
/// (or as an unfolded immediate).
void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
                                             const Formula &Base,
                                             unsigned Depth, size_t Idx,
                                             bool IsScaledReg) {
  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
  // Don't generate reassociations for the base register of a value that
  // may generate a post-increment operator. The reason is that the
  // reassociations cause extra base+register formula to be created,
  // and possibly chosen, but the post-increment is more efficient.
  if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
    return;
  // NOTE(review): the declaration of `AddOps` (presumably a
  // SmallVector of SCEV operands) appears to be missing from this copy;
  // confirm against the upstream sources.
  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
  if (Remainder)
    AddOps.push_back(Remainder);

  // A single operand offers nothing to reassociate.
  if (AddOps.size() == 1)
    return;

  // NOTE(review): the first line of this for-statement (initializing `J` to
  // AddOps.begin()) appears to be missing from this copy.
                               JE = AddOps.end();
       J != JE; ++J) {
    // Loop-variant "unknown" values are uninteresting; we won't be able to
    // do anything meaningful with them.
    if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
      continue;

    // Don't pull a constant into a register if the constant could be folded
    // into an immediate field.
    if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
                         LU.AccessTy, *J, Base.getNumRegs() > 1))
      continue;

    // Collect all operands except *J.
    SmallVector<SCEVUse, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
    InnerAddOps.append(std::next(J), std::as_const(AddOps).end());

    // Don't leave just a constant behind in a register if the constant could
    // be folded into an immediate field.
    if (InnerAddOps.size() == 1 &&
        isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
                         LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
      continue;

    const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
    if (InnerSum->isZero())
      continue;
    Formula F = Base;

    // Scalable unfolded offsets cannot be combined with the immediate
    // folding below.
    if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
      continue;

    // Add the remaining pieces of the add back into the new formula.
    const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
    if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                InnerSumSC->getValue()->getZExtValue())) {
      F.UnfoldedOffset =
          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                              InnerSumSC->getValue()->getZExtValue());
      if (IsScaledReg) {
        F.ScaledReg = nullptr;
        F.Scale = 0;
      } else
        F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
    } else if (IsScaledReg)
      F.ScaledReg = InnerSum;
    else
      F.BaseRegs[Idx] = InnerSum;

    // Add J as its own register, or an unfolded immediate.
    const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
    if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
        TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                SC->getValue()->getZExtValue()))
      F.UnfoldedOffset =
          Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                              SC->getValue()->getZExtValue());
    else
      F.BaseRegs.push_back(*J);
    // We may have changed the number of register in base regs, adjust the
    // formula accordingly.
    F.canonicalize(*L);

    if (InsertFormula(LU, LUIdx, F))
      // If that formula hadn't been seen before, recurse to find more like
      // it.
      // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
      // Because just Depth is not enough to bound compile time.
      // This means that every time AddOps.size() is greater 16^x we will add
      // x to Depth.
      GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
                             Depth + 1 + (Log2_32(AddOps.size()) >> 2));
  }
}
4017
4018/// Split out subexpressions from adds and the bases of addrecs.
4019void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4020 Formula Base, unsigned Depth) {
4021 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4022 // Arbitrarily cap recursion to protect compile time.
4023 if (Depth >= 3)
4024 return;
4025
4026 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4027 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4028
4029 if (Base.Scale == 1)
4030 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4031 /* Idx */ -1, /* IsScaledReg */ true);
4032}
4033
/// Generate a formula consisting of all of the loop-dominating registers added
/// into a single register.
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
                                       Formula Base) {
  // This method is only interesting on a plurality of registers.
  if (Base.BaseRegs.size() + (Base.Scale == 1) +
      (Base.UnfoldedOffset.isNonZero()) <=
      1)
    return;

  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
  // processing the formula.
  Base.unscale();
  // NOTE(review): the declaration of `Ops` (presumably a small vector of
  // SCEV operands) appears to be missing from this copy; confirm against the
  // upstream sources.
  Formula NewBase = Base;
  NewBase.BaseRegs.clear();
  Type *CombinedIntegerType = nullptr;
  // Partition the base registers: those whose value properly dominates the
  // loop header and has no computable evolution in this loop are collected
  // in Ops to be summed; all others stay in NewBase.
  for (const SCEV *BaseReg : Base.BaseRegs) {
    if (SE.properlyDominates(BaseReg, L->getHeader()) &&
        !SE.hasComputableLoopEvolution(BaseReg, L)) {
      if (!CombinedIntegerType)
        CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
      Ops.push_back(BaseReg);
    }
    else
      NewBase.BaseRegs.push_back(BaseReg);
  }

  // If no register is relevant, we're done.
  if (Ops.size() == 0)
    return;

  // Utility function for generating the required variants of the combined
  // registers.
  auto GenerateFormula = [&](const SCEV *Sum) {
    Formula F = NewBase;

    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
    // opportunity to fold something. For now, just ignore such cases
    // rather than proceed with zero in a register.
    if (Sum->isZero())
      return;

    F.BaseRegs.push_back(Sum);
    F.canonicalize(*L);
    (void)InsertFormula(LU, LUIdx, F);
  };

  // If we collected at least two registers, generate a formula combining them.
  if (Ops.size() > 1) {
    SmallVector<SCEVUse, 4> OpsCopy(Ops); // Don't let SE modify Ops.
    GenerateFormula(SE.getAddExpr(OpsCopy));
  }

  // If we have an unfolded offset, generate a formula combining it with the
  // registers collected.
  if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
    assert(CombinedIntegerType && "Missing a type for the unfolded offset");
    Ops.push_back(SE.getConstant(CombinedIntegerType,
                                 NewBase.UnfoldedOffset.getFixedValue(), true));
    NewBase.UnfoldedOffset = Immediate::getFixed(0);
    GenerateFormula(SE.getAddExpr(Ops));
  }
}
4098
4099/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4100void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4101 const Formula &Base, size_t Idx,
4102 bool IsScaledReg) {
4103 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4104 GlobalValue *GV = ExtractSymbol(G, SE);
4105 if (G->isZero() || !GV)
4106 return;
4107 Formula F = Base;
4108 F.BaseGV = GV;
4109 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4110 return;
4111 if (IsScaledReg)
4112 F.ScaledReg = G;
4113 else
4114 F.BaseRegs[Idx] = G;
4115 (void)InsertFormula(LU, LUIdx, F);
4116}
4117
4118/// Generate reuse formulae using symbolic offsets.
4119void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4120 Formula Base) {
4121 // We can't add a symbolic offset if the address already contains one.
4122 if (Base.BaseGV) return;
4123
4124 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4125 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4126 if (Base.Scale == 1)
4127 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4128 /* IsScaledReg */ true);
4129}
4130
/// Helper function for LSRInstance::GenerateConstantOffsets. For the register
/// at \p Idx (or the scaled register), try moving each candidate offset in
/// \p Worklist between the register and the formula's BaseOffset, and also
/// try pulling an immediate summand out of the register into BaseOffset.
void LSRInstance::GenerateConstantOffsetsImpl(
    LSRUse &LU, unsigned LUIdx, const Formula &Base,
    const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {

  // Move Offset out of the register and into BaseOffset, inserting the
  // resulting formula when it is still a legal addressing mode.
  auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
    Formula F = Base;
    // Fixed and scalable immediates cannot be combined.
    if (!Base.BaseOffset.isCompatibleImmediate(Offset))
      return;
    F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);

    if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
      // Add the offset to the base register.
      const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
      const SCEV *NewG = SE.getAddExpr(NewOffset, G);
      // If it cancelled out, drop the base register, otherwise update it.
      if (NewG->isZero()) {
        if (IsScaledReg) {
          F.Scale = 0;
          F.ScaledReg = nullptr;
        } else
          F.deleteBaseReg(F.BaseRegs[Idx]);
        F.canonicalize(*L);
      } else if (IsScaledReg)
        F.ScaledReg = NewG;
      else
        F.BaseRegs[Idx] = NewG;

      (void)InsertFormula(LU, LUIdx, F);
    }
  };

  SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];

  // With constant offsets and constant steps, we can generate pre-inc
  // accesses by having the offset equal the step. So, for access #0 with a
  // step of 8, we generate a G - 8 base which would require the first access
  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
  // for itself and hopefully becomes the base for other accesses. This means
  // means that a single pre-indexed access can be generated to become the new
  // base pointer for each iteration of the loop, resulting in no extra add/sub
  // instructions for pointer updating.
  if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
    const APInt *StepInt;
    if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
      int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
                                           : StepInt->getZExtValue();

      // Only fixed (non-scalable) offsets can be adjusted by the step.
      for (Immediate Offset : Worklist) {
        if (Offset.isFixed()) {
          Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
          GenerateOffset(G, Offset);
        }
      }
    }
  }
  for (Immediate Offset : Worklist)
    GenerateOffset(G, Offset);

  // Conversely, strip a constant summand out of the register (ExtractImmediate
  // presumably rewrites G in place — the stripped G is reused below) and fold
  // it into the formula's offset.
  Immediate Imm = ExtractImmediate(G, SE);
  if (G->isZero() || Imm.isZero() ||
      !Base.BaseOffset.isCompatibleImmediate(Imm))
    return;
  Formula F = Base;
  F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
    return;
  if (IsScaledReg) {
    F.ScaledReg = G;
  } else {
    F.BaseRegs[Idx] = G;
    // We may generate non canonical Formula if G is a recurrent expr reg
    // related with current loop while F.ScaledReg is not.
    F.canonicalize(*L);
  }
  (void)InsertFormula(LU, LUIdx, F);
}
4208
/// GenerateConstantOffsets - Generate reuse formulae using constant offsets:
/// try folding the use's minimum and maximum offsets into each base register
/// (and the scaled register).
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // TODO: For now, just add the min and max offset, because it usually isn't
  // worthwhile looking at everything inbetween.
  // NOTE(review): the declaration of `Worklist` (presumably a small vector
  // of Immediate) appears to be missing from this copy; confirm against the
  // upstream sources.
  Worklist.push_back(LU.MinOffset);
  if (LU.MaxOffset != LU.MinOffset)
    Worklist.push_back(LU.MaxOffset);

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
  if (Base.Scale == 1)
    GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
                                /* IsScaledReg */ true);
}
4225
/// For ICmpZero, check to see if we can scale up the comparison. For example, x
/// == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
                                         Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;

  // Don't do this if there is more than one offset.
  if (LU.MinOffset != LU.MaxOffset) return;

  // Check if transformation is valid. It is illegal to multiply pointer.
  if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
    return;
  for (const SCEV *BaseReg : Base.BaseRegs)
    if (BaseReg->getType()->isPointerTy())
      return;
  assert(!Base.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    // Check that Factor can be represented by IntTy
    if (!ConstantInt::isValueValidForType(IntTy, Factor))
      continue;
    // Check that the multiplication doesn't overflow.
    if (Base.BaseOffset.isMin() && Factor == -1)
      continue;
    // Not supporting scalable immediates.
    if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
      continue;
    Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
    assert(Factor != 0 && "Zero factor not expected!");
    // Division-based overflow check: (x * Factor) / Factor must recover x.
    if (NewBaseOffset.getFixedValue() / Factor !=
        Base.BaseOffset.getFixedValue())
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
      continue;

    // Check that multiplying with the use offset doesn't overflow.
    Immediate Offset = LU.MinOffset;
    if (Offset.isMin() && Factor == -1)
      continue;
    Offset = Offset.mulUnsigned(Factor);
    if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
      continue;
    // If the offset will be truncated at this use, check that it is in bounds.
    if (!IntTy->isPointerTy() &&
        !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
      continue;

    Formula F = Base;
    F.BaseOffset = NewBaseOffset;

    // Check that this scale is legal.
    if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
      continue;

    // Compensate for the use having MinOffset built into it.
    F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);

    const SCEV *FactorS = SE.getConstant(IntTy, Factor);

    // Check that multiplying with each base register doesn't overflow.
    // (The goto abandons this Factor from inside the inner loop.)
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
      if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
      if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // Check that multiplying with the unfolded offset doesn't overflow.
    if (F.UnfoldedOffset.isNonZero()) {
      if (F.UnfoldedOffset.isMin() && Factor == -1)
        continue;
      F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
      if (F.UnfoldedOffset.getFixedValue() / Factor !=
          Base.UnfoldedOffset.getFixedValue())
        continue;
      // If the offset will be truncated, check that it is in bounds.
      // NOTE(review): the opening line of this condition (presumably
      // `if (!ConstantInt::isValueValidForType(`) is missing from this copy;
      // confirm against the upstream sources.
              IntTy, F.UnfoldedOffset.getFixedValue()))
        continue;
    }

    // If we make it here and it's legal, add it.
    (void)InsertFormula(LU, LUIdx, F);
  next:;
  }
}
4326
/// Generate stride factor reuse formulae by making use of scaled-offset address
/// modes, for example.
///
/// For each interesting stride Factor, this tries to move one addrec base
/// register into the scaled-register slot as (reg / Factor) with
/// Scale = Factor, when the target considers the resulting use legal.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  // Try to unscale the formula to generate a better scale.
  if (Base.Scale != 0 && !Base.unscale())
    return;

  assert(Base.Scale == 0 && "unscale did not did its job!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    // Note: Base is mutated in place across iterations; Scale is overwritten
    // each time through the loop.
    Base.Scale = Factor;
    Base.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                    Base)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
                     LU.AccessTy, Base) &&
          LU.AllFixupsOutsideLoop)
        // Side effect: the use itself is reclassified for all later queries.
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
        Base.BaseOffset.isZero() && !Base.BaseGV)
      continue;
    // For each addrec base reg, if its loop is current loop, apply the scale.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
          if (!Quotient->isZero()) {
            // TODO: This could be optimized to avoid all the copying.
            Formula F = Base;
            F.ScaledReg = Quotient;
            F.deleteBaseReg(F.BaseRegs[i]);
            // The canonical representation of 1*reg is reg, which is already in
            // Base. In that case, do not try to insert the formula, it will be
            // rejected anyway.
            if (F.Scale == 1 && (F.BaseRegs.empty() ||
                                 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
              continue;
            // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
            // non canonical Formula with ScaledReg's loop not being L.
            if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
              F.canonicalize(*L);
            (void)InsertFormula(LU, LUIdx, F);
          }
      }
    }
  }
}
4394
4395/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4396/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4397/// perform the extension/truncate and normalize again, as the normalized form
4398/// can result in folds that are not valid in the post-inc use contexts. The
4399/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4400static const SCEV *
4402 const SCEV *Expr, Type *ToTy,
4403 ScalarEvolution &SE) {
4404 const SCEV *Result = nullptr;
4405 for (auto &L : Loops) {
4406 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4407 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4408 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4409 if (!New || (Result && New != Result))
4410 return nullptr;
4411 Result = New;
4412 }
4413
4414 assert(Result && "failed to create expression");
4415 return Result;
4416}
4417
4418/// Generate reuse formulae from different IV types.
4419void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4420 // Don't bother truncating symbolic values.
4421 if (Base.BaseGV) return;
4422
4423 // Determine the integer type for the base formula.
4424 Type *DstTy = Base.getType();
4425 if (!DstTy) return;
4426 if (DstTy->isPointerTy())
4427 return;
4428
4429 // It is invalid to extend a pointer type so exit early if ScaledReg or
4430 // any of the BaseRegs are pointers.
4431 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4432 return;
4433 if (any_of(Base.BaseRegs,
4434 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4435 return;
4436
4438 for (auto &LF : LU.Fixups)
4439 Loops.push_back(LF.PostIncLoops);
4440
4441 for (Type *SrcTy : Types) {
4442 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4443 Formula F = Base;
4444
4445 // Sometimes SCEV is able to prove zero during ext transform. It may
4446 // happen if SCEV did not do all possible transforms while creating the
4447 // initial node (maybe due to depth limitations), but it can do them while
4448 // taking ext.
4449 if (F.ScaledReg) {
4450 const SCEV *NewScaledReg =
4451 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4452 if (!NewScaledReg || NewScaledReg->isZero())
4453 continue;
4454 F.ScaledReg = NewScaledReg;
4455 }
4456 bool HasZeroBaseReg = false;
4457 for (const SCEV *&BaseReg : F.BaseRegs) {
4458 const SCEV *NewBaseReg =
4459 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4460 if (!NewBaseReg || NewBaseReg->isZero()) {
4461 HasZeroBaseReg = true;
4462 break;
4463 }
4464 BaseReg = NewBaseReg;
4465 }
4466 if (HasZeroBaseReg)
4467 continue;
4468
4469 // TODO: This assumes we've done basic processing on all uses and
4470 // have an idea what the register usage is.
4471 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4472 continue;
4473
4474 F.canonicalize(*L);
4475 (void)InsertFormula(LU, LUIdx, F);
4476 }
4477 }
4478}
4479
namespace {

/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
/// modifications so that the search phase doesn't have to worry about the data
/// structures moving underneath it.
struct WorkItem {
  size_t LUIdx;        // Index of the LSRUse whose formulae get rewritten.
  Immediate Imm;       // Constant offset to apply.
  const SCEV *OrigReg; // Register the offset is applied relative to.

  WorkItem(size_t LI, Immediate I, const SCEV *R)
      : LUIdx(LI), Imm(I), OrigReg(R) {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace
4498
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug pretty-printer: describes the deferred cross-use offset rewrite.
void WorkItem::print(raw_ostream &OS) const {
  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
     << " , add offset " << Imm;
}

LLVM_DUMP_METHOD void WorkItem::dump() const {
  print(errs()); errs() << '\n';
}
#endif
4509
4510/// Look for registers which are a constant distance apart and try to form reuse
4511/// opportunities between them.
4512void LSRInstance::GenerateCrossUseConstantOffsets() {
4513 // Group the registers by their value without any added constant offset.
4514 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4515
4516 DenseMap<const SCEV *, ImmMapTy> Map;
4517 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4519 for (const SCEV *Use : RegUses) {
4520 SCEVUse Reg = Use; // Make a copy for ExtractImmediate to modify.
4521 Immediate Imm = ExtractImmediate(Reg, SE);
4522 auto Pair = Map.try_emplace(Reg);
4523 if (Pair.second)
4524 Sequence.push_back(Reg);
4525 Pair.first->second.insert(std::make_pair(Imm, Use));
4526 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4527 }
4528
4529 // Now examine each set of registers with the same base value. Build up
4530 // a list of work to do and do the work in a separate step so that we're
4531 // not adding formulae and register counts while we're searching.
4532 SmallVector<WorkItem, 32> WorkItems;
4533 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4534 UniqueItems;
4535 for (const SCEV *Reg : Sequence) {
4536 const ImmMapTy &Imms = Map.find(Reg)->second;
4537
4538 // It's not worthwhile looking for reuse if there's only one offset.
4539 if (Imms.size() == 1)
4540 continue;
4541
4542 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4543 for (const auto &Entry
4544 : Imms) dbgs()
4545 << ' ' << Entry.first;
4546 dbgs() << '\n');
4547
4548 // Examine each offset.
4549 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4550 J != JE; ++J) {
4551 const SCEV *OrigReg = J->second;
4552
4553 Immediate JImm = J->first;
4554 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4555
4556 if (!isa<SCEVConstant>(OrigReg) &&
4557 UsedByIndicesMap[Reg].count() == 1) {
4558 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4559 << '\n');
4560 continue;
4561 }
4562
4563 // Conservatively examine offsets between this orig reg a few selected
4564 // other orig regs.
4565 Immediate First = Imms.begin()->first;
4566 Immediate Last = std::prev(Imms.end())->first;
4567 if (!First.isCompatibleImmediate(Last)) {
4568 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4569 << "\n");
4570 continue;
4571 }
4572 // Only scalable if both terms are scalable, or if one is scalable and
4573 // the other is 0.
4574 bool Scalable = First.isScalable() || Last.isScalable();
4575 int64_t FI = First.getKnownMinValue();
4576 int64_t LI = Last.getKnownMinValue();
4577 // Compute (First + Last) / 2 without overflow using the fact that
4578 // First + Last = 2 * (First + Last) + (First ^ Last).
4579 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4580 // If the result is negative and FI is odd and LI even (or vice versa),
4581 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4582 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4583 ImmMapTy::const_iterator OtherImms[] = {
4584 Imms.begin(), std::prev(Imms.end()),
4585 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4586 for (const auto &M : OtherImms) {
4587 if (M == J || M == JE) continue;
4588 if (!JImm.isCompatibleImmediate(M->first))
4589 continue;
4590
4591 // Compute the difference between the two.
4592 Immediate Imm = JImm.subUnsigned(M->first);
4593 for (unsigned LUIdx : UsedByIndices.set_bits())
4594 // Make a memo of this use, offset, and register tuple.
4595 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4596 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4597 }
4598 }
4599 }
4600
4601 Map.clear();
4602 Sequence.clear();
4603 UsedByIndicesMap.clear();
4604 UniqueItems.clear();
4605
4606 // Now iterate through the worklist and add new formulae.
4607 for (const WorkItem &WI : WorkItems) {
4608 size_t LUIdx = WI.LUIdx;
4609 LSRUse &LU = Uses[LUIdx];
4610 Immediate Imm = WI.Imm;
4611 const SCEV *OrigReg = WI.OrigReg;
4612
4613 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4614 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4615 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4616
4617 // TODO: Use a more targeted data structure.
4618 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4619 Formula F = LU.Formulae[L];
4620 // FIXME: The code for the scaled and unscaled registers looks
4621 // very similar but slightly different. Investigate if they
4622 // could be merged. That way, we would not have to unscale the
4623 // Formula.
4624 F.unscale();
4625 // Use the immediate in the scaled register.
4626 if (F.ScaledReg == OrigReg) {
4627 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4628 continue;
4629 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4630 // Don't create 50 + reg(-50).
4631 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4632 if (F.referencesReg(S))
4633 continue;
4634 Formula NewF = F;
4635 NewF.BaseOffset = Offset;
4636 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4637 NewF))
4638 continue;
4639 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4640
4641 // If the new scale is a constant in a register, and adding the constant
4642 // value to the immediate would produce a value closer to zero than the
4643 // immediate itself, then the formula isn't worthwhile.
4644 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4645 // FIXME: Do we need to do something for scalable immediates here?
4646 // A scalable SCEV won't be constant, but we might still have
4647 // something in the offset? Bail out for now to be safe.
4648 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4649 continue;
4650 if (C->getValue()->isNegative() !=
4651 (NewF.BaseOffset.isLessThanZero()) &&
4652 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4653 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4654 continue;
4655 }
4656
4657 // OK, looks good.
4658 NewF.canonicalize(*this->L);
4659 (void)InsertFormula(LU, LUIdx, NewF);
4660 } else {
4661 // Use the immediate in a base register.
4662 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4663 const SCEV *BaseReg = F.BaseRegs[N];
4664 if (BaseReg != OrigReg)
4665 continue;
4666 Formula NewF = F;
4667 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4668 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4669 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4670 continue;
4671 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4672 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4673 LU.Kind, LU.AccessTy, NewF)) {
4674 if (AMK == TTI::AMK_PostIndexed &&
4675 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4676 continue;
4677 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4678 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4679 continue;
4680 NewF = F;
4681 NewF.UnfoldedOffset = NewUnfoldedOffset;
4682 }
4683 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4684
4685 // If the new formula has a constant in a register, and adding the
4686 // constant value to the immediate would produce a value closer to
4687 // zero than the immediate itself, then the formula isn't worthwhile.
4688 for (const SCEV *NewReg : NewF.BaseRegs)
4689 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4690 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4691 goto skip_formula;
4692 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4693 .abs()
4694 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4695 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4696 .countr_zero() >=
4698 NewF.BaseOffset.getFixedValue()))
4699 goto skip_formula;
4700 }
4701
4702 // Ok, looks good.
4703 NewF.canonicalize(*this->L);
4704 (void)InsertFormula(LU, LUIdx, NewF);
4705 break;
4706 skip_formula:;
4707 }
4708 }
4709 }
4710 }
4711}
4712
4713/// Generate formulae for each use.
4714void
4715LSRInstance::GenerateAllReuseFormulae() {
4716 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4717 // queries are more precise.
4718 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4719 LSRUse &LU = Uses[LUIdx];
4720 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4721 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4722 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4723 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4724 }
4725 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4726 LSRUse &LU = Uses[LUIdx];
4727 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4728 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4729 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4730 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4731 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4732 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4733 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4734 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4735 }
4736 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4737 LSRUse &LU = Uses[LUIdx];
4738 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4739 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4740 }
4741
4742 GenerateCrossUseConstantOffsets();
4743
4744 LLVM_DEBUG(dbgs() << "\n"
4745 "After generating reuse formulae:\n";
4746 print_uses(dbgs()));
4747}
4748
4749/// If there are multiple formulae with the same set of registers used
4750/// by other uses, pick the best one and delete the others.
4751void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4752 DenseSet<const SCEV *> VisitedRegs;
4753 SmallPtrSet<const SCEV *, 16> Regs;
4754 SmallPtrSet<const SCEV *, 16> LoserRegs;
4755#ifndef NDEBUG
4756 bool ChangedFormulae = false;
4757#endif
4758
4759 // Collect the best formula for each unique set of shared registers. This
4760 // is reset for each use.
4761 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4762
4763 BestFormulaeTy BestFormulae;
4764
4765 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4766 LSRUse &LU = Uses[LUIdx];
4767 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4768 dbgs() << '\n');
4769
4770 bool Any = false;
4771 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4772 FIdx != NumForms; ++FIdx) {
4773 Formula &F = LU.Formulae[FIdx];
4774
4775 // Some formulas are instant losers. For example, they may depend on
4776 // nonexistent AddRecs from other loops. These need to be filtered
4777 // immediately, otherwise heuristics could choose them over others leading
4778 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4779 // avoids the need to recompute this information across formulae using the
4780 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4781 // the corresponding bad register from the Regs set.
4782 Cost CostF(L, SE, TTI, AMK);
4783 Regs.clear();
4784 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4785 &LoserRegs);
4786 if (CostF.isLoser()) {
4787 // During initial formula generation, undesirable formulae are generated
4788 // by uses within other loops that have some non-trivial address mode or
4789 // use the postinc form of the IV. LSR needs to provide these formulae
4790 // as the basis of rediscovering the desired formula that uses an AddRec
4791 // corresponding to the existing phi. Once all formulae have been
4792 // generated, these initial losers may be pruned.
4793 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4794 dbgs() << "\n");
4795 }
4796 else {
4798 for (const SCEV *Reg : F.BaseRegs) {
4799 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4800 Key.push_back(Reg);
4801 }
4802 if (F.ScaledReg &&
4803 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4804 Key.push_back(F.ScaledReg);
4805 // Unstable sort by host order ok, because this is only used for
4806 // uniquifying.
4807 llvm::sort(Key);
4808
4809 std::pair<BestFormulaeTy::const_iterator, bool> P =
4810 BestFormulae.insert(std::make_pair(Key, FIdx));
4811 if (P.second)
4812 continue;
4813
4814 Formula &Best = LU.Formulae[P.first->second];
4815
4816 Cost CostBest(L, SE, TTI, AMK);
4817 Regs.clear();
4818 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4819 HardwareLoopProfitable);
4820 if (CostF.isLess(CostBest))
4821 std::swap(F, Best);
4822 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4823 dbgs() << "\n"
4824 " in favor of formula ";
4825 Best.print(dbgs()); dbgs() << '\n');
4826 }
4827#ifndef NDEBUG
4828 ChangedFormulae = true;
4829#endif
4830 LU.DeleteFormula(F);
4831 --FIdx;
4832 --NumForms;
4833 Any = true;
4834 }
4835
4836 // Now that we've filtered out some formulae, recompute the Regs set.
4837 if (Any)
4838 LU.RecomputeRegs(LUIdx, RegUses);
4839
4840 // Reset this to prepare for the next use.
4841 BestFormulae.clear();
4842 }
4843
4844 LLVM_DEBUG(if (ChangedFormulae) {
4845 dbgs() << "\n"
4846 "After filtering out undesirable candidates:\n";
4847 print_uses(dbgs());
4848 });
4849}
4850
4851/// Estimate the worst-case number of solutions the solver might have to
4852/// consider. It almost never considers this many solutions because it prune the
4853/// search space, but the pruning isn't always sufficient.
4854size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4855 size_t Power = 1;
4856 for (const LSRUse &LU : Uses) {
4857 size_t FSize = LU.Formulae.size();
4858 if (FSize >= ComplexityLimit) {
4859 Power = ComplexityLimit;
4860 break;
4861 }
4862 Power *= FSize;
4863 if (Power >= ComplexityLimit)
4864 break;
4865 }
4866 return Power;
4867}
4868
4869/// When one formula uses a superset of the registers of another formula, it
4870/// won't help reduce register pressure (though it may not necessarily hurt
4871/// register pressure); remove it to simplify the system.
4872void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4873 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4874 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4875
4876 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4877 "which use a superset of registers used by other "
4878 "formulae.\n");
4879
4880 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4881 LSRUse &LU = Uses[LUIdx];
4882 bool Any = false;
4883 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4884 Formula &F = LU.Formulae[i];
4885 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4886 continue;
4887 // Look for a formula with a constant or GV in a register. If the use
4888 // also has a formula with that same value in an immediate field,
4889 // delete the one that uses a register.
4891 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4892 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4893 Formula NewF = F;
4894 //FIXME: Formulas should store bitwidth to do wrapping properly.
4895 // See PR41034.
4896 NewF.BaseOffset =
4897 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4898 (uint64_t)C->getValue()->getSExtValue());
4899 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4900 (I - F.BaseRegs.begin()));
4901 if (LU.HasFormulaWithSameRegs(NewF)) {
4902 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4903 dbgs() << '\n');
4904 LU.DeleteFormula(F);
4905 --i;
4906 --e;
4907 Any = true;
4908 break;
4909 }
4910 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4911 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4912 if (!F.BaseGV) {
4913 Formula NewF = F;
4914 NewF.BaseGV = GV;
4915 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4916 (I - F.BaseRegs.begin()));
4917 if (LU.HasFormulaWithSameRegs(NewF)) {
4918 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4919 dbgs() << '\n');
4920 LU.DeleteFormula(F);
4921 --i;
4922 --e;
4923 Any = true;
4924 break;
4925 }
4926 }
4927 }
4928 }
4929 }
4930 if (Any)
4931 LU.RecomputeRegs(LUIdx, RegUses);
4932 }
4933
4934 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4935 }
4936}
4937
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by assuming that uses separated "
                "by a constant offset will use the same registers.\n");

  // This is especially useful for unrolled loops.

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (const Formula &F : LU.Formulae) {
      // Consider only formulae with a nonzero offset and a scale of 0 or 1.
      if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
        continue;
      assert((LU.Kind == LSRUse::Address || LU.Kind == LSRUse::ICmpZero) &&
             "Only address and cmp uses expected to have nonzero BaseOffset");

      // Look for another use into which this one can be folded.
      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
      if (!LUThatHas)
        continue;

      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
                              LU.Kind, LU.AccessTy))
        continue;

      LLVM_DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');

      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;

      // Transfer the fixups of LU to LUThatHas.
      for (LSRFixup &Fixup : LU.Fixups) {
        Fixup.Offset += F.BaseOffset;
        LUThatHas->pushFixup(Fixup);
        LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
      }

#ifndef NDEBUG
      Type *FixupType = LUThatHas->Fixups[0].OperandValToReplace->getType();
      for (LSRFixup &Fixup : LUThatHas->Fixups)
        assert(Fixup.OperandValToReplace->getType() == FixupType &&
               "Expected all fixups to have the same type");
#endif

      // Delete formulae from the new use which are no longer legal.
      bool Any = false;
      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
        Formula &F = LUThatHas->Formulae[i];
        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
          LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          LUThatHas->DeleteFormula(F);
          --i;
          --e;
          Any = true;
        }
      }

      if (Any)
        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);

      // Delete the old use.
      DeleteUse(LU, LUIdx);
      --LUIdx;
      --NumUses;
      // LU and its formula list are invalid after DeleteUse; stop iterating.
      break;
    }
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5013
5014/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5015/// we've done more filtering, as it may be able to find more formulae to
5016/// eliminate.
5017void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5018 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5019 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5020
5021 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5022 "undesirable dedicated registers.\n");
5023
5024 FilterOutUndesirableDedicatedRegisters();
5025
5026 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5027 }
5028}
5029
/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
/// Pick the best one and delete the others.
/// This narrowing heuristic is to keep as many formulae with different
/// Scale and ScaledReg pair as possible while narrowing the search space.
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by choosing the best Formula "
                "from the Formulae with the same Scale and ScaledReg.\n");

  // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;

  BestFormulaeTy BestFormulae;
#ifndef NDEBUG
  bool ChangedFormulae = false;
#endif
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
               dbgs() << '\n');

    // Return true if Formula FA is better than Formula FB.
    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
      // First we will try to choose the Formula with fewer new registers.
      // For a register used by current Formula, the more the register is
      // shared among LSRUses, the less we increase the register number
      // counter of the formula.
      size_t FARegNum = 0;
      for (const SCEV *Reg : FA.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FARegNum += (NumUses - UsedByIndices.count() + 1);
      }
      size_t FBRegNum = 0;
      for (const SCEV *Reg : FB.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FBRegNum += (NumUses - UsedByIndices.count() + 1);
      }
      if (FARegNum != FBRegNum)
        return FARegNum < FBRegNum;

      // If the new register numbers are the same, choose the Formula with
      // less Cost.
      Cost CostFA(L, SE, TTI, AMK);
      Cost CostFB(L, SE, TTI, AMK);
      Regs.clear();
      CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      Regs.clear();
      CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      return CostFA.isLess(CostFB);
    };

    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      // This heuristic only applies to formulae with a scaled register.
      if (!F.ScaledReg)
        continue;
      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
      if (P.second)
        continue;

      // Swap so the loser ends up in F, which is deleted below; the winner
      // stays at the index recorded in BestFormulae.
      Formula &Best = LU.Formulae[P.first->second];
      if (IsBetterThan(F, Best))
        std::swap(F, Best);
      LLVM_DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
                 dbgs() << "\n"
                           "    in favor of formula ";
                 Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
      ChangedFormulae = true;
#endif
      LU.DeleteFormula(F);
      --FIdx;
      --NumForms;
      Any = true;
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Reset this to prepare for the next use.
    BestFormulae.clear();
  }

  LLVM_DEBUG(if (ChangedFormulae) {
    dbgs() << "\n"
              "After filtering out undesirable candidates:\n";
    print_uses(dbgs());
  });
}
5131
5132/// If we are over the complexity limit, filter out any post-inc prefering
5133/// variables to only post-inc values.
5134void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5135 if (AMK != TTI::AMK_PostIndexed)
5136 return;
5137 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5138 return;
5139
5140 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5141 "Narrowing the search space by choosing the lowest "
5142 "register Formula for PostInc Uses.\n");
5143
5144 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5145 LSRUse &LU = Uses[LUIdx];
5146
5147 if (LU.Kind != LSRUse::Address)
5148 continue;
5149 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5150 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5151 continue;
5152
5153 size_t MinRegs = std::numeric_limits<size_t>::max();
5154 for (const Formula &F : LU.Formulae)
5155 MinRegs = std::min(F.getNumRegs(), MinRegs);
5156
5157 bool Any = false;
5158 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5159 ++FIdx) {
5160 Formula &F = LU.Formulae[FIdx];
5161 if (F.getNumRegs() > MinRegs) {
5162 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5163 dbgs() << "\n");
5164 LU.DeleteFormula(F);
5165 --FIdx;
5166 --NumForms;
5167 Any = true;
5168 }
5169 }
5170 if (Any)
5171 LU.RecomputeRegs(LUIdx, RegUses);
5172
5173 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5174 break;
5175 }
5176
5177 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5178}
5179
5180/// The function delete formulas with high registers number expectation.
5181/// Assuming we don't know the value of each formula (already delete
5182/// all inefficient), generate probability of not selecting for each
5183/// register.
5184/// For example,
5185/// Use1:
5186/// reg(a) + reg({0,+,1})
5187/// reg(a) + reg({-1,+,1}) + 1
5188/// reg({a,+,1})
5189/// Use2:
5190/// reg(b) + reg({0,+,1})
5191/// reg(b) + reg({-1,+,1}) + 1
5192/// reg({b,+,1})
5193/// Use3:
5194/// reg(c) + reg(b) + reg({0,+,1})
5195/// reg(c) + reg({b,+,1})
5196///
5197/// Probability of not selecting
5198/// Use1 Use2 Use3
5199/// reg(a) (1/3) * 1 * 1
5200/// reg(b) 1 * (1/3) * (1/2)
5201/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5202/// reg({-1,+,1}) (2/3) * (2/3) * 1
5203/// reg({a,+,1}) (2/3) * 1 * 1
5204/// reg({b,+,1}) 1 * (2/3) * (2/3)
5205/// reg(c) 1 * 1 * 0
5206///
5207/// Now count registers number mathematical expectation for each formula:
5208/// Note that for each use we exclude probability if not selecting for the use.
5209/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
/// probability 1/3 of not selecting for Use1).
5211/// Use1:
5212/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5213/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5214/// reg({a,+,1}) 1
5215/// Use2:
5216/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5217/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5218/// reg({b,+,1}) 2/3
5219/// Use3:
5220/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5221/// reg(c) + reg({b,+,1}) 1 + 2/3
5222void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5223 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5224 return;
5225 // Ok, we have too many of formulae on our hands to conveniently handle.
5226 // Use a rough heuristic to thin out the list.
5227
5228 // Set of Regs wich will be 100% used in final solution.
5229 // Used in each formula of a solution (in example above this is reg(c)).
5230 // We can skip them in calculations.
5231 SmallPtrSet<const SCEV *, 4> UniqRegs;
5232 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5233
5234 // Map each register to probability of not selecting
5235 DenseMap <const SCEV *, float> RegNumMap;
5236 for (const SCEV *Reg : RegUses) {
5237 if (UniqRegs.count(Reg))
5238 continue;
5239 float PNotSel = 1;
5240 for (const LSRUse &LU : Uses) {
5241 if (!LU.Regs.count(Reg))
5242 continue;
5243 float P = LU.getNotSelectedProbability(Reg);
5244 if (P != 0.0)
5245 PNotSel *= P;
5246 else
5247 UniqRegs.insert(Reg);
5248 }
5249 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5250 }
5251
5252 LLVM_DEBUG(
5253 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5254
5255 // Delete formulas where registers number expectation is high.
5256 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5257 LSRUse &LU = Uses[LUIdx];
5258 // If nothing to delete - continue.
5259 if (LU.Formulae.size() < 2)
5260 continue;
5261 // This is temporary solution to test performance. Float should be
5262 // replaced with round independent type (based on integers) to avoid
5263 // different results for different target builds.
5264 float FMinRegNum = LU.Formulae[0].getNumRegs();
5265 float FMinARegNum = LU.Formulae[0].getNumRegs();
5266 size_t MinIdx = 0;
5267 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5268 Formula &F = LU.Formulae[i];
5269 float FRegNum = 0;
5270 float FARegNum = 0;
5271 for (const SCEV *BaseReg : F.BaseRegs) {
5272 if (UniqRegs.count(BaseReg))
5273 continue;
5274 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5275 if (isa<SCEVAddRecExpr>(BaseReg))
5276 FARegNum +=
5277 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5278 }
5279 if (const SCEV *ScaledReg = F.ScaledReg) {
5280 if (!UniqRegs.count(ScaledReg)) {
5281 FRegNum +=
5282 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5283 if (isa<SCEVAddRecExpr>(ScaledReg))
5284 FARegNum +=
5285 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5286 }
5287 }
5288 if (FMinRegNum > FRegNum ||
5289 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5290 FMinRegNum = FRegNum;
5291 FMinARegNum = FARegNum;
5292 MinIdx = i;
5293 }
5294 }
5295 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5296 dbgs() << " with min reg num " << FMinRegNum << '\n');
5297 if (MinIdx != 0)
5298 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5299 while (LU.Formulae.size() != 1) {
5300 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5301 dbgs() << '\n');
5302 LU.Formulae.pop_back();
5303 }
5304 LU.RecomputeRegs(LUIdx, RegUses);
5305 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5306 Formula &F = LU.Formulae[0];
5307 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5308 // When we choose the formula, the regs become unique.
5309 UniqRegs.insert_range(F.BaseRegs);
5310 if (F.ScaledReg)
5311 UniqRegs.insert(F.ScaledReg);
5312 }
5313 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5314}
5315
// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
// whether the addressing offset +C would be legal where the negative offset -C
// is not.
5320 ScalarEvolution &SE, const SCEV *Best,
5321 const SCEV *Reg,
5322 MemAccessTy AccessType) {
5323 if (Best->getType() != Reg->getType() ||
5325 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5326 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5327 return false;
5328 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5329 if (!Diff)
5330 return false;
5331
5332 return TTI.isLegalAddressingMode(
5333 AccessType.MemTy, /*BaseGV=*/nullptr,
5334 /*BaseOffset=*/Diff->getSExtValue(),
5335 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5336 !TTI.isLegalAddressingMode(
5337 AccessType.MemTy, /*BaseGV=*/nullptr,
5338 /*BaseOffset=*/-Diff->getSExtValue(),
5339 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5340}
5341
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
  // With all other options exhausted, loop until the system is simple
  // enough to handle.
  SmallPtrSet<const SCEV *, 4> Taken;
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many of formulae on our hands to conveniently handle.
    // Use a rough heuristic to thin out the list.
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

    // Pick the register which is used by the most LSRUses, which is likely
    // to be a good reuse register candidate.
    const SCEV *Best = nullptr;
    unsigned BestNum = 0;
    for (const SCEV *Reg : RegUses) {
      // Skip registers already chosen in a previous iteration.
      if (Taken.count(Reg))
        continue;
      if (!Best) {
        Best = Reg;
        BestNum = RegUses.getUsedByIndices(Reg).count();
      } else {
        unsigned Count = RegUses.getUsedByIndices(Reg).count();
        if (Count > BestNum) {
          Best = Reg;
          BestNum = Count;
        }

        // If the scores are the same, but the Reg is simpler for the target
        // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
        // handle +C but not -C), opt for the simpler formula.
        if (Count == BestNum) {
          int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
          if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
                                           Uses[LUIdx].AccessTy)) {
            Best = Reg;
            BestNum = Count;
          }
        }
      }
    }
    assert(Best && "Failed to find best LSRUse candidate");

    LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                      << " will yield profitable reuse.\n");
    Taken.insert(Best);

    // In any use with formulae which references this register, delete formulae
    // which don't reference it.
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      if (!LU.Regs.count(Best)) continue;

      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (!F.referencesReg(Best)) {
          LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          // DeleteFormula swaps the deleted formula with the last one and
          // pops it, so re-examine the current index and shrink the bound.
          LU.DeleteFormula(F);
          --e;
          --i;
          Any = true;
          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
          continue;
        }
      }

      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }

    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
  }
}
5418
/// If there are an extraordinary number of formulae to choose from, use some
/// rough heuristics to prune down the number of formulae. This keeps the main
/// solver from taking an extraordinary amount of time in some worst-case
/// scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  // Cheap structural filters run first; the aggressive register-count-based
  // pruning runs last and only if the search space is still too large.
  NarrowSearchSpaceByDetectingSupersets();
  NarrowSearchSpaceByCollapsingUnrolledCode();
  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
    NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  NarrowSearchSpaceByFilterPostInc();
  // LSRExpNarrow selects between the expectation-based heuristic and the
  // classic winner-register heuristic.
  if (LSRExpNarrow)
    NarrowSearchSpaceByDeletingCostlyFormulas();
  else
    NarrowSearchSpaceByPickingWinnerRegs();
}
5435
/// This is the recursive solver. It chooses one formula for the use at depth
/// Workspace.size(), rates the resulting partial solution, and recurses on
/// the next use; the best complete solution found so far is kept in
/// Solution/SolutionCost and used to prune the search.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found
  //      first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost, and then compare. compare while computing a
  //      cost and bail early.
  //    - track register sets with SmallBitVector

  // Workspace holds one chosen formula per already-decided use, so the next
  // undecided use is at index Workspace.size().
  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (const SCEV *S : CurRegs)
    if (LU.Regs.count(S))
      ReqRegs.insert(S);

  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost(L, SE, TTI, AMK);
  for (const Formula &F : LU.Formulae) {
    // Ignore formulae which may not be ideal in terms of register reuse of
    // ReqRegs. The formula should use all required registers before
    // introducing new ones.
    // This can sometimes (notably when trying to favour postinc) lead to
    // sub-optimal decisions. There it is best left to the cost modelling to
    // get correct.
    if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
      for (const SCEV *Reg : ReqRegs) {
        if ((F.ScaledReg && F.ScaledReg == Reg) ||
            is_contained(F.BaseRegs, Reg)) {
          --NumReqRegsToFind;
          if (NumReqRegsToFind == 0)
            break;
        }
      }
      if (NumReqRegsToFind != 0) {
        // If none of the formulae satisfied the required registers, then we
        // could clear ReqRegs and try again. Currently, we simply give up in
        // this case.
        continue;
      }
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
    if (NewCost.isLess(SolutionCost)) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        // More uses remain; recurse to decide the next one.
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        // A complete solution that beats the best seen so far.
        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
                   dbgs() << ".\nRegs:\n";
                   for (const SCEV *S : NewRegs) dbgs()
                      << "- " << *S << "\n";
                   dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  }
}
5516
/// Choose one formula from each use. Return the results in the given Solution
/// vector.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  // Start with a "lost" (worst possible) solution cost so that any complete
  // solution found by the recursion beats it.
  Cost SolutionCost(L, SE, TTI, AMK);
  SolutionCost.Lose();
  Cost CurCost(L, SE, TTI, AMK);
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  // SolveRecurse does all the work.
  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);
  if (Solution.empty()) {
    LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
    return;
  }

  // Ok, we've now made all our decisions.
  LLVM_DEBUG(dbgs() << "\n"
                       "The chosen solution requires ";
             SolutionCost.print(dbgs()); dbgs() << ":\n";
             for (size_t i = 0, e = Uses.size(); i != e; ++i) {
               dbgs() << "  ";
               Uses[i].print(dbgs());
               dbgs() << "\n"
                         "    ";
               Solution[i]->print(dbgs());
               dbgs() << '\n';
             });

  assert(Solution.size() == Uses.size() && "Malformed solution!");

  const bool EnableDropUnprofitableSolution = [&] {
    case cl::BOU_TRUE:
      return true;
    case cl::BOU_FALSE:
      return false;
    case cl::BOU_UNSET:
    }
    llvm_unreachable("Unhandled cl::boolOrDefault enum");
  }();

  // If the chosen solution is not better than the baseline (doing nothing),
  // optionally drop it rather than apply an unprofitable rewrite.
  if (BaselineCost.isLess(SolutionCost)) {
    if (!EnableDropUnprofitableSolution)
      LLVM_DEBUG(
          dbgs() << "Baseline is more profitable than chosen solution, "
                    "add option 'lsr-drop-solution' to drop LSR solution.\n");
    else {
      LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
                           "solution, dropping LSR solution.\n";);
      Solution.clear();
    }
  }
}
5575
/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
/// far as we can go while still being dominated by the input positions. This
/// helps canonicalize the insert position, which encourages sharing.
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
                                 const SmallVectorImpl<Instruction *> &Inputs)
    const {
  Instruction *Tentative = &*IP;
  while (true) {
    bool AllDominate = true;
    Instruction *BetterPos = nullptr;
    // Don't bother attempting to insert before a catchswitch, their basic block
    // cannot have other non-PHI instructions.
    if (isa<CatchSwitchInst>(Tentative))
      return IP;

    // Check that every input still dominates the tentative position, and look
    // for a position earlier within the same block.
    for (Instruction *Inst : Inputs) {
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      // Attempt to find an insert position in the middle of the block,
      // instead of at the end, so that it can be used for other expansions.
      if (Tentative->getParent() == Inst->getParent() &&
          (!BetterPos || !DT.dominates(Inst, BetterPos)))
        BetterPos = &*std::next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos->getIterator();
    else
      IP = Tentative->getIterator();

    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;

    // Walk up the immediate-dominator chain until we find a block that is not
    // in a deeper loop than the current position.
    BasicBlock *IDom;
    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
      if (!Rung) return IP;
      Rung = Rung->getIDom();
      if (!Rung) return IP;
      IDom = Rung->getBlock();

      // Don't climb into a loop though.
      const Loop *IDomLoop = LI.getLoopFor(IDom);
      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
      if (IDomDepth <= IPLoopDepth &&
          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
        break;
    }

    Tentative = IDom->getTerminator();
  }

  return IP;
}
5633
5634/// Determine an input position which will be dominated by the operands and
5635/// which will dominate the result.
5636BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5637 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5638 // Collect some instructions which must be dominated by the
5639 // expanding replacement. These must be dominated by any operands that
5640 // will be required in the expansion.
5641 SmallVector<Instruction *, 4> Inputs;
5642 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5643 Inputs.push_back(I);
5644 if (LU.Kind == LSRUse::ICmpZero)
5645 if (Instruction *I =
5646 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5647 Inputs.push_back(I);
5648 if (LF.PostIncLoops.count(L)) {
5649 if (LF.isUseFullyOutsideLoop(L))
5650 Inputs.push_back(L->getLoopLatch()->getTerminator());
5651 else
5652 Inputs.push_back(IVIncInsertPos);
5653 }
5654 // The expansion must also be dominated by the increment positions of any
5655 // loops it for which it is using post-inc mode.
5656 for (const Loop *PIL : LF.PostIncLoops) {
5657 if (PIL == L) continue;
5658
5659 // Be dominated by the loop exit.
5660 SmallVector<BasicBlock *, 4> ExitingBlocks;
5661 PIL->getExitingBlocks(ExitingBlocks);
5662 if (!ExitingBlocks.empty()) {
5663 BasicBlock *BB = ExitingBlocks[0];
5664 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5665 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5666 Inputs.push_back(BB->getTerminator());
5667 }
5668 }
5669
5670 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5671 "Insertion point must be a normal instruction");
5672
5673 // Then, climb up the immediate dominator tree as far as we can go while
5674 // still being dominated by the input positions.
5675 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5676
5677 // Don't insert instructions before PHI nodes.
5678 while (isa<PHINode>(IP)) ++IP;
5679
5680 // Ignore landingpad instructions.
5681 while (IP->isEHPad()) ++IP;
5682
5683 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5684 // IP consistent across expansions and allows the previously inserted
5685 // instructions to be reused by subsequent expansion.
5686 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5687 ++IP;
5688
5689 return IP;
5690}
5691
/// Emit instructions for the leading candidate expression for this LSRUse
/// (this is called "expanding") and return the resulting Value. The expansion
/// sums the base registers, scaled register, global, and immediate portions of
/// the formula.
Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
                           const Formula &F, BasicBlock::iterator IP,
                           SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
  // A rigid formula is one we may not rewrite; just reuse the existing value.
  if (LU.RigidFormula)
    return LF.OperandValToReplace;

  // Determine an input position which will be dominated by the operands and
  // which will dominate the result.
  IP = AdjustInsertPositionForExpand(IP, LF, LU);
  Rewriter.setInsertPoint(&*IP);

  // Inform the Rewriter if we have a post-increment use, so that it can
  // perform an advantageous expansion.
  Rewriter.setPostInc(LF.PostIncLoops);

  // This is the type that the user actually needs.
  Type *OpTy = LF.OperandValToReplace->getType();
  // This will be the type that we'll initially expand to.
  Type *Ty = F.getType();
  if (!Ty)
    // No type known; just expand directly to the ultimate type.
    Ty = OpTy;
  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
    // Expand directly to the ultimate type if it's the right size.
    Ty = OpTy;
  // This is the type to do integer arithmetic in.
  Type *IntTy = SE.getEffectiveSCEVType(Ty);

  // Build up a list of operands to add together to form the full base.

  // Expand the BaseRegs portion.
  for (const SCEV *Reg : F.BaseRegs) {
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = nullptr;
  if (F.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);

    if (LU.Kind == LSRUse::ICmpZero) {
      // Expand ScaleReg as if it was part of the base regs.
      if (F.Scale == 1)
        Ops.push_back(
            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
      else {
        // An interesting way of "folding" with an icmp is to use a negated
        // scale, which we'll implement by inserting it into the other operand
        // of the icmp.
        assert(F.Scale == -1 &&
               "The only scale supported by ICmpZero uses is -1!");
        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
      }
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.

      // Flush the operand list to suppress SCEVExpander hoisting address modes.
      // Unless the addressing mode will not be folded.
      if (!Ops.empty() && LU.Kind == LSRUse::Address &&
          isAMCompletelyFolded(TTI, LU, F)) {
        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
        Ops.clear();
        Ops.push_back(SE.getUnknown(FullV));
      }
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
      if (F.Scale != 1)
        ScaledS =
            SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
      Ops.push_back(ScaledS);
    }
  }

  // Expand the GV portion.
  if (F.BaseGV) {
    // Flush the operand list to suppress SCEVExpander hoisting.
    if (!Ops.empty()) {
      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
      Ops.clear();
      Ops.push_back(SE.getUnknown(FullV));
    }
    Ops.push_back(SE.getUnknown(F.BaseGV));
  }

  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
  // unfolded offsets. LSR assumes they both live next to their uses.
  if (!Ops.empty()) {
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
  // out at this point, or should we generate a SCEV adding together mixed
  // offsets?
  assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
         "Expanding mismatched offsets\n");
  // Expand the immediate portion.
  Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
  if (Offset.isNonZero()) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
      if (!ICmpScaledV) {
        // TODO: Avoid implicit trunc?
        // See https://github.com/llvm/llvm-project/issues/112510.
        ICmpScaledV = ConstantInt::getSigned(
            IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
      } else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
                                             /*ImplicitTrunc=*/true);
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
    }
  }

  // Expand the unfolded offset portion.
  Immediate UnfoldedOffset = F.UnfoldedOffset;
  if (UnfoldedOffset.isNonZero()) {
    // Just add the immediate values.
    Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getConstant(IntTy, 0) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty);

  // We're done expanding now, so reset the rewriter.
  Rewriter.clearPostInc();

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
      DeadInsts.emplace_back(OperandIsInstr);
    assert(!F.BaseGV && "ICmp does not support folding a global value and "
                        "a scale at the same time!");
    if (F.Scale == -1) {
      if (ICmpScaledV->getType() != OpTy) {
            CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
            ICmpScaledV, OpTy, "tmp", CI->getIterator());
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      // A scale of 1 means that the scale has been expanded as part of the
      // base regs.
      assert((F.Scale == 0 || F.Scale == 1) &&
             "ICmp does not support folding a global value and "
             "a scale at the same time!");
      // TODO: Avoid implicit trunc?
      // See https://github.com/llvm/llvm-project/issues/112510.
                 -(uint64_t)Offset.getFixedValue(),
                 /*ImplicitTrunc=*/true);
      if (C->getType() != OpTy) {
            CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
            CI->getDataLayout());
        assert(C && "Cast of ConstantInt should have folded");
      }

      CI->setOperand(1, C);
    }
  }

  return FullV;
}
5880
/// Helper for Rewrite. PHI nodes are special because the use of their operands
/// effectively happens in their predecessor blocks, so the expression may need
/// to be expanded in multiple places.
void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
                                const LSRFixup &LF, const Formula &F,
                                SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  // Cache one expanded value per predecessor block so multiple incoming
  // edges from the same block share a single expansion.
  DenseMap<BasicBlock *, Value *> Inserted;

  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
      bool needUpdateFixups = false;
      BasicBlock *BB = PN->getIncomingBlock(i);

      // If this is a critical edge, split the edge so that we do not insert
      // the code on all predecessor/successor paths. We do this unless this
      // is the canonical backedge for this loop, which complicates post-inc
      // users.
      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
        BasicBlock *Parent = PN->getParent();
        Loop *PNLoop = LI.getLoopFor(Parent);
        if (!PNLoop || Parent != PNLoop->getHeader()) {
          // Split the critical edge.
          BasicBlock *NewBB = nullptr;
          if (!Parent->isLandingPad()) {
            NewBB =
                SplitCriticalEdge(BB, Parent,
                                  CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
                                      .setMergeIdenticalEdges()
                                      .setKeepOneInputPHIs());
          } else {
            DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
            NewBB = NewBBs[0];
          }
          // If NewBB==NULL, then SplitCriticalEdge refused to split because all
          // phi predecessors are identical. The simple thing to do is skip
          // splitting in this case rather than complicate the API.
          if (NewBB) {
            // If PN is outside of the loop and BB is in the loop, we want to
            // move the block to be immediately before the PHI block, not
            // immediately after BB.
            if (L->contains(BB) && !L->contains(PN))
              NewBB->moveBefore(PN->getParent());

            // Splitting the edge can reduce the number of PHI entries we have.
            e = PN->getNumIncomingValues();
            BB = NewBB;
            i = PN->getBasicBlockIndex(BB);

            needUpdateFixups = true;
          }
        }
      }

      // Reuse a previously expanded value for this predecessor if available;
      // otherwise expand at the predecessor's terminator and remember it.
      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
          Inserted.try_emplace(BB);
      if (!Pair.second)
        PN->setIncomingValue(i, Pair.first->second);
      else {
        Value *FullV =
            Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);

        // If this is reuse-by-noop-cast, insert the noop cast.
        Type *OpTy = LF.OperandValToReplace->getType();
        if (FullV->getType() != OpTy)
          FullV = CastInst::Create(
              CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
              LF.OperandValToReplace->getType(), "tmp",
              BB->getTerminator()->getIterator());

        // If the incoming block for this value is not in the loop, it means the
        // current PHI is not in a loop exit, so we must create a LCSSA PHI for
        // the inserted value.
        if (auto *I = dyn_cast<Instruction>(FullV))
          if (L->contains(I) && !L->contains(BB))
            InsertedNonLCSSAInsts.insert(I);

        PN->setIncomingValue(i, FullV);
        Pair.first->second = FullV;
      }

      // If LSR splits critical edge and phi node has other pending
      // fixup operands, we need to update those pending fixups. Otherwise
      // formulae will not be implemented completely and some instructions
      // will not be eliminated.
      if (needUpdateFixups) {
        for (LSRUse &LU : Uses)
          for (LSRFixup &Fixup : LU.Fixups)
            // If fixup is supposed to rewrite some operand in the phi
            // that was just updated, it may be already moved to
            // another phi node. Such fixup requires update.
            if (Fixup.UserInst == PN) {
              // Check if the operand we try to replace still exists in the
              // original phi.
              bool foundInOriginalPHI = false;
              for (const auto &val : PN->incoming_values())
                if (val == Fixup.OperandValToReplace) {
                  foundInOriginalPHI = true;
                  break;
                }

              // If fixup operand found in original PHI - nothing to do.
              if (foundInOriginalPHI)
                continue;

              // Otherwise it might be moved to another PHI and requires update.
              // If fixup operand not found in any of the incoming blocks that
              // means we have already rewritten it - nothing to do.
              for (const auto &Block : PN->blocks())
                for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
                     ++I) {
                  PHINode *NewPN = cast<PHINode>(I);
                  for (const auto &val : NewPN->incoming_values())
                    if (val == Fixup.OperandValToReplace)
                      Fixup.UserInst = NewPN;
                }
            }
      }
    }
}
6004
6005/// Emit instructions for the leading candidate expression for this LSRUse (this
6006/// is called "expanding"), and update the UserInst to reference the newly
6007/// expanded value.
6008void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6009 const Formula &F,
6010 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6011 // First, find an insertion point that dominates UserInst. For PHI nodes,
6012 // find the nearest block which dominates all the relevant uses.
6013 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6014 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6015 } else {
6016 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6017
6018 // If this is reuse-by-noop-cast, insert the noop cast.
6019 Type *OpTy = LF.OperandValToReplace->getType();
6020 if (FullV->getType() != OpTy) {
6021 Instruction *Cast =
6022 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6023 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6024 FullV = Cast;
6025 }
6026
6027 // Update the user. ICmpZero is handled specially here (for now) because
6028 // Expand may have updated one of the operands of the icmp already, and
6029 // its new value may happen to be equal to LF.OperandValToReplace, in
6030 // which case doing replaceUsesOfWith leads to replacing both operands
6031 // with the same value. TODO: Reorganize this.
6032 if (LU.Kind == LSRUse::ICmpZero)
6033 LF.UserInst->setOperand(0, FullV);
6034 else
6035 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6036 }
6037
6038 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6039 DeadInsts.emplace_back(OperandIsInstr);
6040}
6041
// Determine where to insert the transformed IV increment instruction for this
// fixup. By default this is the default insert position, but if this is a
// postincrement opportunity then we try to insert it in the same block as the
// fixup user instruction, as this is needed for a postincrement instruction to
// be generated.
                                       const LSRFixup &Fixup, const LSRUse &LU,
                                       Instruction *IVIncInsertPos,
                                       DominatorTree &DT) {
  // Only address uses can be postincremented.
  if (LU.Kind != LSRUse::Address)
    return IVIncInsertPos;

  // Don't try to postincrement if the target cannot form a post-indexed
  // load/store of this type.
  Instruction *I = Fixup.UserInst;
  Type *Ty = I->getType();
  if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
      !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
    return IVIncInsertPos;

  // It's only legal to hoist to the user block if it dominates the default
  // insert position.
  BasicBlock *HoistBlock = I->getParent();
  BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
  if (!DT.dominates(I, IVIncBlock))
    return IVIncInsertPos;

  return HoistBlock->getTerminator();
}
6071
/// Rewrite all the fixup locations with new values, following the chosen
/// solution.
void LSRInstance::ImplementSolution(
    const SmallVectorImpl<const Formula *> &Solution) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  // NOTE(review): the declaration of the DeadInsts vector is elided in this
  // excerpt; it is the list threaded through Rewrite/GenerateIVChain below.

  // Mark phi nodes that terminate chains so the expander tries to reuse them.
  for (const IVChain &Chain : IVChainVec) {
    if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
      Rewriter.setChainedPhi(PN);
  }

  // Expand the new value definitions and update the users.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
    for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
      // Pick a per-fixup increment position so postincrement forms can be
      // generated where legal (see getFixupInsertPos).
      Instruction *InsertPos =
          getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
      Rewriter.setIVIncInsertPos(L, InsertPos);
      Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
      Changed = true;
    }

  // Expansion may have created values that violate LCSSA; repair that now.
  auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
  formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);

  for (const IVChain &Chain : IVChainVec) {
    GenerateIVChain(Chain, DeadInsts);
    Changed = true;
  }

  // Record the IVs the expander inserted for later consumers.
  // NOTE(review): the dyn_cast result is dereferenced unchecked below; this
  // assumes every inserted IV is an Instruction — confirm against
  // SCEVExpander::getInsertedIVs.
  for (const WeakVH &IV : Rewriter.getInsertedIVs())
    if (IV && dyn_cast<Instruction>(&*IV)->getParent())
      ScalarEvolutionIVs.push_back(IV);

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

  // NOTE(review): the first line of the dead-instruction deletion call is
  // elided in this excerpt; only its trailing arguments are visible below.
      &TLI, MSSAU);

  // In our cost analysis above, we assume that each addrec consumes exactly
  // one register, and arrange to have increments inserted just before the
  // latch to maximize the chance this is true. However, if we reused
  // existing IVs, we now need to move the increments to match our
  // expectations. Otherwise, our cost modeling results in us having
  // chosen a non-optimal result for the actual schedule. (And yes, this
  // scheduling decision does impact later codegen.)
  for (PHINode &PN : L->getHeader()->phis()) {
    BinaryOperator *BO = nullptr;
    Value *Start = nullptr, *Step = nullptr;
    if (!matchSimpleRecurrence(&PN, BO, Start, Step))
      continue;

    switch (BO->getOpcode()) {
    case Instruction::Sub:
      if (BO->getOperand(0) != &PN)
        // sub is non-commutative - match handling elsewhere in LSR
        continue;
      break;
    case Instruction::Add:
      break;
    default:
      continue;
    };

    if (!isa<Constant>(Step))
      // If not a constant step, might increase register pressure
      // (We assume constants have been canonicalized to RHS)
      continue;

    if (BO->getParent() == IVIncInsertPos->getParent())
      // Only bother moving across blocks. Isel can handle block local case.
      continue;

    // Can we legally schedule inc at the desired point?
    if (!llvm::all_of(BO->uses(),
                      [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
      continue;
    BO->moveBefore(IVIncInsertPos->getIterator());
    Changed = true;
  }

}
6159
// Main driver: set up analyses, run the pre-passes, collect uses/formulae,
// solve, and implement the chosen solution for loop L.
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                         DominatorTree &DT, LoopInfo &LI,
                         const TargetTransformInfo &TTI, AssumptionCache &AC,
                         TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
    : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
      MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
                            // NOTE(review): the "?" arm of this conditional
                            // (the command-line override) is elided in this
                            // excerpt.
                            : TTI.getPreferredAddressingMode(L, &SE)),
      Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm())
    return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  // If there's too much analysis to be done, bail early. We won't be able to
  // model the problem anyway.
  unsigned NumUsers = 0;
  for (const IVStrideUse &U : IU) {
    if (++NumUsers > MaxIVUsers) {
      (void)U;
      LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
                        << "\n");
      return;
    }
    // Bail out if we have a PHI on an EHPad that gets a value from a
    // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
    // no good place to stick any instructions.
    if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
      auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
      if (isa<FuncletPadInst>(FirstNonPHI) ||
          isa<CatchSwitchInst>(FirstNonPHI))
        for (BasicBlock *PredBB : PN->blocks())
          if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
            return;
    }
  }

  LLVM_DEBUG(dbgs() << "\nLSR on loop ";
             L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
             dbgs() << ":\n");

  // Check if we expect this loop to use a hardware loop instruction, which will
  // be used when calculating the costs of formulas.
  HardwareLoopInfo HWLoopInfo(L);
  HardwareLoopProfitable =
      TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);

  // Configure SCEVExpander already now, so the correct mode is used for
  // isSafeToExpand() checks.
#if LLVM_ENABLE_ABI_BREAKING_CHECKS
  Rewriter.setDebugType(DEBUG_TYPE);
#endif
  Rewriter.disableCanonicalMode();
  Rewriter.enableLSRMode();

  // First, perform some low-level loop optimizations.
  OptimizeShadowIV();
  OptimizeLoopTermCond();

  // If loop preparation eliminates all interesting IV users, bail.
  if (IU.empty()) return;

  // Skip nested loops until we can model them better with formulae.
  if (!L->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
    return;
  }

  // Start collecting data and preparing for the solver.
  // If number of registers is not the major cost, we cannot benefit from the
  // current profitable chain optimization which is based on number of
  // registers.
  // FIXME: add profitable chain optimization for other kinds major cost, for
  // example number of instructions.
  if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
    CollectChains();
  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  if (Uses.empty())
    return;

  LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
             print_uses(dbgs()));
  LLVM_DEBUG(dbgs() << "The baseline solution requires ";
             BaselineCost.print(dbgs()); dbgs() << "\n");

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  // NOTE(review): the declaration of the Solution vector is elided in this
  // excerpt.
  Solve(Solution);

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

  if (Solution.empty())
    return;

#ifndef NDEBUG
  // Formulae should be legal.
  for (const LSRUse &LU : Uses) {
    for (const Formula &F : LU.Formulae)
      assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                        F) && "Illegal formula generated!");
  };
#endif

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution);
}
6280
6281#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6282void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6283 if (Factors.empty() && Types.empty()) return;
6284
6285 OS << "LSR has identified the following interesting factors and types: ";
6286 ListSeparator LS;
6287
6288 for (int64_t Factor : Factors)
6289 OS << LS << '*' << Factor;
6290
6291 for (Type *Ty : Types)
6292 OS << LS << '(' << *Ty << ')';
6293 OS << '\n';
6294}
6295
6296void LSRInstance::print_fixups(raw_ostream &OS) const {
6297 OS << "LSR is examining the following fixup sites:\n";
6298 for (const LSRUse &LU : Uses)
6299 for (const LSRFixup &LF : LU.Fixups) {
6300 dbgs() << " ";
6301 LF.print(OS);
6302 OS << '\n';
6303 }
6304}
6305
6306void LSRInstance::print_uses(raw_ostream &OS) const {
6307 OS << "LSR is examining the following uses:\n";
6308 for (const LSRUse &LU : Uses) {
6309 dbgs() << " ";
6310 LU.print(OS);
6311 OS << '\n';
6312 for (const Formula &F : LU.Formulae) {
6313 OS << " ";
6314 F.print(OS);
6315 OS << '\n';
6316 }
6317 }
6318}
6319
/// Print the complete LSR state: factors/types, fixup sites, then uses with
/// their candidate formulae.
void LSRInstance::print(raw_ostream &OS) const {
  print_factors_and_types(OS);
  print_fixups(OS);
  print_uses(OS);
}
6325
/// Debugger hook: print the full LSR state to stderr.
LLVM_DUMP_METHOD void LSRInstance::dump() const {
  print(errs()); errs() << '\n';
}
6329#endif
6330
6331namespace {
6332
/// Legacy pass-manager wrapper that runs LSR on each loop.
class LoopStrengthReduce : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid

  LoopStrengthReduce();

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;
};
6343
6344} // end anonymous namespace
6345
// Legacy pass constructor.
// NOTE(review): the pass-registration call inside the body is elided in this
// excerpt.
LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
}
6349
// Declare the analyses this pass requires and those it preserves.
void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG. However, we do update
  // many analyses if they are around.
  // NOTE(review): an analysis-usage line is elided here in this excerpt.

  AU.addRequired<LoopInfoWrapperPass>();
  AU.addPreserved<LoopInfoWrapperPass>();
  // NOTE(review): an analysis-usage line is elided here in this excerpt.
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.addPreserved<DominatorTreeWrapperPass>();
  AU.addRequired<ScalarEvolutionWrapperPass>();
  AU.addPreserved<ScalarEvolutionWrapperPass>();
  AU.addRequired<AssumptionCacheTracker>();
  AU.addRequired<TargetLibraryInfoWrapperPass>();
  // Requiring LoopSimplify a second time here prevents IVUsers from running
  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
  // NOTE(review): the LoopSimplify requirement line is elided in this excerpt.
  AU.addRequired<IVUsersWrapperPass>();
  AU.addPreserved<IVUsersWrapperPass>();
  AU.addRequired<TargetTransformInfoWrapperPass>();
  AU.addPreserved<MemorySSAWrapperPass>();
}
6372
6373namespace {
6374
/// Enables more convenient iteration over a DWARF expression vector.
// NOTE(review): the return-type line of this definition is elided in this
// excerpt; the body returns a {Begin, End} range of
// DIExpression::expr_op_iterator over Expr.
ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
  llvm::DIExpression::expr_op_iterator Begin =
      llvm::DIExpression::expr_op_iterator(Expr.begin());
  llvm::DIExpression::expr_op_iterator End =
      llvm::DIExpression::expr_op_iterator(Expr.end());
  return {Begin, End};
}
6384
/// Incrementally translates a SCEV into a DWARF expression (plus the location
/// operands it references) so that a dbg.value lost to LSR can be recovered.
struct SCEVDbgValueBuilder {
  SCEVDbgValueBuilder() = default;
  SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }

  // Copy another builder's accumulated state into this one.
  void clone(const SCEVDbgValueBuilder &Base) {
    LocationOps = Base.LocationOps;
    Expr = Base.Expr;
  }

  void clear() {
    LocationOps.clear();
    Expr.clear();
  }

  /// The DIExpression as we translate the SCEV.
  // NOTE(review): the declaration of the Expr vector is elided in this
  // excerpt; clone()/clear() above reference it.
  /// The location ops of the DIExpression.
  SmallVector<Value *, 2> LocationOps;

  void pushOperator(uint64_t Op) { Expr.push_back(Op); }
  void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }

  /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
  /// in the set of values referenced by the expression.
  void pushLocation(llvm::Value *V) {
    // NOTE(review): the line pushing the DW_OP_LLVM_arg opcode itself is
    // elided in this excerpt; only the argument index is appended below.
    auto *It = llvm::find(LocationOps, V);
    unsigned ArgIndex = 0;
    if (It != LocationOps.end()) {
      // Reuse the index of a location that was already recorded.
      ArgIndex = std::distance(LocationOps.begin(), It);
    } else {
      ArgIndex = LocationOps.size();
      LocationOps.push_back(V);
    }
    Expr.push_back(ArgIndex);
  }

  void pushValue(const SCEVUnknown *U) {
    llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
    pushLocation(V);
  }

  // Append a signed constant; fails if it does not fit in 64 bits.
  bool pushConst(const SCEVConstant *C) {
    if (C->getAPInt().getSignificantBits() > 64)
      return false;
    Expr.push_back(llvm::dwarf::DW_OP_consts);
    Expr.push_back(C->getAPInt().getSExtValue());
    return true;
  }

  // Iterating the expression as DWARF ops is convenient when updating
  // DWARF_OP_LLVM_args.
  // NOTE(review): the declaration line of this accessor (expr_ops) is elided
  // in this excerpt.
    return ToDwarfOpIter(Expr);
  }

  /// Several SCEV types are sequences of the same arithmetic operator applied
  /// to constants and values that may be extended or truncated.
  bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
                          uint64_t DwarfOp) {
    assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
           "Expected arithmetic SCEV type");
    bool Success = true;
    unsigned EmitOperator = 0;
    for (const auto &Op : CommExpr->operands()) {
      Success &= pushSCEV(Op);

      // Emit the operator after the second and each subsequent operand
      // (postfix DWARF stack form).
      if (EmitOperator >= 1)
        pushOperator(DwarfOp);
      ++EmitOperator;
    }
    return Success;
  }

  // TODO: Identify and omit noop casts.
  bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
    const llvm::SCEV *Inner = C->getOperand(0);
    const llvm::Type *Type = C->getType();
    uint64_t ToWidth = Type->getIntegerBitWidth();
    bool Success = pushSCEV(Inner);
    uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
                          IsSigned ? llvm::dwarf::DW_ATE_signed
                                   : llvm::dwarf::DW_ATE_unsigned};
    for (const auto &Op : CastOps)
      pushOperator(Op);
    return Success;
  }

  // TODO: MinMax - although these haven't been encountered in the test suite.
  /// Translate S into DWARF ops appended to Expr. Returns false for
  /// unsupported or unrepresentable SCEVs.
  bool pushSCEV(const llvm::SCEV *S) {
    bool Success = true;
    if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
      Success &= pushConst(StartInt);

    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
      if (!U->getValue())
        return false;
      pushLocation(U->getValue());

    } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
      Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);

    } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
      Success &= pushSCEV(UDiv->getLHS());
      Success &= pushSCEV(UDiv->getRHS());
      pushOperator(llvm::dwarf::DW_OP_div);

    } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
      // Assert if a new and unknown SCEVCastExpr type is encountered.
      // NOTE(review): the first line of this assert is elided in this excerpt;
      // only its continuation is visible below.
             isa<SCEVSignExtendExpr>(Cast)) &&
             "Unexpected cast type in SCEV.");
      Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));

    } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
      Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);

    } else if (isa<SCEVAddRecExpr>(S)) {
      // Nested SCEVAddRecExpr are generated by nested loops and are currently
      // unsupported.
      return false;

    } else {
      return false;
    }
    return Success;
  }

  /// Return true if the combination of arithmetic operator and underlying
  /// SCEV constant value is an identity function.
  bool isIdentityFunction(uint64_t Op, const SCEV *S) {
    if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
      if (C->getAPInt().getSignificantBits() > 64)
        return false;
      int64_t I = C->getAPInt().getSExtValue();
      switch (Op) {
      // x + 0 and x - 0 are identities.
      case llvm::dwarf::DW_OP_plus:
      case llvm::dwarf::DW_OP_minus:
        return I == 0;
      // x * 1 and x / 1 are identities.
      case llvm::dwarf::DW_OP_mul:
      case llvm::dwarf::DW_OP_div:
        return I == 1;
      }
    }
    return false;
  }

  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
  /// builder's expression stack. The stack should already contain an
  /// expression for the iteration count, so that it can be multiplied by
  /// the stride and added to the start.
  /// Components of the expression are omitted if they are an identity function.
  /// Chain (non-affine) SCEVs are not supported.
  bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
    assert(SAR.isAffine() && "Expected affine SCEV");
    const SCEV *Start = SAR.getStart();
    const SCEV *Stride = SAR.getStepRecurrence(SE);

    // Skip pushing arithmetic noops.
    if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
      if (!pushSCEV(Stride))
        return false;
      pushOperator(llvm::dwarf::DW_OP_mul);
    }
    if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
      if (!pushSCEV(Start))
        return false;
      pushOperator(llvm::dwarf::DW_OP_plus);
    }
    return true;
  }

  /// Create an expression that is an offset from a value (usually the IV).
  void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
    pushLocation(OffsetValue);
    // NOTE(review): the line appending the constant offset to the expression
    // is elided in this excerpt.
    LLVM_DEBUG(
        dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
               << std::to_string(Offset) << "\n");
  }

  /// Combine a translation of the SCEV and the IV to create an expression that
  /// recovers a location's value.
  /// returns true if an expression was created.
  bool createIterCountExpr(const SCEV *S,
                           const SCEVDbgValueBuilder &IterationCount,
                           ScalarEvolution &SE) {
    // SCEVs for SSA values are most frequently of the form
    // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
    // This is because %a is a PHI node that is not the IV. However, these
    // SCEVs have not been observed to result in debuginfo-lossy optimisations,
    // so it's not expected this point will be reached.
    if (!isa<SCEVAddRecExpr>(S))
      return false;

    LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
                      << '\n');

    const auto *Rec = cast<SCEVAddRecExpr>(S);
    if (!Rec->isAffine())
      return false;

    // NOTE(review): a guard condition line is elided here in this excerpt;
    // the bail-out below belongs to it.
      return false;

    // Initialise a new builder with the iteration count expression. In
    // combination with the value's SCEV this enables recovery.
    clone(IterationCount);
    if (!SCEVToValueExpr(*Rec, SE))
      return false;

    return true;
  }

  /// Convert a SCEV of a value to a DIExpression that is pushed onto the
  /// builder's expression stack. The stack should already contain an
  /// expression for the iteration count, so that it can be multiplied by
  /// the stride and added to the start.
  /// Components of the expression are omitted if they are an identity function.
  bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
                           ScalarEvolution &SE) {
    assert(SAR.isAffine() && "Expected affine SCEV");
    const SCEV *Start = SAR.getStart();
    const SCEV *Stride = SAR.getStepRecurrence(SE);

    // Skip pushing arithmetic noops.
    if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
      if (!pushSCEV(Start))
        return false;
      pushOperator(llvm::dwarf::DW_OP_minus);
    }
    if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
      if (!pushSCEV(Stride))
        return false;
      pushOperator(llvm::dwarf::DW_OP_div);
    }
    return true;
  }

  // Append the current expression and locations to a location list and an
  // expression list. Modify the DW_OP_LLVM_arg indexes to account for
  // the locations already present in the destination list.
  void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
                       SmallVectorImpl<Value *> &DestLocations) {
    assert(!DestLocations.empty() &&
           "Expected the locations vector to contain the IV");
    // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
    // modified to account for the locations already in the destination vector.
    // All builders contain the IV as the first location op.
    assert(!LocationOps.empty() &&
           "Expected the location ops to contain the IV.");
    // DestIndexMap[n] contains the index in DestLocations for the nth
    // location in this SCEVDbgValueBuilder.
    SmallVector<uint64_t, 2> DestIndexMap;
    for (const auto &Op : LocationOps) {
      auto It = find(DestLocations, Op);
      if (It != DestLocations.end()) {
        // Location already exists in DestLocations, reuse existing ArgIndex.
        DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
        continue;
      }
      // Location is not in DestLocations, add it.
      DestIndexMap.push_back(DestLocations.size());
      DestLocations.push_back(Op);
    }

    for (const auto &Op : expr_ops()) {
      if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
        Op.appendToVector(DestExpr);
        continue;
      }

      // NOTE(review): the line re-emitting the DW_OP_LLVM_arg opcode into
      // DestExpr is elided in this excerpt.
      // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
      // DestIndexMap[n] contains its new index in DestLocations.
      uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
      DestExpr.push_back(NewIndex);
    }
  }
};
6666
6667/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6668/// and DIExpression.
/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
/// and DIExpression.
struct DVIRecoveryRec {
  DVIRecoveryRec(DbgVariableRecord *DVR)
      : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}

  // The debug record being salvaged.
  DbgVariableRecord *DbgRef;
  // The pre-LSR DIExpression, cached so it can be restored.
  DIExpression *Expr;
  // Whether the record originally packed its locations in a DIArgList.
  bool HadLocationArgList;
  // The pre-LSR location operands, held weakly in case LSR erases them.
  SmallVector<WeakVH, 2> LocationOps;
  // NOTE(review): the declarations of the cached SCEVs and the per-location
  // RecoveryExprs vectors are elided in this excerpt; clear() below and the
  // salvage code reference them.

  void clear() {
    for (auto &RE : RecoveryExprs)
      RE.reset();
    RecoveryExprs.clear();
  }

  ~DVIRecoveryRec() { clear(); }
};
6688} // namespace
6689
/// Returns the total number of DW_OP_llvm_arg operands in the expression.
/// This helps in determining if a DIArglist is necessary or can be omitted from
/// the dbg.value.
// NOTE(review): the signature line of this helper (numLLVMArgOps) is elided in
// this excerpt.
  auto expr_ops = ToDwarfOpIter(Expr);
  unsigned Count = 0;
  for (auto Op : expr_ops)
    if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
      Count++;
  return Count;
}
6701
6702/// Overwrites DVI with the location and Ops as the DIExpression. This will
6703/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6704/// because a DIArglist is not created for the first argument of the dbg.value.
6705template <typename T>
6706static void updateDVIWithLocation(T &DbgVal, Value *Location,
6708 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6709 "contain any DW_OP_llvm_arg operands.");
6710 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6711 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6712 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6713}
6714
/// Overwrite DVI with locations placed into a DIArglist.
template <typename T>
static void updateDVIWithLocations(T &DbgVal,
                                   SmallVectorImpl<Value *> &Locations,
// NOTE(review): the final parameter line (the expression ops vector, Ops) is
// elided in this excerpt.
  assert(numLLVMArgOps(Ops) != 0 &&
         "Expected expression that references DIArglist locations using "
         "DW_OP_llvm_arg operands.");
  // NOTE(review): the declaration of MetadataLocs is elided in this excerpt.
  for (Value *V : Locations)
    MetadataLocs.push_back(ValueAsMetadata::get(V));
  auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
  DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
}
6730
/// Write the new expression and new location ops for the dbg.value. If possible
/// reduce the size of the dbg.value by omitting DIArglist. This
/// can be omitted if:
/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
                           SmallVectorImpl<Value *> &NewLocationOps,
// NOTE(review): the final parameter line (the NewExpr ops vector) is elided in
// this excerpt.
  DbgVariableRecord *DbgVal = DVIRec.DbgRef;
  unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
  if (NumLLVMArgs == 0) {
    // Location assumed to be on the stack.
    updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
  } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
    // There is only a single DW_OP_llvm_arg at the start of the expression,
    // so it can be omitted along with DIArglist.
    assert(NewExpr[1] == 0 &&
           "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
    // NOTE(review): the construction of ShortenedOps (NewExpr without the
    // leading DW_OP_LLVM_arg pair) is elided in this excerpt.
    updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
  } else {
    // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
    updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
  }

  // If the DIExpression was previously empty then add the stack terminator.
  // Non-empty expressions have only had elements inserted into them and so
  // the terminator should already be present e.g. stack_value or fragment.
  DIExpression *SalvageExpr = DbgVal->getExpression();
  if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
    SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
    DbgVal->setExpression(SalvageExpr);
  }
}
6765
/// Cached location ops may be erased during LSR, in which case a poison is
/// required when restoring from the cache. The type of that location is no
/// longer available, so just use int8. The poison will be replaced by one or
/// more locations later when a SCEVDbgValueBuilder selects alternative
/// locations to use for the salvage.
// NOTE(review): the signature line of this helper (getValueOrPoison) is elided
// in this excerpt.
  return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
}
6774
/// Restore the DVI's pre-LSR arguments. Substitute poison (via
/// getValueOrPoison) for any erased values.
static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
  DbgVariableRecord *DbgVal = DVIRec.DbgRef;
  LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
                    << "scev-salvage: post-LSR: " << *DbgVal << '\n');
  assert(DVIRec.Expr && "Expected an expression");
  DbgVal->setExpression(DVIRec.Expr);

  // Even a single location-op may be inside a DIArgList and referenced with
  // DW_OP_LLVM_arg, which is valid only with a DIArgList.
  if (!DVIRec.HadLocationArgList) {
    assert(DVIRec.LocationOps.size() == 1 &&
           "Unexpected number of location ops.");
    // LSR's unsuccessful salvage attempt may have added DIArgList, which in
    // this case was not present before, so force the location back to a
    // single uncontained Value.
    Value *CachedValue =
        getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
    DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
  } else {
    // NOTE(review): the declaration of MetadataLocs is elided in this excerpt.
    for (WeakVH VH : DVIRec.LocationOps) {
      Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
      MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
    }
    auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
    DbgVal->setRawLocation(
        llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
  }
  LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
}
6806
// NOTE(review): the first signature line of this function (SalvageDVI) is
// elided in this excerpt; the remaining parameters follow. Returns true when
// the debug record was successfully rewritten.
    llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
    const SCEV *SCEVInductionVar,
    SCEVDbgValueBuilder IterCountExpr) {

  // Only records whose location has been killed need recovery.
  if (!DVIRec.DbgRef->isKillLocation())
    return false;

  // LSR may have caused several changes to the dbg.value in the failed salvage
  // attempt. So restore the DIExpression, the location ops and also the
  // location ops format, which is always DIArglist for multiple ops, but only
  // sometimes for a single op.
  // NOTE(review): the call restoring the pre-transform state is elided here.

  // LocationOpIndexMap[i] will store the post-LSR location index of
  // the non-optimised out location at pre-LSR index i.
  SmallVector<int64_t, 2> LocationOpIndexMap;
  LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
  SmallVector<Value *, 2> NewLocationOps;
  NewLocationOps.push_back(LSRInductionVar);

  for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
    WeakVH VH = DVIRec.LocationOps[i];
    // Place the locations not optimised out in the list first, avoiding
    // inserts later. The map is used to update the DIExpression's
    // DW_OP_LLVM_arg arguments as the expression is updated.
    if (VH && !isa<UndefValue>(VH)) {
      NewLocationOps.push_back(VH);
      LocationOpIndexMap[i] = NewLocationOps.size() - 1;
      LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
                        << " now at index " << LocationOpIndexMap[i] << "\n");
      continue;
    }

    // It's possible that a value referred to in the SCEV may have been
    // optimised out by LSR.
    if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
        SE.containsUndefs(DVIRec.SCEVs[i])) {
      LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
                        << " refers to a location that is now undef or erased. "
                           "Salvage abandoned.\n");
      return false;
    }

    LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
                      << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");

    DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
    SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();

    // Create an offset-based salvage expression if possible, as it requires
    // less DWARF ops than an iteration count-based expression.
    if (std::optional<APInt> Offset =
            SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
      if (Offset->getSignificantBits() <= 64)
        SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
      else
        return false;
    } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
                                                 SE))
      return false;
  }

  // Merge the DbgValueBuilder generated expressions and the original
  // DIExpression, place the result into a new vector.
  // NOTE(review): the declaration of the NewExpr vector is elided in this
  // excerpt.
  if (DVIRec.Expr->getNumElements() == 0) {
    assert(DVIRec.RecoveryExprs.size() == 1 &&
           "Expected only a single recovery expression for an empty "
           "DIExpression.");
    assert(DVIRec.RecoveryExprs[0] &&
           "Expected a SCEVDbgSalvageBuilder for location 0");
    SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
    B->appendToVectors(NewExpr, NewLocationOps);
  }
  for (const auto &Op : DVIRec.Expr->expr_ops()) {
    // Most Ops needn't be updated.
    if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
      Op.appendToVector(NewExpr);
      continue;
    }

    uint64_t LocationArgIndex = Op.getArg(0);
    SCEVDbgValueBuilder *DbgBuilder =
        DVIRec.RecoveryExprs[LocationArgIndex].get();
    // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
    // optimise it away. So just translate the argument to the updated
    // location index.
    if (!DbgBuilder) {
      NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
      assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
             "Expected a positive index for the location-op position.");
      NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
      continue;
    }
    // The location has a recovery expression.
    DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
  }

  UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
  LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
  return true;
}
6910
/// Obtain an expression for the iteration count, then attempt to salvage the
/// dbg.value intrinsics.
// NOTE(review): the signature line of this function (DbgRewriteSalvageableDVIs)
// is elided in this excerpt; the parameters follow.
    llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
    SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
  if (DVIToUpdate.empty())
    return;

  const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
  assert(SCEVInductionVar &&
         "Anticipated a SCEV for the post-LSR induction variable");

  // Salvaging is only implemented for affine induction variables.
  if (const SCEVAddRecExpr *IVAddRec =
          dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
    if (!IVAddRec->isAffine())
      return;

    // Prevent translation using excessive resources.
    if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
      return;

    // The iteration count is required to recover location values.
    SCEVDbgValueBuilder IterCountExpr;
    IterCountExpr.pushLocation(LSRInductionVar);
    if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
      return;

    LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
                      << '\n');

    for (auto &DVIRec : DVIToUpdate) {
      SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
                 IterCountExpr);
    }
  }
}
6947
6948/// Identify and cache salvageable DVI locations and expressions along with the
6949/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6950/// cacheing and salvaging.
6952 Loop *L, ScalarEvolution &SE,
6953 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6954 for (const auto &B : L->getBlocks()) {
6955 for (auto &I : *B) {
6956 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6957 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6958 continue;
6959
6960 // Ensure that if any location op is undef that the dbg.vlue is not
6961 // cached.
6962 if (DbgVal.isKillLocation())
6963 continue;
6964
6965 // Check that the location op SCEVs are suitable for translation to
6966 // DIExpression.
6967 const auto &HasTranslatableLocationOps =
6968 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6969 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6970 if (!LocOp)
6971 return false;
6972
6973 if (!SE.isSCEVable(LocOp->getType()))
6974 return false;
6975
6976 const SCEV *S = SE.getSCEV(LocOp);
6977 if (SE.containsUndefs(S))
6978 return false;
6979 }
6980 return true;
6981 };
6982
6983 if (!HasTranslatableLocationOps(DbgVal))
6984 continue;
6985
6986 std::unique_ptr<DVIRecoveryRec> NewRec =
6987 std::make_unique<DVIRecoveryRec>(&DbgVal);
6988 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6989 // it. Pre-allocating a vector will enable quick lookups of the builder
6990 // later during the salvage.
6991 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6992 for (const auto LocOp : DbgVal.location_ops()) {
6993 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6994 NewRec->LocationOps.push_back(LocOp);
6995 NewRec->HadLocationArgList = DbgVal.hasArgList();
6996 }
6997 SalvageableDVISCEVs.push_back(std::move(NewRec));
6998 }
6999 }
7000 }
7001}
7002
7003/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7004/// any PHi from the loop header is usable, but may have less chance of
7005/// surviving subsequent transforms.
7007 const LSRInstance &LSR) {
7008
7009 auto IsSuitableIV = [&](PHINode *P) {
7010 if (!SE.isSCEVable(P->getType()))
7011 return false;
7012 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7013 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7014 return false;
7015 };
7016
7017 // For now, just pick the first IV that was generated and inserted by
7018 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7019 // by subsequent transforms.
7020 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7021 if (!IV)
7022 continue;
7023
7024 // There should only be PHI node IVs.
7025 PHINode *P = cast<PHINode>(&*IV);
7026
7027 if (IsSuitableIV(P))
7028 return P;
7029 }
7030
7031 for (PHINode &P : L.getHeader()->phis()) {
7032 if (IsSuitableIV(&P))
7033 return &P;
7034 }
7035 return nullptr;
7036}
7037
7039 DominatorTree &DT, LoopInfo &LI,
7040 const TargetTransformInfo &TTI,
7042 MemorySSA *MSSA) {
7043
7044 // Debug preservation - before we start removing anything identify which DVI
7045 // meet the salvageable criteria and store their DIExpression and SCEVs.
7046 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7047 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7048
7049 bool Changed = false;
7050 std::unique_ptr<MemorySSAUpdater> MSSAU;
7051 if (MSSA)
7052 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7053
7054 // Run the main LSR transformation.
7055 const LSRInstance &Reducer =
7056 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7057 Changed |= Reducer.getChanged();
7058
7059 // Remove any extra phis created by processing inner loops.
7060 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7061 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7063 SCEVExpander Rewriter(SE, "lsr", false);
7064#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7065 Rewriter.setDebugType(DEBUG_TYPE);
7066#endif
7067 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7068 Rewriter.clear();
7069 if (numFolded) {
7070 Changed = true;
7072 MSSAU.get());
7073 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7074 }
7075 }
7076 // LSR may at times remove all uses of an induction variable from a loop.
7077 // The only remaining use is the PHI in the exit block.
7078 // When this is the case, if the exit value of the IV can be calculated using
7079 // SCEV, we can replace the exit block PHI with the final value of the IV and
7080 // skip the updates in each loop iteration.
7081 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7083 SCEVExpander Rewriter(SE, "lsr", true);
7084 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7085 UnusedIndVarInLoop, DeadInsts);
7086 Rewriter.clear();
7087 if (Rewrites) {
7088 Changed = true;
7090 MSSAU.get());
7091 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7092 }
7093 }
7094
7095 if (SalvageableDVIRecords.empty())
7096 return Changed;
7097
7098 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7099 // expressions composed using the derived iteration count.
7100 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7101 for (const auto &L : LI) {
7102 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7103 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7104 else {
7105 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7106 "could not be identified.\n");
7107 }
7108 }
7109
7110 for (auto &Rec : SalvageableDVIRecords)
7111 Rec->clear();
7112 SalvageableDVIRecords.clear();
7113 return Changed;
7114}
7115
7116bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7117 if (skipLoop(L))
7118 return false;
7119
7120 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7121 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7122 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7123 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7124 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7125 *L->getHeader()->getParent());
7126 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7127 *L->getHeader()->getParent());
7128 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7129 *L->getHeader()->getParent());
7130 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7131 MemorySSA *MSSA = nullptr;
7132 if (MSSAAnalysis)
7133 MSSA = &MSSAAnalysis->getMSSA();
7134 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7135}
7136
7139 LPMUpdater &) {
7140 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7141 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7142 return PreservedAnalyses::all();
7143
7144 auto PA = getLoopPassPreservedAnalyses();
7145 if (AR.MSSA)
7146 PA.preserve<MemorySSAAnalysis>();
7147 return PA;
7148}
7149
// Pass identification for the legacy pass manager (address is the unique ID).
char LoopStrengthReduce::ID = 0;
7151
7152INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7153 "Loop Strength Reduction", false, false)
7159INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7160INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7161 "Loop Strength Reduction", false, false)
7162
7163Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< SCEVUse > &Good, SmallVectorImpl< SCEVUse > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static GlobalValue * ExtractSymbol(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1686
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1554
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1787
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
Value * getCondition() const
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:316
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:186
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyze scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< SCEVUse > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
static constexpr auto FlagAnyWrap
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAddRecExpr(SCEVUse Start, SCEVUse Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:241
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Use * op_iterator
Definition User.h:254
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
iterator_range< use_iterator > uses()
Definition Value.h:380
A Value handle that is allowed to be null.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
match_bind< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1666
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:550
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
@ UnusedIndVarInLoop
Definition LoopUtils.h:569
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
SCEVUseT< const SCEV * > SCEVUse
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.