LLVM 20.0.0git
LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have the loop induction variable as one or more of their components; it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/Type.h"
100#include "llvm/IR/Use.h"
101#include "llvm/IR/User.h"
102#include "llvm/IR/Value.h"
103#include "llvm/IR/ValueHandle.h"
105#include "llvm/Pass.h"
106#include "llvm/Support/Casting.h"
109#include "llvm/Support/Debug.h"
119#include <algorithm>
120#include <cassert>
121#include <cstddef>
122#include <cstdint>
123#include <iterator>
124#include <limits>
125#include <map>
126#include <numeric>
127#include <optional>
128#include <utility>
129
130using namespace llvm;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Clean up congruent phis after LSR phi expansion.
147static cl::opt<bool> EnablePhiElim(
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// This flag adds the instruction count to the solution cost comparison.
152static cl::opt<bool> InsnsCost(
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow a complex LSR solution.
157static cl::opt<bool> LSRExpNarrow(
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 "none",
174 "Don't prefer any addressing mode"),
176 "preindexed",
177 "Prefer pre-indexed addressing mode"),
179 "postindexed",
180 "Prefer post-indexed addressing mode")));
181
183 "lsr-complexity-limit", cl::Hidden,
184 cl::init(std::numeric_limits<uint16_t>::max()),
185 cl::desc("LSR search space complexity limit"));
186
188 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
189 cl::desc("The limit on recursion depth for LSRs setup cost"));
190
192 "lsr-drop-solution", cl::Hidden,
193 cl::desc("Attempt to drop solution if it is less profitable"));
194
196 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
197 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
198
200 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
201 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
202
203#ifndef NDEBUG
204// Stress test IV chain generation.
206 "stress-ivchain", cl::Hidden, cl::init(false),
207 cl::desc("Stress test LSR IV chains"));
208#else
209static bool StressIVChain = false;
210#endif
211
212namespace {
213
214struct MemAccessTy {
215 /// Used in situations where the accessed memory type is unknown.
216 static const unsigned UnknownAddressSpace =
217 std::numeric_limits<unsigned>::max();
218
219 Type *MemTy = nullptr;
220 unsigned AddrSpace = UnknownAddressSpace;
221
222 MemAccessTy() = default;
223 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
224
225 bool operator==(MemAccessTy Other) const {
226 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
227 }
228
229 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
230
231 static MemAccessTy getUnknown(LLVMContext &Ctx,
232 unsigned AS = UnknownAddressSpace) {
233 return MemAccessTy(Type::getVoidTy(Ctx), AS);
234 }
235
236 Type *getType() { return MemTy; }
237};
238
239/// This class holds data which is used to order reuse candidates.
240class RegSortData {
241public:
242 /// This represents the set of LSRUse indices which reference
243 /// a particular register.
244 SmallBitVector UsedByIndices;
245
246 void print(raw_ostream &OS) const;
247 void dump() const;
248};
249
250// An offset from an address that is either scalable or fixed. Used for
251// per-target optimizations of addressing modes.
252class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
253 constexpr Immediate(ScalarTy MinVal, bool Scalable)
254 : FixedOrScalableQuantity(MinVal, Scalable) {}
255
256 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
257 : FixedOrScalableQuantity(V) {}
258
259public:
260 constexpr Immediate() = delete;
261
262 static constexpr Immediate getFixed(ScalarTy MinVal) {
263 return {MinVal, false};
264 }
265 static constexpr Immediate getScalable(ScalarTy MinVal) {
266 return {MinVal, true};
267 }
268 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
269 return {MinVal, Scalable};
270 }
271 static constexpr Immediate getZero() { return {0, false}; }
272 static constexpr Immediate getFixedMin() {
273 return {std::numeric_limits<int64_t>::min(), false};
274 }
275 static constexpr Immediate getFixedMax() {
276 return {std::numeric_limits<int64_t>::max(), false};
277 }
278 static constexpr Immediate getScalableMin() {
279 return {std::numeric_limits<int64_t>::min(), true};
280 }
281 static constexpr Immediate getScalableMax() {
282 return {std::numeric_limits<int64_t>::max(), true};
283 }
284
285 constexpr bool isLessThanZero() const { return Quantity < 0; }
286
287 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
288
289 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
290 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
291 }
292
293 constexpr bool isMin() const {
294 return Quantity == std::numeric_limits<ScalarTy>::min();
295 }
296
297 constexpr bool isMax() const {
298 return Quantity == std::numeric_limits<ScalarTy>::max();
299 }
300
301 // Arithmetic 'operators' that cast to unsigned types first.
302 constexpr Immediate addUnsigned(const Immediate &RHS) const {
303 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
304 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
305 return {Value, Scalable || RHS.isScalable()};
306 }
307
308 constexpr Immediate subUnsigned(const Immediate &RHS) const {
309 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
310 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
311 return {Value, Scalable || RHS.isScalable()};
312 }
313
314 // Scale the quantity by a constant without caring about runtime scalability.
315 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
316 ScalarTy Value = (uint64_t)Quantity * RHS;
317 return {Value, Scalable};
318 }
319
320 // Helpers for generating SCEVs with vscale terms where needed.
321 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
322 const SCEV *S = SE.getConstant(Ty, Quantity);
323 if (Scalable)
324 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
325 return S;
326 }
327
328 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
329 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
330 if (Scalable)
331 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
332 return NegS;
333 }
334
335 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
336 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
337 if (Scalable)
338 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
339 return SU;
340 }
341};
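// For illustration of the interface above: a plain byte offset of 16 is
// Immediate::getFixed(16), while a vscale-relative offset of 16 * vscale
// (as produced for scalable-vector types) is Immediate::getScalable(16).
// Combining them follows the rules encoded above, e.g.:
//
//   Immediate A = Immediate::getScalable(16);
//   Immediate B = Immediate::getFixed(0);
//   A.isCompatibleImmediate(B);   // true, because B is zero
//   A.addUnsigned(B);             // still scalable, known-min value 16
//   A.getSCEV(SE, Ty);            // expands to (16 * vscale)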
342
343// This is needed for the Compare type of std::map when Immediate is used
344// as a key. We don't need it to be fully correct against any value of vscale,
345// just to make sure that vscale-related terms in the map are considered against
346// each other rather than being mixed up and potentially missing opportunities.
347struct KeyOrderTargetImmediate {
348 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
349 if (LHS.isScalable() && !RHS.isScalable())
350 return false;
351 if (!LHS.isScalable() && RHS.isScalable())
352 return true;
353 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
354 }
355};
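// For example, under this ordering every fixed immediate sorts before every
// scalable one, and immediates of the same kind compare by their known
// minimum value:
//
//   KeyOrderTargetImmediate Less;
//   Less(Immediate::getFixed(100), Immediate::getScalable(4));  // true
//   Less(Immediate::getScalable(4), Immediate::getFixed(100));  // false
//   Less(Immediate::getFixed(4), Immediate::getFixed(100));     // true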
356
357// This would be nicer if we could be generic instead of directly using size_t,
358// but there doesn't seem to be a type trait for is_orderable or
359// is_lessthan_comparable or similar.
360struct KeyOrderSizeTAndImmediate {
361 bool operator()(const std::pair<size_t, Immediate> &LHS,
362 const std::pair<size_t, Immediate> &RHS) const {
363 size_t LSize = LHS.first;
364 size_t RSize = RHS.first;
365 if (LSize != RSize)
366 return LSize < RSize;
367 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
368 }
369};
370} // end anonymous namespace
371
372#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
373void RegSortData::print(raw_ostream &OS) const {
374 OS << "[NumUses=" << UsedByIndices.count() << ']';
375}
376
377LLVM_DUMP_METHOD void RegSortData::dump() const {
378 print(errs()); errs() << '\n';
379}
380#endif
381
382namespace {
383
384/// Map register candidates to information about how they are used.
385class RegUseTracker {
386 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
387
388 RegUsesTy RegUsesMap;
389 SmallVector<const SCEV *, 16> RegSequence;
390
391public:
392 void countRegister(const SCEV *Reg, size_t LUIdx);
393 void dropRegister(const SCEV *Reg, size_t LUIdx);
394 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
395
396 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
397
398 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
399
400 void clear();
401
402 using iterator = SmallVectorImpl<const SCEV *>::iterator;
403 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
404
405 iterator begin() { return RegSequence.begin(); }
406 iterator end() { return RegSequence.end(); }
407 const_iterator begin() const { return RegSequence.begin(); }
408 const_iterator end() const { return RegSequence.end(); }
409};
410
411} // end anonymous namespace
412
413void
414RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
415 std::pair<RegUsesTy::iterator, bool> Pair =
416 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing a value
474/// that satisfies a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty, the
489 /// canonical representation of a formula is:
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing a recurrent expr related to the current loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
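/// For example (illustrative, using the notation of Formula::print below),
/// with a loop-invariant %base and the current-loop IV {0,+,4}<%L>:
///   reg(%base) + reg({0,+,4}<%L>)    violates #1 (two base regs, no ScaledReg);
///   1*reg(%base)                     violates #2 (1*reg should just be reg);
///   reg(%base) + 1*reg({0,+,4}<%L>)  is canonical.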
502 SmallVector<const SCEV *, 4> BaseRegs;
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which is added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 size_t getNumRegs() const;
526 Type *getType() const;
527
528 void deleteBaseReg(const SCEV *&S);
529
530 bool referencesReg(const SCEV *S) const;
531 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
532 const RegUseTracker &RegUses) const;
533
534 void print(raw_ostream &OS) const;
535 void dump() const;
536};
537
538} // end anonymous namespace
539
540/// Recursion helper for initialMatch.
541static void DoInitialMatch(const SCEV *S, Loop *L,
544 ScalarEvolution &SE) {
545 // Collect expressions which properly dominate the loop header.
546 if (SE.properlyDominates(S, L->getHeader())) {
547 Good.push_back(S);
548 return;
549 }
550
551 // Look at add operands.
552 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
553 for (const SCEV *S : Add->operands())
554 DoInitialMatch(S, L, Good, Bad, SE);
555 return;
556 }
557
558 // Look at addrec operands.
559 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
560 if (!AR->getStart()->isZero() && AR->isAffine()) {
561 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
562 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
563 AR->getStepRecurrence(SE),
564 // FIXME: AR->getNoWrapFlags()
565 AR->getLoop(), SCEV::FlagAnyWrap),
566 L, Good, Bad, SE);
567 return;
568 }
569
570 // Handle a multiplication by -1 (negation) if it didn't fold.
571 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
572 if (Mul->getOperand(0)->isAllOnesValue()) {
573 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
574 const SCEV *NewMul = SE.getMulExpr(Ops);
575
576 SmallVector<const SCEV *, 4> MyGood;
577 SmallVector<const SCEV *, 4> MyBad;
578 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
579 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
580 SE.getEffectiveSCEVType(NewMul->getType())));
581 for (const SCEV *S : MyGood)
582 Good.push_back(SE.getMulExpr(NegOne, S));
583 for (const SCEV *S : MyBad)
584 Bad.push_back(SE.getMulExpr(NegOne, S));
585 return;
586 }
587
588 // Ok, we can't do anything interesting. Just stuff the whole thing into a
589 // register and hope for the best.
590 Bad.push_back(S);
591}
592
593/// Incorporate loop-variant parts of S into this Formula, attempting to keep
594/// all loop-invariant and loop-computable values in a single base register.
595void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
596 SmallVector<const SCEV *, 4> Good;
597 SmallVector<const SCEV *, 4> Bad;
598 DoInitialMatch(S, L, Good, Bad, SE);
599 if (!Good.empty()) {
600 const SCEV *Sum = SE.getAddExpr(Good);
601 if (!Sum->isZero())
602 BaseRegs.push_back(Sum);
603 HasBaseReg = true;
604 }
605 if (!Bad.empty()) {
606 const SCEV *Sum = SE.getAddExpr(Bad);
607 if (!Sum->isZero())
608 BaseRegs.push_back(Sum);
609 HasBaseReg = true;
610 }
611 canonicalize(*L);
612}
613
614static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
615 return SCEVExprContains(S, [&L](const SCEV *S) {
616 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
617 });
618}
619
620/// Check whether or not this formula satisfies the canonical
621/// representation.
622/// \see Formula::BaseRegs.
623bool Formula::isCanonical(const Loop &L) const {
624 assert((Scale == 0 || ScaledReg) &&
625 "ScaledReg must be non-null if Scale is non-zero");
626
627 if (!ScaledReg)
628 return BaseRegs.size() <= 1;
629
630 if (Scale != 1)
631 return true;
632
633 if (Scale == 1 && BaseRegs.empty())
634 return false;
635
636 if (containsAddRecDependentOnLoop(ScaledReg, L))
637 return true;
638
639 // If ScaledReg is not a recurrent expr for the current loop, the formula is
640 // canonical only if no reg in BaseRegs is a recurrent expr for the current
641 // loop either; otherwise that reg should be swapped into ScaledReg.
642 return none_of(BaseRegs, [&L](const SCEV *S) {
643 return containsAddRecDependentOnLoop(S, L);
644 });
645}
646
647/// Helper method to morph a formula into its canonical representation.
648/// \see Formula::BaseRegs.
649/// Every formula having more than one base register, must use the ScaledReg
650/// field. Otherwise, we would have to do special cases everywhere in LSR
651/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
652/// On the other hand, 1*reg should be canonicalized into reg.
653void Formula::canonicalize(const Loop &L) {
654 if (isCanonical(L))
655 return;
656
657 if (BaseRegs.empty()) {
658 // No base reg? Use scale reg with scale = 1 as such.
659 assert(ScaledReg && "Expected 1*reg => reg");
660 assert(Scale == 1 && "Expected 1*reg => reg");
661 BaseRegs.push_back(ScaledReg);
662 Scale = 0;
663 ScaledReg = nullptr;
664 return;
665 }
666
667 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
668 if (!ScaledReg) {
669 ScaledReg = BaseRegs.pop_back_val();
670 Scale = 1;
671 }
672
673 // If ScaledReg is an invariant with respect to L, find the reg from
674 // BaseRegs containing the recurrent expr related with Loop L. Swap the
675 // reg with ScaledReg.
676 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
677 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
678 return containsAddRecDependentOnLoop(S, L);
679 });
680 if (I != BaseRegs.end())
681 std::swap(ScaledReg, *I);
682 }
683 assert(isCanonical(L) && "Failed to canonicalize?");
684}
685
686/// Get rid of the scale in the formula.
687/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
688/// \return true if it was possible to get rid of the scale, false otherwise.
689/// \note After this operation the formula may not be in the canonical form.
690bool Formula::unscale() {
691 if (Scale != 1)
692 return false;
693 Scale = 0;
694 BaseRegs.push_back(ScaledReg);
695 ScaledReg = nullptr;
696 return true;
697}
698
699bool Formula::hasZeroEnd() const {
700 if (UnfoldedOffset || BaseOffset)
701 return false;
702 if (BaseRegs.size() != 1 || ScaledReg)
703 return false;
704 return true;
705}
706
707/// Return the total number of register operands used by this formula. This does
708/// not include register uses implied by non-constant addrec strides.
709size_t Formula::getNumRegs() const {
710 return !!ScaledReg + BaseRegs.size();
711}
712
713/// Return the type of this formula, if it has one, or null otherwise. This type
714/// is meaningless except for the bit size.
715Type *Formula::getType() const {
716 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
717 ScaledReg ? ScaledReg->getType() :
718 BaseGV ? BaseGV->getType() :
719 nullptr;
720}
721
722/// Delete the given base reg from the BaseRegs list.
723void Formula::deleteBaseReg(const SCEV *&S) {
724 if (&S != &BaseRegs.back())
725 std::swap(S, BaseRegs.back());
726 BaseRegs.pop_back();
727}
728
729/// Test if this formula references the given register.
730bool Formula::referencesReg(const SCEV *S) const {
731 return S == ScaledReg || is_contained(BaseRegs, S);
732}
733
734/// Test whether this formula uses registers which are used by uses other than
735/// the use with the given index.
736bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
737 const RegUseTracker &RegUses) const {
738 if (ScaledReg)
739 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
740 return true;
741 for (const SCEV *BaseReg : BaseRegs)
742 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
743 return true;
744 return false;
745}
746
747#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
748void Formula::print(raw_ostream &OS) const {
749 bool First = true;
750 if (BaseGV) {
751 if (!First) OS << " + "; else First = false;
752 BaseGV->printAsOperand(OS, /*PrintType=*/false);
753 }
754 if (BaseOffset.isNonZero()) {
755 if (!First) OS << " + "; else First = false;
756 OS << BaseOffset;
757 }
758 for (const SCEV *BaseReg : BaseRegs) {
759 if (!First) OS << " + "; else First = false;
760 OS << "reg(" << *BaseReg << ')';
761 }
762 if (HasBaseReg && BaseRegs.empty()) {
763 if (!First) OS << " + "; else First = false;
764 OS << "**error: HasBaseReg**";
765 } else if (!HasBaseReg && !BaseRegs.empty()) {
766 if (!First) OS << " + "; else First = false;
767 OS << "**error: !HasBaseReg**";
768 }
769 if (Scale != 0) {
770 if (!First) OS << " + "; else First = false;
771 OS << Scale << "*reg(";
772 if (ScaledReg)
773 OS << *ScaledReg;
774 else
775 OS << "<unknown>";
776 OS << ')';
777 }
778 if (UnfoldedOffset.isNonZero()) {
779 if (!First) OS << " + ";
780 OS << "imm(" << UnfoldedOffset << ')';
781 }
782}
783
784LLVM_DUMP_METHOD void Formula::dump() const {
785 print(errs()); errs() << '\n';
786}
787#endif
788
789/// Return true if the given addrec can be sign-extended without changing its
790/// value.
791static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
792 Type *WideTy =
793 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
794 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
795}
796
797/// Return true if the given add can be sign-extended without changing its
798/// value.
799static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
800 Type *WideTy =
801 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
802 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
803}
804
805/// Return true if the given mul can be sign-extended without changing its
806/// value.
807static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
808 Type *WideTy =
809 IntegerType::get(SE.getContext(),
810 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
811 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
812}
813
814/// Return an expression for LHS /s RHS, if it can be determined and if the
815/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
816/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
817/// the multiplication may overflow, which is useful when the result will be
818/// used in a context where the most significant bits are ignored.
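/// For example (illustrative): dividing the addrec {8,+,4}<%L> by the constant
/// 4 yields {2,+,1}<%L>, while dividing the constant 6 by 4 yields null because
/// the remainder is non-zero.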
819static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
820 ScalarEvolution &SE,
821 bool IgnoreSignificantBits = false) {
822 // Handle the trivial case, which works for any SCEV type.
823 if (LHS == RHS)
824 return SE.getConstant(LHS->getType(), 1);
825
826 // Handle a few RHS special cases.
827 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
828 if (RC) {
829 const APInt &RA = RC->getAPInt();
830 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
831 // some folding.
832 if (RA.isAllOnes()) {
833 if (LHS->getType()->isPointerTy())
834 return nullptr;
835 return SE.getMulExpr(LHS, RC);
836 }
837 // Handle x /s 1 as x.
838 if (RA == 1)
839 return LHS;
840 }
841
842 // Check for a division of a constant by a constant.
843 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
844 if (!RC)
845 return nullptr;
846 const APInt &LA = C->getAPInt();
847 const APInt &RA = RC->getAPInt();
848 if (LA.srem(RA) != 0)
849 return nullptr;
850 return SE.getConstant(LA.sdiv(RA));
851 }
852
853 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
854 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
855 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
856 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
857 IgnoreSignificantBits);
858 if (!Step) return nullptr;
859 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
860 IgnoreSignificantBits);
861 if (!Start) return nullptr;
862 // FlagNW is independent of the start value, step direction, and is
863 // preserved with smaller magnitude steps.
864 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
865 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
866 }
867 return nullptr;
868 }
869
870 // Distribute the sdiv over add operands, if the add doesn't overflow.
871 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
872 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
873 SmallVector<const SCEV *, 8> Ops;
874 for (const SCEV *S : Add->operands()) {
875 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
876 if (!Op) return nullptr;
877 Ops.push_back(Op);
878 }
879 return SE.getAddExpr(Ops);
880 }
881 return nullptr;
882 }
883
884 // Check for a multiply operand that we can pull RHS out of.
885 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
886 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
887 // Handle special case C1*X*Y /s C2*X*Y.
888 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
889 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
890 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
891 const SCEVConstant *RC =
892 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
893 if (LC && RC) {
894 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
895 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
896 if (LOps == ROps)
897 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
898 }
899 }
900 }
901
902 SmallVector<const SCEV *, 4> Ops;
903 bool Found = false;
904 for (const SCEV *S : Mul->operands()) {
905 if (!Found)
906 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
907 IgnoreSignificantBits)) {
908 S = Q;
909 Found = true;
910 }
911 Ops.push_back(S);
912 }
913 return Found ? SE.getMulExpr(Ops) : nullptr;
914 }
915 return nullptr;
916 }
917
918 // Otherwise we don't know.
919 return nullptr;
920}
921
922/// If S involves the addition of a constant integer value, return that integer
923/// value, and mutate S to point to a new SCEV with that value excluded.
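/// For example (illustrative): given S = {(4 + %p),+,8}<%L>, this returns a
/// fixed immediate of 4 and rewrites S to {%p,+,8}<%L>; given S = (16 * vscale)
/// with vscale immediates enabled, it returns a scalable immediate of 16 and
/// rewrites S to 0.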
924static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
925 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
926 if (C->getAPInt().getSignificantBits() <= 64) {
927 S = SE.getConstant(C->getType(), 0);
928 return Immediate::getFixed(C->getValue()->getSExtValue());
929 }
930 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
931 SmallVector<const SCEV *, 8> NewOps(Add->operands());
932 Immediate Result = ExtractImmediate(NewOps.front(), SE);
933 if (Result.isNonZero())
934 S = SE.getAddExpr(NewOps);
935 return Result;
936 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
937 SmallVector<const SCEV *, 8> NewOps(AR->operands());
938 Immediate Result = ExtractImmediate(NewOps.front(), SE);
939 if (Result.isNonZero())
940 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
941 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
942 SCEV::FlagAnyWrap);
943 return Result;
944 } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
945 if (EnableVScaleImmediates && M->getNumOperands() == 2) {
946 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
947 if (isa<SCEVVScale>(M->getOperand(1))) {
948 S = SE.getConstant(M->getType(), 0);
949 return Immediate::getScalable(C->getValue()->getSExtValue());
950 }
951 }
952 }
953 return Immediate::getZero();
954}
955
956/// If S involves the addition of a GlobalValue address, return that symbol, and
957/// mutate S to point to a new SCEV with that value excluded.
958static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
959 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
960 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
961 S = SE.getConstant(GV->getType(), 0);
962 return GV;
963 }
964 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
965 SmallVector<const SCEV *, 8> NewOps(Add->operands());
966 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
967 if (Result)
968 S = SE.getAddExpr(NewOps);
969 return Result;
970 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
971 SmallVector<const SCEV *, 8> NewOps(AR->operands());
972 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
973 if (Result)
974 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
975 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
976 SCEV::FlagAnyWrap);
977 return Result;
978 }
979 return nullptr;
980}
981
982/// Returns true if the specified instruction is using the specified value as an
983/// address.
984static bool isAddressUse(const TargetTransformInfo &TTI,
985 Instruction *Inst, Value *OperandVal) {
986 bool isAddress = isa<LoadInst>(Inst);
987 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
988 if (SI->getPointerOperand() == OperandVal)
989 isAddress = true;
990 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
991 // Addressing modes can also be folded into prefetches and a variety
992 // of intrinsics.
993 switch (II->getIntrinsicID()) {
994 case Intrinsic::memset:
995 case Intrinsic::prefetch:
996 case Intrinsic::masked_load:
997 if (II->getArgOperand(0) == OperandVal)
998 isAddress = true;
999 break;
1000 case Intrinsic::masked_store:
1001 if (II->getArgOperand(1) == OperandVal)
1002 isAddress = true;
1003 break;
1004 case Intrinsic::memmove:
1005 case Intrinsic::memcpy:
1006 if (II->getArgOperand(0) == OperandVal ||
1007 II->getArgOperand(1) == OperandVal)
1008 isAddress = true;
1009 break;
1010 default: {
1011 MemIntrinsicInfo IntrInfo;
1012 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1013 if (IntrInfo.PtrVal == OperandVal)
1014 isAddress = true;
1015 }
1016 }
1017 }
1018 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1019 if (RMW->getPointerOperand() == OperandVal)
1020 isAddress = true;
1021 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1022 if (CmpX->getPointerOperand() == OperandVal)
1023 isAddress = true;
1024 }
1025 return isAddress;
1026}
1027
1028/// Return the type of the memory being accessed.
1029static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1030 Instruction *Inst, Value *OperandVal) {
1031 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1032
1033 // First get the type of memory being accessed.
1034 if (Type *Ty = Inst->getAccessType())
1035 AccessTy.MemTy = Ty;
1036
1037 // Then get the pointer address space.
1038 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1039 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1040 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1041 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1042 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1043 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1044 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1045 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1046 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1047 switch (II->getIntrinsicID()) {
1048 case Intrinsic::prefetch:
1049 case Intrinsic::memset:
1050 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1051 AccessTy.MemTy = OperandVal->getType();
1052 break;
1053 case Intrinsic::memmove:
1054 case Intrinsic::memcpy:
1055 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1056 AccessTy.MemTy = OperandVal->getType();
1057 break;
1058 case Intrinsic::masked_load:
1059 AccessTy.AddrSpace =
1060 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1061 break;
1062 case Intrinsic::masked_store:
1063 AccessTy.AddrSpace =
1064 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1065 break;
1066 default: {
1067 MemIntrinsicInfo IntrInfo;
1068 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1069 AccessTy.AddrSpace
1070 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1071 }
1072
1073 break;
1074 }
1075 }
1076 }
1077
1078 return AccessTy;
1079}
1080
1081/// Return true if this AddRec is already a phi in its loop.
1082static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1083 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1084 if (SE.isSCEVable(PN.getType()) &&
1085 (SE.getEffectiveSCEVType(PN.getType()) ==
1086 SE.getEffectiveSCEVType(AR->getType())) &&
1087 SE.getSCEV(&PN) == AR)
1088 return true;
1089 }
1090 return false;
1091}
1092
1093/// Check if expanding this expression is likely to incur significant cost. This
1094/// is tricky because SCEV doesn't track which expressions are actually computed
1095/// by the current IR.
1096///
1097/// We currently allow expansion of IV increments that involve adds,
1098/// multiplication by constants, and AddRecs from existing phis.
1099///
1100/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1101/// obvious multiple of the UDivExpr.
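/// For example (illustrative): (4 * {0,+,1}<%L>) is not considered high cost
/// when {0,+,1}<%L> is already a phi in the loop, whereas a UDivExpr or a
/// min/max expression is conservatively treated as high cost.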
1102static bool isHighCostExpansion(const SCEV *S,
1104 ScalarEvolution &SE) {
1105 // Zero/One operand expressions
1106 switch (S->getSCEVType()) {
1107 case scUnknown:
1108 case scConstant:
1109 case scVScale:
1110 return false;
1111 case scTruncate:
1112 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1113 Processed, SE);
1114 case scZeroExtend:
1115 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1116 Processed, SE);
1117 case scSignExtend:
1118 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1119 Processed, SE);
1120 default:
1121 break;
1122 }
1123
1124 if (!Processed.insert(S).second)
1125 return false;
1126
1127 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1128 for (const SCEV *S : Add->operands()) {
1129 if (isHighCostExpansion(S, Processed, SE))
1130 return true;
1131 }
1132 return false;
1133 }
1134
1135 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1136 if (Mul->getNumOperands() == 2) {
1137 // Multiplication by a constant is ok
1138 if (isa<SCEVConstant>(Mul->getOperand(0)))
1139 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1140
1141 // If we have the value of one operand, check if an existing
1142 // multiplication already generates this expression.
1143 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1144 Value *UVal = U->getValue();
1145 for (User *UR : UVal->users()) {
1146 // If U is a constant, it may be used by a ConstantExpr.
1147 Instruction *UI = dyn_cast<Instruction>(UR);
1148 if (UI && UI->getOpcode() == Instruction::Mul &&
1149 SE.isSCEVable(UI->getType())) {
1150 return SE.getSCEV(UI) == Mul;
1151 }
1152 }
1153 }
1154 }
1155 }
1156
1157 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1158 if (isExistingPhi(AR, SE))
1159 return false;
1160 }
1161
1162 // For now, consider any other type of expression (div/mul/min/max) high cost.
1163 return true;
1164}
1165
1166namespace {
1167
1168class LSRUse;
1169
1170} // end anonymous namespace
1171
1172/// Check if the addressing mode defined by \p F is completely
1173/// folded in \p LU at isel time.
1174/// This includes address-mode folding and special icmp tricks.
1175/// This function returns true if \p LU can accommodate what \p F
1176/// defines and up to 1 base + 1 scaled + offset.
1177/// In other words, if \p F has several base registers, this function may
1178/// still return true. Therefore, users still need to account for
1179/// additional base registers and/or unfolded offsets to derive an
1180/// accurate cost model.
1182 const LSRUse &LU, const Formula &F);
1183
1184// Get the cost of the scaling factor used in F for LU.
1186 const LSRUse &LU, const Formula &F,
1187 const Loop &L);
1188
1189namespace {
1190
1191/// This class is used to measure and compare candidate formulae.
1192class Cost {
1193 const Loop *L = nullptr;
1194 ScalarEvolution *SE = nullptr;
1195 const TargetTransformInfo *TTI = nullptr;
1196 TargetTransformInfo::LSRCost C;
1197 TTI::AddressingModeKind AMK = TTI::AMK_None;
1198
1199public:
1200 Cost() = delete;
1201 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1203 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1204 C.Insns = 0;
1205 C.NumRegs = 0;
1206 C.AddRecCost = 0;
1207 C.NumIVMuls = 0;
1208 C.NumBaseAdds = 0;
1209 C.ImmCost = 0;
1210 C.SetupCost = 0;
1211 C.ScaleCost = 0;
1212 }
1213
1214 bool isLess(const Cost &Other) const;
1215
1216 void Lose();
1217
1218#ifndef NDEBUG
1219 // Once any of the metrics loses, they must all remain losers.
1220 bool isValid() {
1221 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1222 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1223 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1224 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1225 }
1226#endif
1227
1228 bool isLoser() {
1229 assert(isValid() && "invalid cost");
1230 return C.NumRegs == ~0u;
1231 }
1232
1233 void RateFormula(const Formula &F,
1235 const DenseSet<const SCEV *> &VisitedRegs,
1236 const LSRUse &LU,
1237 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1238
1239 void print(raw_ostream &OS) const;
1240 void dump() const;
1241
1242private:
1243 void RateRegister(const Formula &F, const SCEV *Reg,
1244 SmallPtrSetImpl<const SCEV *> &Regs);
1245 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1246 SmallPtrSetImpl<const SCEV *> &Regs,
1247 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1248};
1249
1250/// An operand value in an instruction which is to be replaced with some
1251/// equivalent, possibly strength-reduced, replacement.
1252struct LSRFixup {
1253 /// The instruction which will be updated.
1254 Instruction *UserInst = nullptr;
1255
1256 /// The operand of the instruction which will be replaced. The operand may be
1257 /// used more than once; every instance will be replaced.
1258 Value *OperandValToReplace = nullptr;
1259
1260 /// If this user is to use the post-incremented value of an induction
1261 /// variable, this set is non-empty and holds the loops associated with the
1262 /// induction variable.
1263 PostIncLoopSet PostIncLoops;
1264
1265 /// A constant offset to be added to the LSRUse expression. This allows
1266 /// multiple fixups to share the same LSRUse with different offsets, for
1267 /// example in an unrolled loop.
1268 Immediate Offset = Immediate::getZero();
1269
1270 LSRFixup() = default;
1271
1272 bool isUseFullyOutsideLoop(const Loop *L) const;
1273
1274 void print(raw_ostream &OS) const;
1275 void dump() const;
1276};
1277
1278/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1279/// SmallVectors of const SCEV*.
1280struct UniquifierDenseMapInfo {
1281 static SmallVector<const SCEV *, 4> getEmptyKey() {
1282 SmallVector<const SCEV *, 4> V;
1283 V.push_back(reinterpret_cast<const SCEV *>(-1));
1284 return V;
1285 }
1286
1287 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1288 SmallVector<const SCEV *, 4> V;
1289 V.push_back(reinterpret_cast<const SCEV *>(-2));
1290 return V;
1291 }
1292
1293 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1294 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1295 }
1296
1297 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1298 const SmallVector<const SCEV *, 4> &RHS) {
1299 return LHS == RHS;
1300 }
1301};
1302
1303/// This class holds the state that LSR keeps for each use in IVUsers, as well
1304/// as uses invented by LSR itself. It includes information about what kinds of
1305/// things can be folded into the user, information about the user itself, and
1306/// information about how the use may be satisfied. TODO: Represent multiple
1307/// users of the same expression in common?
1308class LSRUse {
1309 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1310
1311public:
1312 /// An enum for a kind of use, indicating what types of scaled and immediate
1313 /// operands it might support.
1314 enum KindType {
1315 Basic, ///< A normal use, with no folding.
1316 Special, ///< A special case of basic, allowing -1 scales.
1317 Address, ///< An address use; folding according to TargetLowering
1318 ICmpZero ///< An equality icmp with both operands folded into one.
1319 // TODO: Add a generic icmp too?
1320 };
1321
1322 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1323
1324 KindType Kind;
1325 MemAccessTy AccessTy;
1326
1327 /// The list of operands which are to be replaced.
1328 SmallVector<LSRFixup, 8> Fixups;
1329
1330 /// Keep track of the min and max offsets of the fixups.
1331 Immediate MinOffset = Immediate::getFixedMax();
1332 Immediate MaxOffset = Immediate::getFixedMin();
1333
1334 /// This records whether all of the fixups using this LSRUse are outside of
1335 /// the loop, in which case some special-case heuristics may be used.
1336 bool AllFixupsOutsideLoop = true;
1337
1338 /// RigidFormula is set to true to guarantee that this use will be associated
1339 /// with a single formula--the one that initially matched. Some SCEV
1340 /// expressions cannot be expanded. This allows LSR to consider the registers
1341 /// used by those expressions without the need to expand them later after
1342 /// changing the formula.
1343 bool RigidFormula = false;
1344
1345 /// This records the widest use type for any fixup using this
1346 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1347 /// fixup widths to be equivalent, because the narrower one may be relying on
1348 /// the implicit truncation to truncate away bogus bits.
1349 Type *WidestFixupType = nullptr;
1350
1351 /// A list of ways to build a value that can satisfy this user. After the
1352 /// list is populated, one of these is selected heuristically and used to
1353 /// formulate a replacement for OperandValToReplace in UserInst.
1354 SmallVector<Formula, 12> Formulae;
1355
1356 /// The set of register candidates used by all formulae in this LSRUse.
1357 SmallPtrSet<const SCEV *, 4> Regs;
1358
1359 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1360
1361 LSRFixup &getNewFixup() {
1362 Fixups.push_back(LSRFixup());
1363 return Fixups.back();
1364 }
1365
1366 void pushFixup(LSRFixup &f) {
1367 Fixups.push_back(f);
1368 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1369 MaxOffset = f.Offset;
1370 if (Immediate::isKnownLT(f.Offset, MinOffset))
1371 MinOffset = f.Offset;
1372 }
1373
1374 bool HasFormulaWithSameRegs(const Formula &F) const;
1375 float getNotSelectedProbability(const SCEV *Reg) const;
1376 bool InsertFormula(const Formula &F, const Loop &L);
1377 void DeleteFormula(Formula &F);
1378 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1379
1380 void print(raw_ostream &OS) const;
1381 void dump() const;
1382};
1383
1384} // end anonymous namespace
1385
1386static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1387 LSRUse::KindType Kind, MemAccessTy AccessTy,
1388 GlobalValue *BaseGV, Immediate BaseOffset,
1389 bool HasBaseReg, int64_t Scale,
1390 Instruction *Fixup = nullptr);
1391
1392static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1393 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1394 return 1;
1395 if (Depth == 0)
1396 return 0;
1397 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1398 return getSetupCost(S->getStart(), Depth - 1);
1399 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1400 return getSetupCost(S->getOperand(), Depth - 1);
1401 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1402 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1403 [&](unsigned i, const SCEV *Reg) {
1404 return i + getSetupCost(Reg, Depth - 1);
1405 });
1406 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1407 return getSetupCost(S->getLHS(), Depth - 1) +
1408 getSetupCost(S->getRHS(), Depth - 1);
1409 return 0;
1410}
1411
1412/// Tally up interesting quantities from the given register.
1413void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1414 SmallPtrSetImpl<const SCEV *> &Regs) {
1415 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1416 // If this is an addrec for another loop, it should be an invariant
1417 // with respect to L since L is the innermost loop (at least
1418 // for now LSR only handles innermost loops).
1419 if (AR->getLoop() != L) {
1420 // If the AddRec exists, consider its register free and leave it alone.
1421 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1422 return;
1423
1424 // It is bad to allow LSR for current loop to add induction variables
1425 // for its sibling loops.
1426 if (!AR->getLoop()->contains(L)) {
1427 Lose();
1428 return;
1429 }
1430
1431 // Otherwise, it will be an invariant with respect to Loop L.
1432 ++C.NumRegs;
1433 return;
1434 }
1435
1436 unsigned LoopCost = 1;
1437 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1438 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1439
1440 // If the step size matches the base offset, we could use pre-indexed
1441 // addressing.
1442 if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
1443 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1444 if (Step->getAPInt() == F.BaseOffset.getFixedValue())
1445 LoopCost = 0;
1446 } else if (AMK == TTI::AMK_PostIndexed) {
1447 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1448 if (isa<SCEVConstant>(LoopStep)) {
1449 const SCEV *LoopStart = AR->getStart();
1450 if (!isa<SCEVConstant>(LoopStart) &&
1451 SE->isLoopInvariant(LoopStart, L))
1452 LoopCost = 0;
1453 }
1454 }
1455 }
1456 C.AddRecCost += LoopCost;
1457
1458 // Add the step value register, if it needs one.
1459 // TODO: The non-affine case isn't precisely modeled here.
1460 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1461 if (!Regs.count(AR->getOperand(1))) {
1462 RateRegister(F, AR->getOperand(1), Regs);
1463 if (isLoser())
1464 return;
1465 }
1466 }
1467 }
1468 ++C.NumRegs;
1469
1470 // Rough heuristic; favor registers which don't require extra setup
1471 // instructions in the preheader.
1472 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1473 // Ensure we don't, even with the recursion limit, produce invalid costs.
1474 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1475
1476 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1477 SE->hasComputableLoopEvolution(Reg, L);
1478}
1479
1480/// Record this register in the set. If we haven't seen it before, rate
1481/// it. Optional LoserRegs provides a way to declare any formula that refers to
1482/// one of those regs an instant loser.
1483void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1484 SmallPtrSetImpl<const SCEV *> &Regs,
1485 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1486 if (LoserRegs && LoserRegs->count(Reg)) {
1487 Lose();
1488 return;
1489 }
1490 if (Regs.insert(Reg).second) {
1491 RateRegister(F, Reg, Regs);
1492 if (LoserRegs && isLoser())
1493 LoserRegs->insert(Reg);
1494 }
1495}
1496
1497void Cost::RateFormula(const Formula &F,
1499 const DenseSet<const SCEV *> &VisitedRegs,
1500 const LSRUse &LU,
1501 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1502 if (isLoser())
1503 return;
1504 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1505 // Tally up the registers.
1506 unsigned PrevAddRecCost = C.AddRecCost;
1507 unsigned PrevNumRegs = C.NumRegs;
1508 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1509 if (const SCEV *ScaledReg = F.ScaledReg) {
1510 if (VisitedRegs.count(ScaledReg)) {
1511 Lose();
1512 return;
1513 }
1514 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1515 if (isLoser())
1516 return;
1517 }
1518 for (const SCEV *BaseReg : F.BaseRegs) {
1519 if (VisitedRegs.count(BaseReg)) {
1520 Lose();
1521 return;
1522 }
1523 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1524 if (isLoser())
1525 return;
1526 }
1527
1528 // Determine how many (unfolded) adds we'll need inside the loop.
1529 size_t NumBaseParts = F.getNumRegs();
1530 if (NumBaseParts > 1)
1531 // Do not count the base and a possible second register if the target
1532 // allows folding 2 registers.
1533 C.NumBaseAdds +=
1534 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1535 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1536
1537 // Accumulate non-free scaling amounts.
1538 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1539
1540 // Tally up the non-zero immediates.
1541 for (const LSRFixup &Fixup : LU.Fixups) {
1542 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1543 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1544 if (F.BaseGV)
1545 C.ImmCost += 64; // Handle symbolic values conservatively.
1546 // TODO: This should probably be the pointer size.
1547 else if (Offset.isNonZero())
1548 C.ImmCost +=
1549 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1550
1551 // Check with target if this offset with this instruction is
1552 // specifically not supported.
1553 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1554 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1555 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1556 C.NumBaseAdds++;
1557 } else {
1558 // Incompatible immediate type; increase the cost to avoid using it.
1559 C.ImmCost += 2048;
1560 }
1561 }
1562
1563 // If we don't count instruction cost exit here.
1564 if (!InsnsCost) {
1565 assert(isValid() && "invalid cost");
1566 return;
1567 }
1568
1569 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1570 // additional instruction (at least fill).
1571 // TODO: Need to distinguish register classes?
1572 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1573 TTI->getRegisterClassForType(false, F.getType())) - 1;
1574 if (C.NumRegs > TTIRegNum) {
1575 // The cost already exceeded TTIRegNum, so only newly added registers can
1576 // add new instructions.
1577 if (PrevNumRegs > TTIRegNum)
1578 C.Insns += (C.NumRegs - PrevNumRegs);
1579 else
1580 C.Insns += (C.NumRegs - TTIRegNum);
1581 }
1582
1583 // If an ICmpZero formula does not end at 0, it cannot be replaced by just an
1584 // add or sub. We'll need to compare the final result of the AddRec.
1585 // That means we'll need an additional instruction. But if the target can
1586 // macro-fuse a compare with a branch, don't count this extra instruction.
1587 // For -10 + {0, +, 1}:
1588 // i = i + 1;
1589 // cmp i, 10
1590 //
1591 // For {-10, +, 1}:
1592 // i = i + 1;
1593 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1594 !TTI->canMacroFuseCmp())
1595 C.Insns++;
1596 // Each new AddRec adds 1 instruction to calculation.
1597 C.Insns += (C.AddRecCost - PrevAddRecCost);
1598
1599 // BaseAdds adds instructions for unfolded registers.
1600 if (LU.Kind != LSRUse::ICmpZero)
1601 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1602 assert(isValid() && "invalid cost");
1603}
1604
1605/// Set this cost to a losing value.
1606void Cost::Lose() {
1607 C.Insns = std::numeric_limits<unsigned>::max();
1608 C.NumRegs = std::numeric_limits<unsigned>::max();
1609 C.AddRecCost = std::numeric_limits<unsigned>::max();
1610 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1611 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1612 C.ImmCost = std::numeric_limits<unsigned>::max();
1613 C.SetupCost = std::numeric_limits<unsigned>::max();
1614 C.ScaleCost = std::numeric_limits<unsigned>::max();
1615}
1616
1617/// Choose the lower cost.
1618bool Cost::isLess(const Cost &Other) const {
1619 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1620 C.Insns != Other.C.Insns)
1621 return C.Insns < Other.C.Insns;
1622 return TTI->isLSRCostLess(C, Other.C);
1623}
1624
1625#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1626void Cost::print(raw_ostream &OS) const {
1627 if (InsnsCost)
1628 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1629 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1630 if (C.AddRecCost != 0)
1631 OS << ", with addrec cost " << C.AddRecCost;
1632 if (C.NumIVMuls != 0)
1633 OS << ", plus " << C.NumIVMuls << " IV mul"
1634 << (C.NumIVMuls == 1 ? "" : "s");
1635 if (C.NumBaseAdds != 0)
1636 OS << ", plus " << C.NumBaseAdds << " base add"
1637 << (C.NumBaseAdds == 1 ? "" : "s");
1638 if (C.ScaleCost != 0)
1639 OS << ", plus " << C.ScaleCost << " scale cost";
1640 if (C.ImmCost != 0)
1641 OS << ", plus " << C.ImmCost << " imm cost";
1642 if (C.SetupCost != 0)
1643 OS << ", plus " << C.SetupCost << " setup cost";
1644}
1645
1646LLVM_DUMP_METHOD void Cost::dump() const {
1647 print(errs()); errs() << '\n';
1648}
1649#endif
1650
1651/// Test whether this fixup always uses its value outside of the given loop.
1652bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1653 // PHI nodes use their value in their incoming blocks.
1654 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1655 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1656 if (PN->getIncomingValue(i) == OperandValToReplace &&
1657 L->contains(PN->getIncomingBlock(i)))
1658 return false;
1659 return true;
1660 }
1661
1662 return !L->contains(UserInst);
1663}
1664
1665#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1666void LSRFixup::print(raw_ostream &OS) const {
1667 OS << "UserInst=";
1668 // Store is common and interesting enough to be worth special-casing.
1669 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1670 OS << "store ";
1671 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1672 } else if (UserInst->getType()->isVoidTy())
1673 OS << UserInst->getOpcodeName();
1674 else
1675 UserInst->printAsOperand(OS, /*PrintType=*/false);
1676
1677 OS << ", OperandValToReplace=";
1678 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1679
1680 for (const Loop *PIL : PostIncLoops) {
1681 OS << ", PostIncLoop=";
1682 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1683 }
1684
1685 if (Offset.isNonZero())
1686 OS << ", Offset=" << Offset;
1687}
1688
1689LLVM_DUMP_METHOD void LSRFixup::dump() const {
1690 print(errs()); errs() << '\n';
1691}
1692#endif
1693
1694/// Test whether this use has a formula with the same registers as the given
1695/// formula.
1696bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1697  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1698  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1699 // Unstable sort by host order ok, because this is only used for uniquifying.
1700 llvm::sort(Key);
1701 return Uniquifier.count(Key);
1702}
1703
1704/// Return the probability of selecting a formula that does not reference Reg.
1705float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1706 unsigned FNum = 0;
1707 for (const Formula &F : Formulae)
1708 if (F.referencesReg(Reg))
1709 FNum++;
1710 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1711}
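// For illustration (hypothetical numbers): if a use has four formulae and
// exactly one of them references Reg, getNotSelectedProbability(Reg) returns
// (4 - 1) / 4 = 0.75, i.e. three out of four formulae avoid Reg.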
1712
1713/// If the given formula has not yet been inserted, add it to the list, and
1714/// return true. Return false otherwise. The formula must be in canonical form.
1715bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1716 assert(F.isCanonical(L) && "Invalid canonical representation");
1717
1718 if (!Formulae.empty() && RigidFormula)
1719 return false;
1720
1721  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1722  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1723 // Unstable sort by host order ok, because this is only used for uniquifying.
1724 llvm::sort(Key);
1725
1726 if (!Uniquifier.insert(Key).second)
1727 return false;
1728
1729 // Using a register to hold the value of 0 is not profitable.
1730 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1731 "Zero allocated in a scaled register!");
1732#ifndef NDEBUG
1733 for (const SCEV *BaseReg : F.BaseRegs)
1734 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1735#endif
1736
1737 // Add the formula to the list.
1738 Formulae.push_back(F);
1739
1740 // Record registers now being used by this use.
1741 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1742 if (F.ScaledReg)
1743 Regs.insert(F.ScaledReg);
1744
1745 return true;
1746}
1747
1748/// Remove the given formula from this use's list.
1749void LSRUse::DeleteFormula(Formula &F) {
1750 if (&F != &Formulae.back())
1751 std::swap(F, Formulae.back());
1752 Formulae.pop_back();
1753}
1754
1755/// Recompute the Regs field, and update RegUses.
1756void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1757 // Now that we've filtered out some formulae, recompute the Regs set.
1758 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1759 Regs.clear();
1760 for (const Formula &F : Formulae) {
1761 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1762 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1763 }
1764
1765 // Update the RegTracker.
1766 for (const SCEV *S : OldRegs)
1767 if (!Regs.count(S))
1768 RegUses.dropRegister(S, LUIdx);
1769}
1770
1771#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1772void LSRUse::print(raw_ostream &OS) const {
1773 OS << "LSR Use: Kind=";
1774 switch (Kind) {
1775 case Basic: OS << "Basic"; break;
1776 case Special: OS << "Special"; break;
1777 case ICmpZero: OS << "ICmpZero"; break;
1778 case Address:
1779 OS << "Address of ";
1780 if (AccessTy.MemTy->isPointerTy())
1781 OS << "pointer"; // the full pointer type could be really verbose
1782 else {
1783 OS << *AccessTy.MemTy;
1784 }
1785
1786 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1787 }
1788
1789 OS << ", Offsets={";
1790 bool NeedComma = false;
1791 for (const LSRFixup &Fixup : Fixups) {
1792 if (NeedComma) OS << ',';
1793 OS << Fixup.Offset;
1794 NeedComma = true;
1795 }
1796 OS << '}';
1797
1798 if (AllFixupsOutsideLoop)
1799 OS << ", all-fixups-outside-loop";
1800
1801 if (WidestFixupType)
1802 OS << ", widest fixup type: " << *WidestFixupType;
1803}
1804
1805LLVM_DUMP_METHOD void LSRUse::dump() const {
1806 print(errs()); errs() << '\n';
1807}
1808#endif
1809
1810static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1811                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1812 GlobalValue *BaseGV, Immediate BaseOffset,
1813 bool HasBaseReg, int64_t Scale,
1814 Instruction *Fixup /* = nullptr */) {
1815 switch (Kind) {
1816 case LSRUse::Address: {
1817 int64_t FixedOffset =
1818 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1819 int64_t ScalableOffset =
1820 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1821 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1822 HasBaseReg, Scale, AccessTy.AddrSpace,
1823 Fixup, ScalableOffset);
1824 }
1825 case LSRUse::ICmpZero:
1826 // There's not even a target hook for querying whether it would be legal to
1827 // fold a GV into an ICmp.
1828 if (BaseGV)
1829 return false;
1830
1831 // ICmp only has two operands; don't allow more than two non-trivial parts.
1832 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1833 return false;
1834
1835 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1836 // putting the scaled register in the other operand of the icmp.
1837 if (Scale != 0 && Scale != -1)
1838 return false;
1839
1840 // If we have low-level target information, ask the target if it can fold an
1841 // integer immediate on an icmp.
1842 if (BaseOffset.isNonZero()) {
1843 // We don't have an interface to query whether the target supports
1844 // icmpzero against scalable quantities yet.
1845 if (BaseOffset.isScalable())
1846 return false;
1847
1848 // We have one of:
1849 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1850 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1851      // BaseOffset is the ICmp immediate.
1852 if (Scale == 0)
1853 // The cast does the right thing with
1854 // std::numeric_limits<int64_t>::min().
1855 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1856 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1857 }
1858
1859 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1860 return true;
1861
1862 case LSRUse::Basic:
1863 // Only handle single-register values.
1864 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1865
1866 case LSRUse::Special:
1867 // Special case Basic to handle -1 scales.
1868 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1869 }
1870
1871 llvm_unreachable("Invalid LSRUse Kind!");
1872}
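// A small sketch of the Scale == -1 ICmpZero case above (hypothetical IR):
//
//   %t = sub i64 %base, %scaled      ; ICmpZero %base + -1*%scaled
//   %c = icmp eq i64 %t, 0
//
// can be folded to:
//
//   %c = icmp eq i64 %base, %scaled  ; ICmp BaseReg, ScaleReg
//
// which needs no extra instruction and no immediate-legality check.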
1873
1874static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1875                                 Immediate MinOffset, Immediate MaxOffset,
1876 LSRUse::KindType Kind, MemAccessTy AccessTy,
1877 GlobalValue *BaseGV, Immediate BaseOffset,
1878 bool HasBaseReg, int64_t Scale) {
1879 if (BaseOffset.isNonZero() &&
1880 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1881 BaseOffset.isScalable() != MaxOffset.isScalable()))
1882 return false;
1883 // Check for overflow.
1884 int64_t Base = BaseOffset.getKnownMinValue();
1885 int64_t Min = MinOffset.getKnownMinValue();
1886 int64_t Max = MaxOffset.getKnownMinValue();
1887 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1888 return false;
1889 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1890 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1891 return false;
1892 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1893
1894 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1895 HasBaseReg, Scale) &&
1896 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1897 HasBaseReg, Scale);
1898}
1899
1900static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1901                                 Immediate MinOffset, Immediate MaxOffset,
1902 LSRUse::KindType Kind, MemAccessTy AccessTy,
1903 const Formula &F, const Loop &L) {
1904 // For the purpose of isAMCompletelyFolded either having a canonical formula
1905 // or a scale not equal to zero is correct.
1906  // Problems may arise from non-canonical formulae having a scale == 0.
1907  // Strictly speaking it would be best to just rely on canonical formulae.
1908  // However, when we generate the scaled formulae, we first check that the
1909  // scaling factor is profitable before computing the actual ScaledReg for
1910  // compile time's sake.
1911 assert((F.isCanonical(L) || F.Scale != 0));
1912 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1913 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1914}
1915
1916/// Test whether we know how to expand the current formula.
1917static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1918 Immediate MaxOffset, LSRUse::KindType Kind,
1919 MemAccessTy AccessTy, GlobalValue *BaseGV,
1920 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1921 // We know how to expand completely foldable formulae.
1922 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1923 BaseOffset, HasBaseReg, Scale) ||
1924 // Or formulae that use a base register produced by a sum of base
1925 // registers.
1926 (Scale == 1 &&
1927 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1928 BaseGV, BaseOffset, true, 0));
1929}
1930
1931static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1932 Immediate MaxOffset, LSRUse::KindType Kind,
1933 MemAccessTy AccessTy, const Formula &F) {
1934 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1935 F.BaseOffset, F.HasBaseReg, F.Scale);
1936}
1937
1938static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1939                                Immediate Offset) {
1940 if (Offset.isScalable())
1941 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1942
1943 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1944}
1945
1946static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1947                                 const LSRUse &LU, const Formula &F) {
1948 // Target may want to look at the user instructions.
1949 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1950 for (const LSRFixup &Fixup : LU.Fixups)
1951 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1952 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1953 F.Scale, Fixup.UserInst))
1954 return false;
1955 return true;
1956 }
1957
1958 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1959 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1960 F.Scale);
1961}
1962
1963static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1964                                            const LSRUse &LU, const Formula &F,
1965 const Loop &L) {
1966 if (!F.Scale)
1967 return 0;
1968
1969 // If the use is not completely folded in that instruction, we will have to
1970 // pay an extra cost only for scale != 1.
1971 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1972 LU.AccessTy, F, L))
1973 return F.Scale != 1;
1974
1975 switch (LU.Kind) {
1976 case LSRUse::Address: {
1977 // Check the scaling factor cost with both the min and max offsets.
1978 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1979 if (F.BaseOffset.isScalable()) {
1980 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1981 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1982 } else {
1983 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1984 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1985 }
1986 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1987 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1988 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1989 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1990 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1991 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1992
1993 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1994 "Legal addressing mode has an illegal cost!");
1995 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1996 }
1997 case LSRUse::ICmpZero:
1998 case LSRUse::Basic:
1999 case LSRUse::Special:
2000 // The use is completely folded, i.e., everything is folded into the
2001 // instruction.
2002 return 0;
2003 }
2004
2005 llvm_unreachable("Invalid LSRUse Kind!");
2006}
2007
2008static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2009                             LSRUse::KindType Kind, MemAccessTy AccessTy,
2010 GlobalValue *BaseGV, Immediate BaseOffset,
2011 bool HasBaseReg) {
2012 // Fast-path: zero is always foldable.
2013 if (BaseOffset.isZero() && !BaseGV)
2014 return true;
2015
2016 // Conservatively, create an address with an immediate and a
2017 // base and a scale.
2018 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2019
2020 // Canonicalize a scale of 1 to a base register if the formula doesn't
2021 // already have a base register.
2022 if (!HasBaseReg && Scale == 1) {
2023 Scale = 0;
2024 HasBaseReg = true;
2025 }
2026
2027 // FIXME: Try with + without a scale? Maybe based on TTI?
2028 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2029 // default for many architectures, not just AArch64 SVE. More investigation
2030 // needed later to determine if this should be used more widely than just
2031 // on scalable types.
2032 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2033 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2034 Scale = 0;
2035
2036 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2037 HasBaseReg, Scale);
2038}
2039
2040static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2041                             ScalarEvolution &SE, Immediate MinOffset,
2042 Immediate MaxOffset, LSRUse::KindType Kind,
2043 MemAccessTy AccessTy, const SCEV *S,
2044 bool HasBaseReg) {
2045 // Fast-path: zero is always foldable.
2046 if (S->isZero()) return true;
2047
2048 // Conservatively, create an address with an immediate and a
2049 // base and a scale.
2050 Immediate BaseOffset = ExtractImmediate(S, SE);
2051 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2052
2053 // If there's anything else involved, it's not foldable.
2054 if (!S->isZero()) return false;
2055
2056 // Fast-path: zero is always foldable.
2057 if (BaseOffset.isZero() && !BaseGV)
2058 return true;
2059
2060 if (BaseOffset.isScalable())
2061 return false;
2062
2063 // Conservatively, create an address with an immediate and a
2064 // base and a scale.
2065 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2066
2067 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2068 BaseOffset, HasBaseReg, Scale);
2069}
2070
2071namespace {
2072
2073/// An individual increment in a Chain of IV increments. Relate an IV user to
2074/// an expression that computes the IV it uses from the IV used by the previous
2075/// link in the Chain.
2076///
2077/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2078/// original IVOperand. The head of the chain's IVOperand is only valid during
2079/// chain collection, before LSR replaces IV users. During chain generation,
2080/// IncExpr can be used to find the new IVOperand that computes the same
2081/// expression.
2082struct IVInc {
2083 Instruction *UserInst;
2084 Value* IVOperand;
2085 const SCEV *IncExpr;
2086
2087 IVInc(Instruction *U, Value *O, const SCEV *E)
2088 : UserInst(U), IVOperand(O), IncExpr(E) {}
2089};
2090
2091// The list of IV increments in program order. We typically add the head of a
2092// chain without finding subsequent links.
2093struct IVChain {
2094  SmallVector<IVInc, 1> Incs;
2095  const SCEV *ExprBase = nullptr;
2096
2097 IVChain() = default;
2098 IVChain(const IVInc &Head, const SCEV *Base)
2099 : Incs(1, Head), ExprBase(Base) {}
2100
2101  using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2102
2103 // Return the first increment in the chain.
2104 const_iterator begin() const {
2105 assert(!Incs.empty());
2106 return std::next(Incs.begin());
2107 }
2108 const_iterator end() const {
2109 return Incs.end();
2110 }
2111
2112 // Returns true if this chain contains any increments.
2113 bool hasIncs() const { return Incs.size() >= 2; }
2114
2115 // Add an IVInc to the end of this chain.
2116 void add(const IVInc &X) { Incs.push_back(X); }
2117
2118 // Returns the last UserInst in the chain.
2119 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2120
2121 // Returns true if IncExpr can be profitably added to this chain.
2122 bool isProfitableIncrement(const SCEV *OperExpr,
2123 const SCEV *IncExpr,
2124                             ScalarEvolution &SE);
2125};
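// For illustration (hypothetical source loop): accesses such as
//
//   for (i = 0; i < n; ++i) {
//     use(p[i]);     // chain head; IVOperand is the address of p[i]
//     use(p[i + 1]); // IncExpr: one element past the previous link
//     use(p[i + 2]); // IncExpr: one element past the previous link
//   }
//
// form a single IVChain whose links differ only by loop-invariant increments,
// so each address can be computed from the previous one rather than from i.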
2126
2127/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2128/// between FarUsers that definitely cross IV increments and NearUsers that may
2129/// be used between IV increments.
2130struct ChainUsers {
2131  SmallPtrSet<Instruction*, 4> FarUsers;
2132  SmallPtrSet<Instruction*, 4> NearUsers;
2133};
2134
2135/// This class holds state for the main loop strength reduction logic.
2136class LSRInstance {
2137 IVUsers &IU;
2138 ScalarEvolution &SE;
2139 DominatorTree &DT;
2140 LoopInfo &LI;
2141 AssumptionCache &AC;
2142 TargetLibraryInfo &TLI;
2143 const TargetTransformInfo &TTI;
2144 Loop *const L;
2145 MemorySSAUpdater *MSSAU;
2147 mutable SCEVExpander Rewriter;
2148 bool Changed = false;
2149
2150  /// This is the insert position at which the current loop's induction variable
2151  /// increment should be placed. In simple loops, this is the latch block's
2152 /// terminator. But in more complicated cases, this is a position which will
2153 /// dominate all the in-loop post-increment users.
2154 Instruction *IVIncInsertPos = nullptr;
2155
2156 /// Interesting factors between use strides.
2157 ///
2158 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2159 /// default, a SmallDenseSet, because we need to use the full range of
2160 /// int64_ts, and there's currently no good way of doing that with
2161 /// SmallDenseSet.
2162  SmallSetVector<int64_t, 8> Factors;
2163
2164  /// The cost of the current SCEV; the best solution found by LSR will be
2165  /// dropped if it is not profitable.
2166 Cost BaselineCost;
2167
2168 /// Interesting use types, to facilitate truncation reuse.
2169  SmallSetVector<Type *, 4> Types;
2170
2171 /// The list of interesting uses.
2172  SmallVector<LSRUse, 16> Uses;
2173
2174 /// Track which uses use which register candidates.
2175 RegUseTracker RegUses;
2176
2177 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2178 // have more than a few IV increment chains in a loop. Missing a Chain falls
2179 // back to normal LSR behavior for those uses.
2180 static const unsigned MaxChains = 8;
2181
2182 /// IV users can form a chain of IV increments.
2183  SmallVector<IVChain, 8> IVChainVec;
2184
2185 /// IV users that belong to profitable IVChains.
2187
2188 /// Induction variables that were generated and inserted by the SCEV Expander.
2189 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2190
2191  // Inserting instructions in the loop and using them as a PHI's input could
2192  // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
2193  // corresponding incoming block is not loop exiting). So collect all such
2194 // instructions to form LCSSA for them later.
2195 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2196
2197 void OptimizeShadowIV();
2198 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2199 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2200 void OptimizeLoopTermCond();
2201
2202 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2203 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2204 void FinalizeChain(IVChain &Chain);
2205 void CollectChains();
2206 void GenerateIVChain(const IVChain &Chain,
2207                       SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2208
2209 void CollectInterestingTypesAndFactors();
2210 void CollectFixupsAndInitialFormulae();
2211
2212 // Support for sharing of LSRUses between LSRFixups.
2213  using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2214  UseMapTy UseMap;
2215
2216 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2217 LSRUse::KindType Kind, MemAccessTy AccessTy);
2218
2219 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2220 MemAccessTy AccessTy);
2221
2222 void DeleteUse(LSRUse &LU, size_t LUIdx);
2223
2224 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2225
2226 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2227 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2228 void CountRegisters(const Formula &F, size_t LUIdx);
2229 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2230
2231 void CollectLoopInvariantFixupsAndFormulae();
2232
2233 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2234 unsigned Depth = 0);
2235
2236 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2237 const Formula &Base, unsigned Depth,
2238 size_t Idx, bool IsScaledReg = false);
2239 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2240 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2241 const Formula &Base, size_t Idx,
2242 bool IsScaledReg = false);
2243 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2244 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2245 const Formula &Base,
2246 const SmallVectorImpl<Immediate> &Worklist,
2247 size_t Idx, bool IsScaledReg = false);
2248 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2249 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2250 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2251 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2252 void GenerateCrossUseConstantOffsets();
2253 void GenerateAllReuseFormulae();
2254
2255 void FilterOutUndesirableDedicatedRegisters();
2256
2257 size_t EstimateSearchSpaceComplexity() const;
2258 void NarrowSearchSpaceByDetectingSupersets();
2259 void NarrowSearchSpaceByCollapsingUnrolledCode();
2260 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2261 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2262 void NarrowSearchSpaceByFilterPostInc();
2263 void NarrowSearchSpaceByDeletingCostlyFormulas();
2264 void NarrowSearchSpaceByPickingWinnerRegs();
2265 void NarrowSearchSpaceUsingHeuristics();
2266
2267 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2268 Cost &SolutionCost,
2269                    SmallVectorImpl<const Formula *> &Workspace,
2270                    const Cost &CurCost,
2271 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2272 DenseSet<const SCEV *> &VisitedRegs) const;
2273 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2274
2275  BasicBlock::iterator
2276  HoistInsertPosition(BasicBlock::iterator IP,
2277 const SmallVectorImpl<Instruction *> &Inputs) const;
2278 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2279 const LSRFixup &LF,
2280 const LSRUse &LU) const;
2281
2282 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2283                BasicBlock::iterator IP,
2284                SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2285 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2286 const Formula &F,
2287                     SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2288  void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2289               SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2290  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2291
2292public:
2293 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2294              LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2295              TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2296
2297 bool getChanged() const { return Changed; }
2298 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2299 return ScalarEvolutionIVs;
2300 }
2301
2302 void print_factors_and_types(raw_ostream &OS) const;
2303 void print_fixups(raw_ostream &OS) const;
2304 void print_uses(raw_ostream &OS) const;
2305 void print(raw_ostream &OS) const;
2306 void dump() const;
2307};
2308
2309} // end anonymous namespace
2310
2311/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2312/// the cast operation.
2313void LSRInstance::OptimizeShadowIV() {
2314 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2315 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2316 return;
2317
2318 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2319 UI != E; /* empty */) {
2320 IVUsers::const_iterator CandidateUI = UI;
2321 ++UI;
2322 Instruction *ShadowUse = CandidateUI->getUser();
2323 Type *DestTy = nullptr;
2324 bool IsSigned = false;
2325
2326 /* If shadow use is a int->float cast then insert a second IV
2327 to eliminate this cast.
2328
2329 for (unsigned i = 0; i < n; ++i)
2330 foo((double)i);
2331
2332 is transformed into
2333
2334 double d = 0.0;
2335 for (unsigned i = 0; i < n; ++i, ++d)
2336 foo(d);
2337 */
2338 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2339 IsSigned = false;
2340 DestTy = UCast->getDestTy();
2341 }
2342 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2343 IsSigned = true;
2344 DestTy = SCast->getDestTy();
2345 }
2346 if (!DestTy) continue;
2347
2348 // If target does not support DestTy natively then do not apply
2349 // this transformation.
2350 if (!TTI.isTypeLegal(DestTy)) continue;
2351
2352 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2353 if (!PH) continue;
2354 if (PH->getNumIncomingValues() != 2) continue;
2355
2356    // If the calculation in integers overflows, the result in the FP type will
2357    // differ. So we can only do this transformation if we are guaranteed not to
2358    // deal with overflowing values.
2359 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2360 if (!AR) continue;
2361 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2362 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2363
2364 Type *SrcTy = PH->getType();
2365 int Mantissa = DestTy->getFPMantissaWidth();
2366 if (Mantissa == -1) continue;
2367 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2368 continue;
2369
2370 unsigned Entry, Latch;
2371 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2372 Entry = 0;
2373 Latch = 1;
2374 } else {
2375 Entry = 1;
2376 Latch = 0;
2377 }
2378
2379 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2380 if (!Init) continue;
2381 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2382 (double)Init->getSExtValue() :
2383 (double)Init->getZExtValue());
2384
2385 BinaryOperator *Incr =
2386 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2387 if (!Incr) continue;
2388 if (Incr->getOpcode() != Instruction::Add
2389 && Incr->getOpcode() != Instruction::Sub)
2390 continue;
2391
2392 /* Initialize new IV, double d = 0.0 in above example. */
2393 ConstantInt *C = nullptr;
2394 if (Incr->getOperand(0) == PH)
2395 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2396 else if (Incr->getOperand(1) == PH)
2397 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2398 else
2399 continue;
2400
2401 if (!C) continue;
2402
2403 // Ignore negative constants, as the code below doesn't handle them
2404 // correctly. TODO: Remove this restriction.
2405 if (!C->getValue().isStrictlyPositive())
2406 continue;
2407
2408 /* Add new PHINode. */
2409 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2410 NewPH->setDebugLoc(PH->getDebugLoc());
2411
2412 /* create new increment. '++d' in above example. */
2413 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2414    BinaryOperator *NewIncr = BinaryOperator::Create(
2415        Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2416 : Instruction::FSub,
2417 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2418 NewIncr->setDebugLoc(Incr->getDebugLoc());
2419
2420 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2421 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2422
2423 /* Remove cast operation */
2424 ShadowUse->replaceAllUsesWith(NewPH);
2425 ShadowUse->eraseFromParent();
2426 Changed = true;
2427 break;
2428 }
2429}
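// For illustration, the transformation above turns (hypothetical IR):
//
//   %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
//   %d = uitofp i32 %i to double
//
// into a parallel floating-point "shadow" IV:
//
//   %IV.S.      = phi double [ 0.0, %entry ], [ %IV.S.next., %latch ]
//   %IV.S.next. = fadd double %IV.S., 1.0
//
// after which the uitofp is replaced by %IV.S. and erased.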
2430
2431/// If Cond has an operand that is an expression of an IV, set the IV user and
2432/// stride information and return true, otherwise return false.
2433bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2434 for (IVStrideUse &U : IU)
2435 if (U.getUser() == Cond) {
2436 // NOTE: we could handle setcc instructions with multiple uses here, but
2437      // InstCombine does it as well for simple uses, and it's not clear that it
2438      // occurs often enough in real life to be worth handling.
2439 CondUse = &U;
2440 return true;
2441 }
2442 return false;
2443}
2444
2445/// Rewrite the loop's terminating condition if it uses a max computation.
2446///
2447/// This is a narrow solution to a specific, but acute, problem. For loops
2448/// like this:
2449///
2450/// i = 0;
2451/// do {
2452/// p[i] = 0.0;
2453/// } while (++i < n);
2454///
2455/// the trip count isn't just 'n', because 'n' might not be positive. And
2456/// unfortunately this can come up even for loops where the user didn't use
2457/// a C do-while loop. For example, seemingly well-behaved top-test loops
2458/// will commonly be lowered like this:
2459///
2460/// if (n > 0) {
2461/// i = 0;
2462/// do {
2463/// p[i] = 0.0;
2464/// } while (++i < n);
2465/// }
2466///
2467/// and then it's possible for subsequent optimization to obscure the if
2468/// test in such a way that indvars can't find it.
2469///
2470/// When indvars can't find the if test in loops like this, it creates a
2471/// max expression, which allows it to give the loop a canonical
2472/// induction variable:
2473///
2474/// i = 0;
2475/// max = n < 1 ? 1 : n;
2476/// do {
2477/// p[i] = 0.0;
2478/// } while (++i != max);
2479///
2480/// Canonical induction variables are necessary because the loop passes
2481/// are designed around them. The most obvious example of this is the
2482/// LoopInfo analysis, which doesn't remember trip count values. It
2483/// expects to be able to rediscover the trip count each time it is
2484/// needed, and it does this using a simple analysis that only succeeds if
2485/// the loop has a canonical induction variable.
2486///
2487/// However, when it comes time to generate code, the maximum operation
2488/// can be quite costly, especially if it's inside of an outer loop.
2489///
2490/// This function solves this problem by detecting this type of loop and
2491/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2492/// the instructions for the maximum computation.
2493ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2494 // Check that the loop matches the pattern we're looking for.
2495 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2496 Cond->getPredicate() != CmpInst::ICMP_NE)
2497 return Cond;
2498
2499 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2500 if (!Sel || !Sel->hasOneUse()) return Cond;
2501
2502 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2503 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2504 return Cond;
2505 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2506
2507 // Add one to the backedge-taken count to get the trip count.
2508 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2509 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2510
2511 // Check for a max calculation that matches the pattern. There's no check
2512 // for ICMP_ULE here because the comparison would be with zero, which
2513 // isn't interesting.
2514 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2515 const SCEVNAryExpr *Max = nullptr;
2516 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2517 Pred = ICmpInst::ICMP_SLE;
2518 Max = S;
2519 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2520 Pred = ICmpInst::ICMP_SLT;
2521 Max = S;
2522 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2523 Pred = ICmpInst::ICMP_ULT;
2524 Max = U;
2525 } else {
2526 // No match; bail.
2527 return Cond;
2528 }
2529
2530 // To handle a max with more than two operands, this optimization would
2531 // require additional checking and setup.
2532 if (Max->getNumOperands() != 2)
2533 return Cond;
2534
2535 const SCEV *MaxLHS = Max->getOperand(0);
2536 const SCEV *MaxRHS = Max->getOperand(1);
2537
2538 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2539 // for a comparison with 1. For <= and >=, a comparison with zero.
2540 if (!MaxLHS ||
2541 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2542 return Cond;
2543
2544 // Check the relevant induction variable for conformance to
2545 // the pattern.
2546 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2547 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2548 if (!AR || !AR->isAffine() ||
2549 AR->getStart() != One ||
2550 AR->getStepRecurrence(SE) != One)
2551 return Cond;
2552
2553 assert(AR->getLoop() == L &&
2554 "Loop condition operand is an addrec in a different loop!");
2555
2556 // Check the right operand of the select, and remember it, as it will
2557 // be used in the new comparison instruction.
2558 Value *NewRHS = nullptr;
2559 if (ICmpInst::isTrueWhenEqual(Pred)) {
2560 // Look for n+1, and grab n.
2561 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2562 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2563 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2564 NewRHS = BO->getOperand(0);
2565 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2566 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2567 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2568 NewRHS = BO->getOperand(0);
2569 if (!NewRHS)
2570 return Cond;
2571 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2572 NewRHS = Sel->getOperand(1);
2573 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2574 NewRHS = Sel->getOperand(2);
2575 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2576 NewRHS = SU->getValue();
2577 else
2578 // Max doesn't match expected pattern.
2579 return Cond;
2580
2581 // Determine the new comparison opcode. It may be signed or unsigned,
2582 // and the original comparison may be either equality or inequality.
2583 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2584 Pred = CmpInst::getInversePredicate(Pred);
2585
2586 // Ok, everything looks ok to change the condition into an SLT or SGE and
2587 // delete the max calculation.
2588 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2589 Cond->getOperand(0), NewRHS, "scmp");
2590
2591 // Delete the max calculation instructions.
2592 NewCond->setDebugLoc(Cond->getDebugLoc());
2593 Cond->replaceAllUsesWith(NewCond);
2594 CondUse->setUser(NewCond);
2595 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2596 Cond->eraseFromParent();
2597 Sel->eraseFromParent();
2598 if (Cmp->use_empty())
2599 Cmp->eraseFromParent();
2600 return NewCond;
2601}
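// For illustration, with %max = select (icmp slt i64 %n, 1), 1, %n the
// rewrite above replaces (hypothetical IR):
//
//   %c = icmp ne i64 %i.next, %max
// with
//   %scmp = icmp slt i64 %i.next, %n
//
// and then erases the now-dead select and its feeding icmp.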
2602
2603/// Change loop terminating condition to use the postinc iv when possible.
2604void
2605LSRInstance::OptimizeLoopTermCond() {
2606  SmallPtrSet<Instruction *, 4> PostIncs;
2607
2608 // We need a different set of heuristics for rotated and non-rotated loops.
2609 // If a loop is rotated then the latch is also the backedge, so inserting
2610 // post-inc expressions just before the latch is ideal. To reduce live ranges
2611 // it also makes sense to rewrite terminating conditions to use post-inc
2612 // expressions.
2613 //
2614  // If the loop is not rotated then the latch is not an exiting block; the
2615  // exit check is done in the loop head. Adding post-inc expressions before the
2616 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2617 // in the loop body. In this case we do *not* want to use post-inc expressions
2618 // in the latch check, and we want to insert post-inc expressions before
2619 // the backedge.
2620 BasicBlock *LatchBlock = L->getLoopLatch();
2621 SmallVector<BasicBlock*, 8> ExitingBlocks;
2622 L->getExitingBlocks(ExitingBlocks);
2623 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2624 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2625 IVIncInsertPos = LatchBlock->getTerminator();
2626 return;
2627 }
2628
2629 // Otherwise treat this as a rotated loop.
2630 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2631 // Get the terminating condition for the loop if possible. If we
2632 // can, we want to change it to use a post-incremented version of its
2633 // induction variable, to allow coalescing the live ranges for the IV into
2634 // one register value.
2635
2636 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2637 if (!TermBr)
2638 continue;
2639 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2640 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2641 continue;
2642
2643 // Search IVUsesByStride to find Cond's IVUse if there is one.
2644 IVStrideUse *CondUse = nullptr;
2645 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2646 if (!FindIVUserForCond(Cond, CondUse))
2647 continue;
2648
2649 // If the trip count is computed in terms of a max (due to ScalarEvolution
2650 // being unable to find a sufficient guard, for example), change the loop
2651 // comparison to use SLT or ULT instead of NE.
2652 // One consequence of doing this now is that it disrupts the count-down
2653 // optimization. That's not always a bad thing though, because in such
2654 // cases it may still be worthwhile to avoid a max.
2655 Cond = OptimizeMax(Cond, CondUse);
2656
2657 // If this exiting block dominates the latch block, it may also use
2658 // the post-inc value if it won't be shared with other uses.
2659 // Check for dominance.
2660 if (!DT.dominates(ExitingBlock, LatchBlock))
2661 continue;
2662
2663 // Conservatively avoid trying to use the post-inc value in non-latch
2664 // exits if there may be pre-inc users in intervening blocks.
2665 if (LatchBlock != ExitingBlock)
2666 for (const IVStrideUse &UI : IU)
2667 // Test if the use is reachable from the exiting block. This dominator
2668 // query is a conservative approximation of reachability.
2669 if (&UI != CondUse &&
2670 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2671 // Conservatively assume there may be reuse if the quotient of their
2672 // strides could be a legal scale.
2673 const SCEV *A = IU.getStride(*CondUse, L);
2674 const SCEV *B = IU.getStride(UI, L);
2675 if (!A || !B) continue;
2676 if (SE.getTypeSizeInBits(A->getType()) !=
2677 SE.getTypeSizeInBits(B->getType())) {
2678 if (SE.getTypeSizeInBits(A->getType()) >
2679 SE.getTypeSizeInBits(B->getType()))
2680 B = SE.getSignExtendExpr(B, A->getType());
2681 else
2682 A = SE.getSignExtendExpr(A, B->getType());
2683 }
2684 if (const SCEVConstant *D =
2685 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2686 const ConstantInt *C = D->getValue();
2687 // Stride of one or negative one can have reuse with non-addresses.
2688 if (C->isOne() || C->isMinusOne())
2689 goto decline_post_inc;
2690 // Avoid weird situations.
2691 if (C->getValue().getSignificantBits() >= 64 ||
2692 C->getValue().isMinSignedValue())
2693 goto decline_post_inc;
2694 // Check for possible scaled-address reuse.
2695 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2696 MemAccessTy AccessTy =
2697 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2698 int64_t Scale = C->getSExtValue();
2699 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2700 /*BaseOffset=*/0,
2701 /*HasBaseReg=*/true, Scale,
2702 AccessTy.AddrSpace))
2703 goto decline_post_inc;
2704 Scale = -Scale;
2705 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2706 /*BaseOffset=*/0,
2707 /*HasBaseReg=*/true, Scale,
2708 AccessTy.AddrSpace))
2709 goto decline_post_inc;
2710 }
2711 }
2712 }
2713
2714 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2715 << *Cond << '\n');
2716
2717 // It's possible for the setcc instruction to be anywhere in the loop, and
2718 // possible for it to have multiple users. If it is not immediately before
2719 // the exiting block branch, move it.
2720 if (Cond->getNextNonDebugInstruction() != TermBr) {
2721 if (Cond->hasOneUse()) {
2722 Cond->moveBefore(TermBr);
2723 } else {
2724        // Clone the terminating condition and insert it before the exiting branch.
2725 ICmpInst *OldCond = Cond;
2726 Cond = cast<ICmpInst>(Cond->clone());
2727 Cond->setName(L->getHeader()->getName() + ".termcond");
2728 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2729
2730 // Clone the IVUse, as the old use still exists!
2731 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2732 TermBr->replaceUsesOfWith(OldCond, Cond);
2733 }
2734 }
2735
2736 // If we get to here, we know that we can transform the setcc instruction to
2737 // use the post-incremented version of the IV, allowing us to coalesce the
2738 // live ranges for the IV correctly.
2739 CondUse->transformToPostInc(L);
2740 Changed = true;
2741
2742 PostIncs.insert(Cond);
2743 decline_post_inc:;
2744 }
2745
2746 // Determine an insertion point for the loop induction variable increment. It
2747 // must dominate all the post-inc comparisons we just set up, and it must
2748 // dominate the loop latch edge.
2749 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2750 for (Instruction *Inst : PostIncs)
2751 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2752}
2753
2754/// Determine if the given use can accommodate a fixup at the given offset and
2755/// other details. If so, update the use and return true.
2756bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2757 bool HasBaseReg, LSRUse::KindType Kind,
2758 MemAccessTy AccessTy) {
2759 Immediate NewMinOffset = LU.MinOffset;
2760 Immediate NewMaxOffset = LU.MaxOffset;
2761 MemAccessTy NewAccessTy = AccessTy;
2762
2763 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2764 // something conservative, however this can pessimize in the case that one of
2765 // the uses will have all its uses outside the loop, for example.
2766 if (LU.Kind != Kind)
2767 return false;
2768
2769 // Check for a mismatched access type, and fall back conservatively as needed.
2770 // TODO: Be less conservative when the type is similar and can use the same
2771 // addressing modes.
2772 if (Kind == LSRUse::Address) {
2773 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2774 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2775 AccessTy.AddrSpace);
2776 }
2777 }
2778
2779 // Conservatively assume HasBaseReg is true for now.
2780 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2781 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2782 LU.MaxOffset - NewOffset, HasBaseReg))
2783 return false;
2784 NewMinOffset = NewOffset;
2785 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2786 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2787 NewOffset - LU.MinOffset, HasBaseReg))
2788 return false;
2789 NewMaxOffset = NewOffset;
2790 }
2791
2792 // FIXME: We should be able to handle some level of scalable offset support
2793 // for 'void', but in order to get basic support up and running this is
2794 // being left out.
2795 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2796 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2797 return false;
2798
2799 // Update the use.
2800 LU.MinOffset = NewMinOffset;
2801 LU.MaxOffset = NewMaxOffset;
2802 LU.AccessTy = NewAccessTy;
2803 return true;
2804}
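// For illustration (hypothetical offsets): if an address use currently covers
// fixed offsets [0, 8] and a new fixup arrives at offset -4, the use is only
// widened to [-4, 8] when an immediate of LU.MaxOffset - NewOffset = 12 is
// still always foldable for this access type; otherwise the caller ends up
// creating a separate LSRUse for the new fixup.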
2805
2806/// Return an LSRUse index and an offset value for a fixup which needs the given
2807/// expression, with the given kind and optional access type. Either reuse an
2808/// existing use or create a new one, as needed.
2809std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2810 LSRUse::KindType Kind,
2811 MemAccessTy AccessTy) {
2812 const SCEV *Copy = Expr;
2813 Immediate Offset = ExtractImmediate(Expr, SE);
2814
2815 // Basic uses can't accept any offset, for example.
2816 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2817 Offset, /*HasBaseReg=*/ true)) {
2818 Expr = Copy;
2819 Offset = Immediate::getFixed(0);
2820 }
2821
2822 std::pair<UseMapTy::iterator, bool> P =
2823 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2824 if (!P.second) {
2825 // A use already existed with this base.
2826 size_t LUIdx = P.first->second;
2827 LSRUse &LU = Uses[LUIdx];
2828 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2829 // Reuse this use.
2830 return std::make_pair(LUIdx, Offset);
2831 }
2832
2833 // Create a new use.
2834 size_t LUIdx = Uses.size();
2835 P.first->second = LUIdx;
2836 Uses.push_back(LSRUse(Kind, AccessTy));
2837 LSRUse &LU = Uses[LUIdx];
2838
2839 LU.MinOffset = Offset;
2840 LU.MaxOffset = Offset;
2841 return std::make_pair(LUIdx, Offset);
2842}
2843
2844/// Delete the given use from the Uses list.
2845void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2846 if (&LU != &Uses.back())
2847 std::swap(LU, Uses.back());
2848 Uses.pop_back();
2849
2850 // Update RegUses.
2851 RegUses.swapAndDropUse(LUIdx, Uses.size());
2852}
2853
2854/// Look for a use distinct from OrigLU which has a formula with the same
2855/// registers as the given formula.
2856LSRUse *
2857LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2858 const LSRUse &OrigLU) {
2859 // Search all uses for the formula. This could be more clever.
2860 for (LSRUse &LU : Uses) {
2861 // Check whether this use is close enough to OrigLU, to see whether it's
2862 // worthwhile looking through its formulae.
2863 // Ignore ICmpZero uses because they may contain formulae generated by
2864 // GenerateICmpZeroScales, in which case adding fixup offsets may
2865 // be invalid.
2866 if (&LU != &OrigLU &&
2867 LU.Kind != LSRUse::ICmpZero &&
2868 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2869 LU.WidestFixupType == OrigLU.WidestFixupType &&
2870 LU.HasFormulaWithSameRegs(OrigF)) {
2871 // Scan through this use's formulae.
2872 for (const Formula &F : LU.Formulae) {
2873 // Check to see if this formula has the same registers and symbols
2874 // as OrigF.
2875 if (F.BaseRegs == OrigF.BaseRegs &&
2876 F.ScaledReg == OrigF.ScaledReg &&
2877 F.BaseGV == OrigF.BaseGV &&
2878 F.Scale == OrigF.Scale &&
2879 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2880 if (F.BaseOffset.isZero())
2881 return &LU;
2882 // This is the formula where all the registers and symbols matched;
2883 // there aren't going to be any others. Since we declined it, we
2884 // can skip the rest of the formulae and proceed to the next LSRUse.
2885 break;
2886 }
2887 }
2888 }
2889 }
2890
2891 // Nothing looked good.
2892 return nullptr;
2893}
2894
2895void LSRInstance::CollectInterestingTypesAndFactors() {
2896  SmallSetVector<const SCEV *, 4> Strides;
2897
2898 // Collect interesting types and strides.
2899  SmallVector<const SCEV *, 4> Worklist;
2900  for (const IVStrideUse &U : IU) {
2901 const SCEV *Expr = IU.getExpr(U);
2902 if (!Expr)
2903 continue;
2904
2905 // Collect interesting types.
2906 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2907
2908 // Add strides for mentioned loops.
2909 Worklist.push_back(Expr);
2910 do {
2911 const SCEV *S = Worklist.pop_back_val();
2912 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2913 if (AR->getLoop() == L)
2914 Strides.insert(AR->getStepRecurrence(SE));
2915 Worklist.push_back(AR->getStart());
2916 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2917 append_range(Worklist, Add->operands());
2918 }
2919 } while (!Worklist.empty());
2920 }
2921
2922 // Compute interesting factors from the set of interesting strides.
2923  for (SmallSetVector<const SCEV *, 4>::const_iterator
2924       I = Strides.begin(), E = Strides.end(); I != E; ++I)
2925    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2926         std::next(I); NewStrideIter != E; ++NewStrideIter) {
2927 const SCEV *OldStride = *I;
2928 const SCEV *NewStride = *NewStrideIter;
2929
2930 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2931 SE.getTypeSizeInBits(NewStride->getType())) {
2932 if (SE.getTypeSizeInBits(OldStride->getType()) >
2933 SE.getTypeSizeInBits(NewStride->getType()))
2934 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2935 else
2936 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2937 }
2938 if (const SCEVConstant *Factor =
2939 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2940 SE, true))) {
2941 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2942 Factors.insert(Factor->getAPInt().getSExtValue());
2943 } else if (const SCEVConstant *Factor =
2944 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2945 NewStride,
2946 SE, true))) {
2947 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2948 Factors.insert(Factor->getAPInt().getSExtValue());
2949 }
2950 }
2951
2952 // If all uses use the same type, don't bother looking for truncation-based
2953 // reuse.
2954 if (Types.size() == 1)
2955 Types.clear();
2956
2957 LLVM_DEBUG(print_factors_and_types(dbgs()));
2958}
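// For illustration (hypothetical strides): if the loop contains addrecs with
// strides 4 and 8, getExactSDiv succeeds in one direction and Factors ends up
// containing 2, so later formula generation may consider covering the
// stride-8 uses with the stride-4 IV scaled by 2.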
2959
2960/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2961/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2962/// IVStrideUses, we could partially skip this.
2963static User::op_iterator
2964findIVOperand(User::op_iterator OI, User::op_iterator OE,
2965              Loop *L, ScalarEvolution &SE) {
2966 for(; OI != OE; ++OI) {
2967 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2968 if (!SE.isSCEVable(Oper->getType()))
2969 continue;
2970
2971 if (const SCEVAddRecExpr *AR =
2972 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2973 if (AR->getLoop() == L)
2974 break;
2975 }
2976 }
2977 }
2978 return OI;
2979}
2980
2981/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2982/// a convenient helper.
2983static Value *getWideOperand(Value *Oper) {
2984  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2985 return Trunc->getOperand(0);
2986 return Oper;
2987}
2988
2989/// Return an approximation of this SCEV expression's "base", or NULL for any
2990/// constant. Returning the expression itself is conservative. Returning a
2991/// deeper subexpression is more precise and valid as long as it isn't less
2992/// complex than another subexpression. For expressions involving multiple
2993/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2994/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2995/// IVInc==b-a.
2996///
2997/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2998/// SCEVUnknown, we simply return the rightmost SCEV operand.
2999static const SCEV *getExprBase(const SCEV *S) {
3000 switch (S->getSCEVType()) {
3001 default: // including scUnknown.
3002 return S;
3003 case scConstant:
3004 case scVScale:
3005 return nullptr;
3006 case scTruncate:
3007 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3008 case scZeroExtend:
3009 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3010 case scSignExtend:
3011 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3012 case scAddExpr: {
3013 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3014 // there's nothing more complex.
3015 // FIXME: not sure if we want to recognize negation.
3016 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3017 for (const SCEV *SubExpr : reverse(Add->operands())) {
3018 if (SubExpr->getSCEVType() == scAddExpr)
3019 return getExprBase(SubExpr);
3020
3021 if (SubExpr->getSCEVType() != scMulExpr)
3022 return SubExpr;
3023 }
3024 return S; // all operands are scaled, be conservative.
3025 }
3026 case scAddRecExpr:
3027 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3028 }
3029 llvm_unreachable("Unknown SCEV kind!");
3030}
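// For illustration: for an addrec like {(8 + %p),+,4}, getExprBase follows the
// start value (8 + %p); SCEV orders the add's operands with the constant first
// and the pointer-typed SCEVUnknown last, so %p is returned as the base. Two
// accesses based on the same %p can therefore share a chain, while an access
// based on a different object cannot.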
3031
3032/// Return true if the chain increment is profitable to expand into a loop
3033/// invariant value, which may require its own register. A profitable chain
3034/// increment will be an offset relative to the same base. We allow such offsets
3035/// to potentially be used as a chain increment as long as it's not obviously
3036/// expensive to expand using real instructions.
3037bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3038 const SCEV *IncExpr,
3039 ScalarEvolution &SE) {
3040 // Aggressively form chains when -stress-ivchain.
3041 if (StressIVChain)
3042 return true;
3043
3044 // Do not replace a constant offset from IV head with a nonconstant IV
3045 // increment.
3046 if (!isa<SCEVConstant>(IncExpr)) {
3047 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3048 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3049 return false;
3050 }
3051
3052  SmallPtrSet<const SCEV*, 8> Processed;
3053  return !isHighCostExpansion(IncExpr, Processed, SE);
3054}
3055
3056/// Return true if the number of registers needed for the chain is estimated to
3057/// be less than the number required for the individual IV users. First prohibit
3058/// any IV users that keep the IV live across increments (the Users set should
3059/// be empty). Next count the number and type of increments in the chain.
3060///
3061/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3062/// effectively use postinc addressing modes. Only consider it profitable if the
3063/// increments can be computed in fewer registers when chained.
3064///
3065/// TODO: Consider IVInc free if it's already used in other chains.
3066static bool isProfitableChain(IVChain &Chain,
3067                              SmallPtrSetImpl<Instruction *> &Users,
3068                              ScalarEvolution &SE,
3069 const TargetTransformInfo &TTI) {
3070 if (StressIVChain)
3071 return true;
3072
3073 if (!Chain.hasIncs())
3074 return false;
3075
3076 if (!Users.empty()) {
3077 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3078 for (Instruction *Inst
3079 : Users) { dbgs() << " " << *Inst << "\n"; });
3080 return false;
3081 }
3082 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3083
3084  // The chain itself may require a register, so initialize cost to 1.
3085 int cost = 1;
3086
3087 // A complete chain likely eliminates the need for keeping the original IV in
3088 // a register. LSR does not currently know how to form a complete chain unless
3089 // the header phi already exists.
3090 if (isa<PHINode>(Chain.tailUserInst())
3091 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3092 --cost;
3093 }
3094 const SCEV *LastIncExpr = nullptr;
3095 unsigned NumConstIncrements = 0;
3096 unsigned NumVarIncrements = 0;
3097 unsigned NumReusedIncrements = 0;
3098
3099 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3100 return true;
3101
3102 for (const IVInc &Inc : Chain) {
3103 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3104 return true;
3105 if (Inc.IncExpr->isZero())
3106 continue;
3107
3108 // Incrementing by zero or some constant is neutral. We assume constants can
3109 // be folded into an addressing mode or an add's immediate operand.
3110 if (isa<SCEVConstant>(Inc.IncExpr)) {
3111 ++NumConstIncrements;
3112 continue;
3113 }
3114
3115 if (Inc.IncExpr == LastIncExpr)
3116 ++NumReusedIncrements;
3117 else
3118 ++NumVarIncrements;
3119
3120 LastIncExpr = Inc.IncExpr;
3121 }
3122 // An IV chain with a single increment is handled by LSR's postinc
3123 // uses. However, a chain with multiple increments requires keeping the IV's
3124 // value live longer than it needs to be if chained.
3125 if (NumConstIncrements > 1)
3126 --cost;
3127
3128 // Materializing increment expressions in the preheader that didn't exist in
3129 // the original code may cost a register. For example, sign-extended array
3130 // indices can produce ridiculous increments like this:
3131 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3132 cost += NumVarIncrements;
3133
3134 // Reusing variable increments likely saves a register to hold the multiple of
3135 // the stride.
3136 cost -= NumReusedIncrements;
3137
3138 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3139 << "\n");
3140
3141 return cost < 0;
3142}
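// Worked example of the cost model above (hypothetical chain): a chain whose
// tail is the header phi (so its SCEV matches the head's IncExpr) and which
// has three constant increments starts at cost = 1, subtracts 1 for the phi
// and 1 because NumConstIncrements > 1, adds nothing for variable or reused
// increments, and ends at -1 < 0, so the chain is considered profitable.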
3143
3144/// Add this IV user to an existing chain or make it the head of a new chain.
3145void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3146 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3147 // When IVs are used as types of varying widths, they are generally converted
3148 // to a wider type with some uses remaining narrow under a (free) trunc.
3149 Value *const NextIV = getWideOperand(IVOper);
3150 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3151 const SCEV *const OperExprBase = getExprBase(OperExpr);
3152
3153 // Visit all existing chains. Check if its IVOper can be computed as a
3154 // profitable loop invariant increment from the last link in the Chain.
3155 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3156 const SCEV *LastIncExpr = nullptr;
3157 for (; ChainIdx < NChains; ++ChainIdx) {
3158 IVChain &Chain = IVChainVec[ChainIdx];
3159
3160 // Prune the solution space aggressively by checking that both IV operands
3161 // are expressions that operate on the same unscaled SCEVUnknown. This
3162 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3163 // first avoids creating extra SCEV expressions.
3164 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3165 continue;
3166
3167 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3168 if (PrevIV->getType() != NextIV->getType())
3169 continue;
3170
3171 // A phi node terminates a chain.
3172 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3173 continue;
3174
3175 // The increment must be loop-invariant so it can be kept in a register.
3176 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3177 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3178 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3179 continue;
3180
3181 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3182 LastIncExpr = IncExpr;
3183 break;
3184 }
3185 }
3186 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3187 // bother for phi nodes, because they must be last in the chain.
3188 if (ChainIdx == NChains) {
3189 if (isa<PHINode>(UserInst))
3190 return;
3191 if (NChains >= MaxChains && !StressIVChain) {
3192 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3193 return;
3194 }
3195 LastIncExpr = OperExpr;
3196 // IVUsers may have skipped over sign/zero extensions. We don't currently
3197 // attempt to form chains involving extensions unless they can be hoisted
3198 // into this loop's AddRec.
3199 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3200 return;
3201 ++NChains;
3202 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3203 OperExprBase));
3204 ChainUsersVec.resize(NChains);
3205 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3206 << ") IV=" << *LastIncExpr << "\n");
3207 } else {
3208 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3209 << ") IV+" << *LastIncExpr << "\n");
3210 // Add this IV user to the end of the chain.
3211 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3212 }
3213 IVChain &Chain = IVChainVec[ChainIdx];
3214
3215 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3216 // This chain's NearUsers become FarUsers.
3217 if (!LastIncExpr->isZero()) {
3218 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3219 NearUsers.end());
3220 NearUsers.clear();
3221 }
3222
3223 // All other uses of IVOperand become near uses of the chain.
3224 // We currently ignore intermediate values within SCEV expressions, assuming
3225  // they will eventually be used by the current chain, or can be computed
3226 // from one of the chain increments. To be more precise we could
3227 // transitively follow its user and only add leaf IV users to the set.
3228 for (User *U : IVOper->users()) {
3229 Instruction *OtherUse = dyn_cast<Instruction>(U);
3230 if (!OtherUse)
3231 continue;
3232 // Uses in the chain will no longer be uses if the chain is formed.
3233 // Include the head of the chain in this iteration (not Chain.begin()).
3234 IVChain::const_iterator IncIter = Chain.Incs.begin();
3235 IVChain::const_iterator IncEnd = Chain.Incs.end();
3236 for( ; IncIter != IncEnd; ++IncIter) {
3237 if (IncIter->UserInst == OtherUse)
3238 break;
3239 }
3240 if (IncIter != IncEnd)
3241 continue;
3242
3243 if (SE.isSCEVable(OtherUse->getType())
3244 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3245 && IU.isIVUserOrOperand(OtherUse)) {
3246 continue;
3247 }
3248 NearUsers.insert(OtherUse);
3249 }
3250
3251 // Since this user is part of the chain, it's no longer considered a use
3252 // of the chain.
3253 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3254}
3255
3256/// Populate the vector of Chains.
3257///
3258/// This decreases ILP at the architecture level. Targets with ample registers,
3259/// multiple memory ports, and no register renaming probably don't want
3260/// this. However, such targets should probably disable LSR altogether.
3261///
3262/// The job of LSR is to make a reasonable choice of induction variables across
3263/// the loop. Subsequent passes can easily "unchain" computation exposing more
3264/// ILP *within the loop* if the target wants it.
3265///
3266/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3267/// will not reorder memory operations, it will recognize this as a chain, but
3268/// will generate redundant IV increments. Ideally this would be corrected later
3269/// by a smart scheduler:
3270/// = A[i]
3271/// = A[i+x]
3272/// A[i] =
3273/// A[i+x] =
3274///
3275/// TODO: Walk the entire domtree within this loop, not just the path to the
3276/// loop latch. This will discover chains on side paths, but requires
3277/// maintaining multiple copies of the Chains state.
3278void LSRInstance::CollectChains() {
3279 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3280 SmallVector<ChainUsers, 8> ChainUsersVec;
3281
3282  SmallVector<BasicBlock *, 8> LatchPath;
3283  BasicBlock *LoopHeader = L->getHeader();
3284 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3285 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3286 LatchPath.push_back(Rung->getBlock());
3287 }
3288 LatchPath.push_back(LoopHeader);
3289
3290 // Walk the instruction stream from the loop header to the loop latch.
3291 for (BasicBlock *BB : reverse(LatchPath)) {
3292 for (Instruction &I : *BB) {
3293 // Skip instructions that weren't seen by IVUsers analysis.
3294 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3295 continue;
3296
3297 // Ignore users that are part of a SCEV expression. This way we only
3298 // consider leaf IV Users. This effectively rediscovers a portion of
3299 // IVUsers analysis but in program order this time.
3300 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3301 continue;
3302
3303 // Remove this instruction from any NearUsers set it may be in.
3304 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3305 ChainIdx < NChains; ++ChainIdx) {
3306 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3307 }
3308 // Search for operands that can be chained.
3309 SmallPtrSet<Instruction*, 4> UniqueOperands;
3310 User::op_iterator IVOpEnd = I.op_end();
3311 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3312 while (IVOpIter != IVOpEnd) {
3313 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3314 if (UniqueOperands.insert(IVOpInst).second)
3315 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3316 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3317 }
3318 } // Continue walking down the instructions.
3319 } // Continue walking down the domtree.
3320 // Visit phi backedges to determine if the chain can generate the IV postinc.
3321 for (PHINode &PN : L->getHeader()->phis()) {
3322 if (!SE.isSCEVable(PN.getType()))
3323 continue;
3324
3325 Instruction *IncV =
3326 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3327 if (IncV)
3328 ChainInstruction(&PN, IncV, ChainUsersVec);
3329 }
3330 // Remove any unprofitable chains.
3331 unsigned ChainIdx = 0;
3332 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3333 UsersIdx < NChains; ++UsersIdx) {
3334 if (!isProfitableChain(IVChainVec[UsersIdx],
3335 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3336 continue;
3337    // Preserve the chain at UsersIdx.
3338 if (ChainIdx != UsersIdx)
3339 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3340 FinalizeChain(IVChainVec[ChainIdx]);
3341 ++ChainIdx;
3342 }
3343 IVChainVec.resize(ChainIdx);
3344}
3345
3346void LSRInstance::FinalizeChain(IVChain &Chain) {
3347 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3348 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3349
3350 for (const IVInc &Inc : Chain) {
3351 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3352 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3353 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3354 IVIncSet.insert(UseI);
3355 }
3356}
3357
3358/// Return true if the IVInc can be folded into an addressing mode.
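/// For illustration: a constant increment such as +4 can typically be folded
/// into the user's addressing mode as a [reg + 4] style immediate, and an
/// increment of the form (C * vscale) can become a scalable offset on targets
/// that accept one; any other increment shape is rejected here and is instead
/// materialized as a separate add when the chain is generated.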
3359static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3360 Value *Operand, const TargetTransformInfo &TTI) {
3361 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3362 Immediate IncOffset = Immediate::getZero();
3363 if (IncConst) {
3364 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3365 return false;
3366 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3367 } else {
3368 // Look for mul(vscale, constant), to detect a scalable offset.
3369 auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3370 if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3371 !isa<SCEVVScale>(IncVScale->getOperand(1)))
3372 return false;
3373 auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3374 if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3375 return false;
3376 IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
3377 }
3378
3379 if (!isAddressUse(TTI, UserInst, Operand))
3380 return false;
3381
3382 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3383 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3384 IncOffset, /*HasBaseReg=*/false))
3385 return false;
3386
3387 return true;
3388}
3389
3390/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3391/// user's operand from the previous IV user's operand.
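/// Informal sketch: for a chain whose users address %i, %i + 4 and %i + 8,
/// this rewrites each user's operand as the previous chained value plus a
/// small increment (or folds the increment into the user's address when
/// canFoldIVIncExpr allows it), instead of keeping an independent {X,+,S}
/// expression live for every user.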
3392void LSRInstance::GenerateIVChain(const IVChain &Chain,
3393                                  SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3394 // Find the new IVOperand for the head of the chain. It may have been replaced
3395 // by LSR.
3396 const IVInc &Head = Chain.Incs[0];
3397 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3398 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3399 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3400 IVOpEnd, L, SE);
3401 Value *IVSrc = nullptr;
3402 while (IVOpIter != IVOpEnd) {
3403 IVSrc = getWideOperand(*IVOpIter);
3404
3405 // If this operand computes the expression that the chain needs, we may use
3406 // it. (Check this after setting IVSrc which is used below.)
3407 //
3408 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3409 // narrow for the chain, so we can no longer use it. We do allow using a
3410 // wider phi, assuming the LSR checked for free truncation. In that case we
3411 // should already have a truncate on this operand such that
3412 // getSCEV(IVSrc) == IncExpr.
3413 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3414 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3415 break;
3416 }
3417 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3418 }
3419 if (IVOpIter == IVOpEnd) {
3420 // Gracefully give up on this chain.
3421 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3422 return;
3423 }
3424 assert(IVSrc && "Failed to find IV chain source");
3425
3426 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3427 Type *IVTy = IVSrc->getType();
3428 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3429 const SCEV *LeftOverExpr = nullptr;
3430 const SCEV *Accum = SE.getZero(IntTy);
3431  SmallVector<std::pair<const SCEV *, Value *>> Bases;
3432  Bases.emplace_back(Accum, IVSrc);
3433
3434 for (const IVInc &Inc : Chain) {
3435 Instruction *InsertPt = Inc.UserInst;
3436 if (isa<PHINode>(InsertPt))
3437 InsertPt = L->getLoopLatch()->getTerminator();
3438
3439 // IVOper will replace the current IV User's operand. IVSrc is the IV
3440 // value currently held in a register.
3441 Value *IVOper = IVSrc;
3442 if (!Inc.IncExpr->isZero()) {
3443 // IncExpr was the result of subtraction of two narrow values, so must
3444 // be signed.
3445 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3446 Accum = SE.getAddExpr(Accum, IncExpr);
3447 LeftOverExpr = LeftOverExpr ?
3448 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3449 }
3450
3451 // Look through each base to see if any can produce a nice addressing mode.
3452 bool FoundBase = false;
3453 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3454 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3455 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3456 if (!Remainder->isZero()) {
3457 Rewriter.clearPostInc();
3458 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3459 const SCEV *IVOperExpr =
3460 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3461 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3462 } else {
3463 IVOper = MapIVOper;
3464 }
3465
3466 FoundBase = true;
3467 break;
3468 }
3469 }
3470 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3471 // Expand the IV increment.
3472 Rewriter.clearPostInc();
3473 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3474 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3475 SE.getUnknown(IncV));
3476 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3477
3478 // If an IV increment can't be folded, use it as the next IV value.
3479 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3480 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3481 Bases.emplace_back(Accum, IVOper);
3482 IVSrc = IVOper;
3483 LeftOverExpr = nullptr;
3484 }
3485 }
3486 Type *OperTy = Inc.IVOperand->getType();
3487 if (IVTy != OperTy) {
3488 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3489 "cannot extend a chained IV");
3490 IRBuilder<> Builder(InsertPt);
3491 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3492 }
3493 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3494 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3495 DeadInsts.emplace_back(OperandIsInstr);
3496 }
3497 // If LSR created a new, wider phi, we may also replace its postinc. We only
3498 // do this if we also found a wide value for the head of the chain.
3499 if (isa<PHINode>(Chain.tailUserInst())) {
3500 for (PHINode &Phi : L->getHeader()->phis()) {
3501 if (Phi.getType() != IVSrc->getType())
3502 continue;
3503 Instruction *PostIncV = dyn_cast<Instruction>(
3504 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3505 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3506 continue;
3507 Value *IVOper = IVSrc;
3508 Type *PostIncTy = PostIncV->getType();
3509 if (IVTy != PostIncTy) {
3510 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3511 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3512 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3513 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3514 }
3515 Phi.replaceUsesOfWith(PostIncV, IVOper);
3516 DeadInsts.emplace_back(PostIncV);
3517 }
3518 }
3519}
3520
3521void LSRInstance::CollectFixupsAndInitialFormulae() {
3522 BranchInst *ExitBranch = nullptr;
3523 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3524
3525 // For calculating baseline cost
3526  SmallPtrSet<const SCEV *, 16> Regs;
3527  DenseSet<const SCEV *> VisitedRegs;
3528 DenseSet<size_t> VisitedLSRUse;
3529
3530 for (const IVStrideUse &U : IU) {
3531 Instruction *UserInst = U.getUser();
3532 // Skip IV users that are part of profitable IV Chains.
3533 User::op_iterator UseI =
3534 find(UserInst->operands(), U.getOperandValToReplace());
3535 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3536 if (IVIncSet.count(UseI)) {
3537 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3538 continue;
3539 }
3540
3541 LSRUse::KindType Kind = LSRUse::Basic;
3542 MemAccessTy AccessTy;
3543 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3544 Kind = LSRUse::Address;
3545 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3546 }
3547
3548 const SCEV *S = IU.getExpr(U);
3549 if (!S)
3550 continue;
3551 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3552
3553 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3554 // (N - i == 0), and this allows (N - i) to be the expression that we work
3555 // with rather than just N or i, so we can consider the register
3556 // requirements for both N and i at the same time. Limiting this code to
3557 // equality icmps is not a problem because all interesting loops use
3558 // equality icmps, thanks to IndVarSimplify.
3559 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3560      // If CI can be saved on some targets (e.g. replaced by a hardware loop
3561      // on PowerPC), there is no need to generate initial formulae for it.
3562 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3563 continue;
3564 if (CI->isEquality()) {
3565 // Swap the operands if needed to put the OperandValToReplace on the
3566 // left, for consistency.
3567 Value *NV = CI->getOperand(1);
3568 if (NV == U.getOperandValToReplace()) {
3569 CI->setOperand(1, CI->getOperand(0));
3570 CI->setOperand(0, NV);
3571 NV = CI->getOperand(1);
3572 Changed = true;
3573 }
3574
3575 // x == y --> x - y == 0
3576 const SCEV *N = SE.getSCEV(NV);
3577 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3578 (!NV->getType()->isPointerTy() ||
3579 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3580 // S is normalized, so normalize N before folding it into S
3581 // to keep the result normalized.
3582 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3583 if (!N)
3584 continue;
3585 Kind = LSRUse::ICmpZero;
3586 S = SE.getMinusSCEV(N, S);
3587 } else if (L->isLoopInvariant(NV) &&
3588 (!isa<Instruction>(NV) ||
3589 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3590 !NV->getType()->isPointerTy()) {
3591 // If we can't generally expand the expression (e.g. it contains
3592 // a divide), but it is already at a loop invariant point before the
3593 // loop, wrap it in an unknown (to prevent the expander from trying
3594 // to re-expand in a potentially unsafe way.) The restriction to
3595 // integer types is required because the unknown hides the base, and
3596 // SCEV can't compute the difference of two unknown pointers.
3597 N = SE.getUnknown(NV);
3598 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3599 if (!N)
3600 continue;
3601 Kind = LSRUse::ICmpZero;
3602 S = SE.getMinusSCEV(N, S);
3603 assert(!isa<SCEVCouldNotCompute>(S));
3604 }
3605
3606 // -1 and the negations of all interesting strides (except the negation
3607 // of -1) are now also interesting.
3608 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3609 if (Factors[i] != -1)
3610 Factors.insert(-(uint64_t)Factors[i]);
3611 Factors.insert(-1);
3612 }
3613 }
3614
3615 // Get or create an LSRUse.
3616 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3617 size_t LUIdx = P.first;
3618 Immediate Offset = P.second;
3619 LSRUse &LU = Uses[LUIdx];
3620
3621 // Record the fixup.
3622 LSRFixup &LF = LU.getNewFixup();
3623 LF.UserInst = UserInst;
3624 LF.OperandValToReplace = U.getOperandValToReplace();
3625 LF.PostIncLoops = TmpPostIncLoops;
3626 LF.Offset = Offset;
3627 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3628
3629 // Create SCEV as Formula for calculating baseline cost
3630 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3631 Formula F;
3632 F.initialMatch(S, L, SE);
3633 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3634 VisitedLSRUse.insert(LUIdx);
3635 }
3636
3637 if (!LU.WidestFixupType ||
3638 SE.getTypeSizeInBits(LU.WidestFixupType) <
3639 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3640 LU.WidestFixupType = LF.OperandValToReplace->getType();
3641
3642 // If this is the first use of this LSRUse, give it a formula.
3643 if (LU.Formulae.empty()) {
3644 InsertInitialFormula(S, LU, LUIdx);
3645 CountRegisters(LU.Formulae.back(), LUIdx);
3646 }
3647 }
3648
3649 LLVM_DEBUG(print_fixups(dbgs()));
3650}
3651
3652/// Insert a formula for the given expression into the given use, separating out
3653/// loop-variant portions from loop-invariant and loop-computable portions.
3654void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3655 size_t LUIdx) {
3656 // Mark uses whose expressions cannot be expanded.
3657 if (!Rewriter.isSafeToExpand(S))
3658 LU.RigidFormula = true;
3659
3660 Formula F;
3661 F.initialMatch(S, L, SE);
3662 bool Inserted = InsertFormula(LU, LUIdx, F);
3663 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3664}
3665
3666/// Insert a simple single-register formula for the given expression into the
3667/// given use.
3668void
3669LSRInstance::InsertSupplementalFormula(const SCEV *S,
3670 LSRUse &LU, size_t LUIdx) {
3671 Formula F;
3672 F.BaseRegs.push_back(S);
3673 F.HasBaseReg = true;
3674 bool Inserted = InsertFormula(LU, LUIdx, F);
3675 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3676}
3677
3678/// Note which registers are used by the given formula, updating RegUses.
3679void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3680 if (F.ScaledReg)
3681 RegUses.countRegister(F.ScaledReg, LUIdx);
3682 for (const SCEV *BaseReg : F.BaseRegs)
3683 RegUses.countRegister(BaseReg, LUIdx);
3684}
3685
3686/// If the given formula has not yet been inserted, add it to the list, and
3687/// return true. Return false otherwise.
3688bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3689 // Do not insert formula that we will not be able to expand.
3690 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3691 "Formula is illegal");
3692
3693 if (!LU.InsertFormula(F, *L))
3694 return false;
3695
3696 CountRegisters(F, LUIdx);
3697 return true;
3698}
3699
3700/// Check for other uses of loop-invariant values which we're tracking. These
3701/// other uses will pin these values in registers, making them less profitable
3702/// for elimination.
3703/// TODO: This currently misses non-constant addrec step registers.
3704/// TODO: Should this give more weight to users inside the loop?
3705void
3706LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3707 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3708  SmallPtrSet<const SCEV *, 32> Visited;
3709
3710 // Don't collect outside uses if we are favoring postinc - the instructions in
3711 // the loop are more important than the ones outside of it.
3712 if (AMK == TTI::AMK_PostIndexed)
3713 return;
3714
3715 while (!Worklist.empty()) {
3716 const SCEV *S = Worklist.pop_back_val();
3717
3718 // Don't process the same SCEV twice
3719 if (!Visited.insert(S).second)
3720 continue;
3721
3722 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3723 append_range(Worklist, N->operands());
3724 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3725 Worklist.push_back(C->getOperand());
3726 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3727 Worklist.push_back(D->getLHS());
3728 Worklist.push_back(D->getRHS());
3729 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3730 const Value *V = US->getValue();
3731 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3732 // Look for instructions defined outside the loop.
3733 if (L->contains(Inst)) continue;
3734 } else if (isa<Constant>(V))
3735 // Constants can be re-materialized.
3736 continue;
3737 for (const Use &U : V->uses()) {
3738 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3739 // Ignore non-instructions.
3740 if (!UserInst)
3741 continue;
3742 // Don't bother if the instruction is an EHPad.
3743 if (UserInst->isEHPad())
3744 continue;
3745 // Ignore instructions in other functions (as can happen with
3746 // Constants).
3747 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3748 continue;
3749 // Ignore instructions not dominated by the loop.
3750 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3751 UserInst->getParent() :
3752 cast<PHINode>(UserInst)->getIncomingBlock(
3753              PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3754        if (!DT.dominates(L->getHeader(), UseBB))
3755 continue;
3756 // Don't bother if the instruction is in a BB which ends in an EHPad.
3757 if (UseBB->getTerminator()->isEHPad())
3758 continue;
3759
3760 // Ignore cases in which the currently-examined value could come from
3761 // a basic block terminated with an EHPad. This checks all incoming
3762 // blocks of the phi node since it is possible that the same incoming
3763 // value comes from multiple basic blocks, only some of which may end
3764 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3765 // pass would try to insert instructions into an EHPad, hitting an
3766 // assertion.
3767 if (isa<PHINode>(UserInst)) {
3768 const auto *PhiNode = cast<PHINode>(UserInst);
3769 bool HasIncompatibleEHPTerminatedBlock = false;
3770 llvm::Value *ExpectedValue = U;
3771 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3772 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3773 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3774 HasIncompatibleEHPTerminatedBlock = true;
3775 break;
3776 }
3777 }
3778 }
3779 if (HasIncompatibleEHPTerminatedBlock) {
3780 continue;
3781 }
3782 }
3783
3784 // Don't bother rewriting PHIs in catchswitch blocks.
3785 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3786 continue;
3787 // Ignore uses which are part of other SCEV expressions, to avoid
3788 // analyzing them multiple times.
3789 if (SE.isSCEVable(UserInst->getType())) {
3790 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3791 // If the user is a no-op, look through to its uses.
3792 if (!isa<SCEVUnknown>(UserS))
3793 continue;
3794 if (UserS == US) {
3795 Worklist.push_back(
3796 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3797 continue;
3798 }
3799 }
3800 // Ignore icmp instructions which are already being analyzed.
3801 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3802 unsigned OtherIdx = !U.getOperandNo();
3803 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3804 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3805 continue;
3806 }
3807
3808 std::pair<size_t, Immediate> P =
3809 getUse(S, LSRUse::Basic, MemAccessTy());
3810 size_t LUIdx = P.first;
3811 Immediate Offset = P.second;
3812 LSRUse &LU = Uses[LUIdx];
3813 LSRFixup &LF = LU.getNewFixup();
3814 LF.UserInst = const_cast<Instruction *>(UserInst);
3815 LF.OperandValToReplace = U;
3816 LF.Offset = Offset;
3817 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3818 if (!LU.WidestFixupType ||
3819 SE.getTypeSizeInBits(LU.WidestFixupType) <
3820 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3821 LU.WidestFixupType = LF.OperandValToReplace->getType();
3822 InsertSupplementalFormula(US, LU, LUIdx);
3823 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3824 break;
3825 }
3826 }
3827 }
3828}
3829
3830/// Split S into subexpressions which can be pulled out into separate
3831/// registers. If C is non-null, multiply each subexpression by C.
3832///
3833/// Return remainder expression after factoring the subexpressions captured by
3834/// Ops. If Ops is complete, return NULL.
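/// For example (illustrative): splitting {(%a + %b),+,4}<%L> with a null C
/// pushes %a and %b onto Ops and returns the rewritten recurrence {0,+,4}<%L>
/// as the remainder.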
3835static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3836                                   SmallVectorImpl<const SCEV *> &Ops,
3837                                   const Loop *L,
3838 ScalarEvolution &SE,
3839 unsigned Depth = 0) {
3840 // Arbitrarily cap recursion to protect compile time.
3841 if (Depth >= 3)
3842 return S;
3843
3844 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3845 // Break out add operands.
3846 for (const SCEV *S : Add->operands()) {
3847 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3848 if (Remainder)
3849 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3850 }
3851 return nullptr;
3852 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3853 // Split a non-zero base out of an addrec.
3854 if (AR->getStart()->isZero() || !AR->isAffine())
3855 return S;
3856
3857 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3858 C, Ops, L, SE, Depth+1);
3859 // Split the non-zero AddRec unless it is part of a nested recurrence that
3860 // does not pertain to this loop.
3861 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3862 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3863 Remainder = nullptr;
3864 }
3865 if (Remainder != AR->getStart()) {
3866 if (!Remainder)
3867 Remainder = SE.getConstant(AR->getType(), 0);
3868 return SE.getAddRecExpr(Remainder,
3869 AR->getStepRecurrence(SE),
3870 AR->getLoop(),
3871 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3872                              SCEV::FlagAnyWrap);
3873    }
3874 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3875 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3876 if (Mul->getNumOperands() != 2)
3877 return S;
3878 if (const SCEVConstant *Op0 =
3879 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3880 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3881 const SCEV *Remainder =
3882 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3883 if (Remainder)
3884 Ops.push_back(SE.getMulExpr(C, Remainder));
3885 return nullptr;
3886 }
3887 }
3888 return S;
3889}
3890
3891/// Return true if the SCEV represents a value that may end up as a
3892/// post-increment operation.
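/// For illustration: an address recurrence such as {%base,+,4}<%L> whose start
/// %base is loop-invariant but not a plain constant may be better formed with
/// a post-indexed load or store on targets that report post-increment
/// addressing as legal.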
3893static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3894                              LSRUse &LU, const SCEV *S, const Loop *L,
3895 ScalarEvolution &SE) {
3896 if (LU.Kind != LSRUse::Address ||
3897 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3898 return false;
3899 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3900 if (!AR)
3901 return false;
3902 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3903 if (!isa<SCEVConstant>(LoopStep))
3904 return false;
3905 // Check if a post-indexed load/store can be used.
3906  if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3907      TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3908    const SCEV *LoopStart = AR->getStart();
3909 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3910 return true;
3911 }
3912 return false;
3913}
3914
3915/// Helper function for LSRInstance::GenerateReassociations.
3916void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3917 const Formula &Base,
3918 unsigned Depth, size_t Idx,
3919 bool IsScaledReg) {
3920 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3921 // Don't generate reassociations for the base register of a value that
3922 // may generate a post-increment operator. The reason is that the
3923  // reassociations cause extra base+register formulae to be created,
3924 // and possibly chosen, but the post-increment is more efficient.
3925 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3926 return;
3927  SmallVector<const SCEV *, 8> AddOps;
3928  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3929 if (Remainder)
3930 AddOps.push_back(Remainder);
3931
3932 if (AddOps.size() == 1)
3933 return;
3934
3935  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3936                                                     JE = AddOps.end();
3937 J != JE; ++J) {
3938 // Loop-variant "unknown" values are uninteresting; we won't be able to
3939 // do anything meaningful with them.
3940 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3941 continue;
3942
3943 // Don't pull a constant into a register if the constant could be folded
3944 // into an immediate field.
3945 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3946 LU.AccessTy, *J, Base.getNumRegs() > 1))
3947 continue;
3948
3949 // Collect all operands except *J.
3950 SmallVector<const SCEV *, 8> InnerAddOps(
3951 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3952 InnerAddOps.append(std::next(J),
3953 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3954
3955 // Don't leave just a constant behind in a register if the constant could
3956 // be folded into an immediate field.
3957 if (InnerAddOps.size() == 1 &&
3958 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3959 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3960 continue;
3961
3962 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3963 if (InnerSum->isZero())
3964 continue;
3965 Formula F = Base;
3966
3967 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3968 continue;
3969
3970 // Add the remaining pieces of the add back into the new formula.
3971 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3972 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3973 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3974 InnerSumSC->getValue()->getZExtValue())) {
3975 F.UnfoldedOffset =
3976 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3977 InnerSumSC->getValue()->getZExtValue());
3978 if (IsScaledReg) {
3979 F.ScaledReg = nullptr;
3980 F.Scale = 0;
3981 } else
3982 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3983 } else if (IsScaledReg)
3984 F.ScaledReg = InnerSum;
3985 else
3986 F.BaseRegs[Idx] = InnerSum;
3987
3988 // Add J as its own register, or an unfolded immediate.
3989 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3990 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3991 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3992 SC->getValue()->getZExtValue()))
3993 F.UnfoldedOffset =
3994 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3995 SC->getValue()->getZExtValue());
3996 else
3997 F.BaseRegs.push_back(*J);
3998    // We may have changed the number of registers in the base regs; adjust the
3999    // formula accordingly.
4000 F.canonicalize(*L);
4001
4002 if (InsertFormula(LU, LUIdx, F))
4003 // If that formula hadn't been seen before, recurse to find more like
4004 // it.
4005      // Also add log16(AddOps.size()), i.e. Log2_32(AddOps.size()) >> 2, to the
4006      // recursion depth, because Depth alone is not enough to bound compile
4007      // time: every time AddOps.size() exceeds 16^x, an extra x is added to
4008      // the effective depth.
4009 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4010 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4011 }
4012}
4013
4014/// Split out subexpressions from adds and the bases of addrecs.
4015void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4016 Formula Base, unsigned Depth) {
4017 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4018 // Arbitrarily cap recursion to protect compile time.
4019 if (Depth >= 3)
4020 return;
4021
4022 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4023 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4024
4025 if (Base.Scale == 1)
4026 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4027 /* Idx */ -1, /* IsScaledReg */ true);
4028}
4029
4030/// Generate a formula consisting of all of the loop-dominating registers added
4031/// into a single register.
4032void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4033 Formula Base) {
4034 // This method is only interesting on a plurality of registers.
4035 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4036 (Base.UnfoldedOffset.isNonZero()) <=
4037 1)
4038 return;
4039
4040 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4041 // processing the formula.
4042 Base.unscale();
4043  SmallVector<const SCEV *, 4> Ops;
4044  Formula NewBase = Base;
4045 NewBase.BaseRegs.clear();
4046 Type *CombinedIntegerType = nullptr;
4047 for (const SCEV *BaseReg : Base.BaseRegs) {
4048 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4049 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4050 if (!CombinedIntegerType)
4051 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4052 Ops.push_back(BaseReg);
4053 }
4054 else
4055 NewBase.BaseRegs.push_back(BaseReg);
4056 }
4057
4058 // If no register is relevant, we're done.
4059 if (Ops.size() == 0)
4060 return;
4061
4062 // Utility function for generating the required variants of the combined
4063 // registers.
4064 auto GenerateFormula = [&](const SCEV *Sum) {
4065 Formula F = NewBase;
4066
4067 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4068 // opportunity to fold something. For now, just ignore such cases
4069 // rather than proceed with zero in a register.
4070 if (Sum->isZero())
4071 return;
4072
4073 F.BaseRegs.push_back(Sum);
4074 F.canonicalize(*L);
4075 (void)InsertFormula(LU, LUIdx, F);
4076 };
4077
4078 // If we collected at least two registers, generate a formula combining them.
4079 if (Ops.size() > 1) {
4080 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4081 GenerateFormula(SE.getAddExpr(OpsCopy));
4082 }
4083
4084 // If we have an unfolded offset, generate a formula combining it with the
4085 // registers collected.
4086 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4087 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4088 Ops.push_back(SE.getConstant(CombinedIntegerType,
4089 NewBase.UnfoldedOffset.getFixedValue(), true));
4090 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4091 GenerateFormula(SE.getAddExpr(Ops));
4092 }
4093}
4094
4095/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4096void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4097 const Formula &Base, size_t Idx,
4098 bool IsScaledReg) {
4099 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4100 GlobalValue *GV = ExtractSymbol(G, SE);
4101 if (G->isZero() || !GV)
4102 return;
4103 Formula F = Base;
4104 F.BaseGV = GV;
4105 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4106 return;
4107 if (IsScaledReg)
4108 F.ScaledReg = G;
4109 else
4110 F.BaseRegs[Idx] = G;
4111 (void)InsertFormula(LU, LUIdx, F);
4112}
4113
4114/// Generate reuse formulae using symbolic offsets.
4115void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4116 Formula Base) {
4117 // We can't add a symbolic offset if the address already contains one.
4118 if (Base.BaseGV) return;
4119
4120 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4121 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4122 if (Base.Scale == 1)
4123 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4124 /* IsScaledReg */ true);
4125}
4126
4127/// Helper function for LSRInstance::GenerateConstantOffsets.
4128void LSRInstance::GenerateConstantOffsetsImpl(
4129 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4130 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4131
4132 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4133 Formula F = Base;
4134 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4135 return;
4136 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4137
4138 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4139 // Add the offset to the base register.
4140 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4141 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4142 // If it cancelled out, drop the base register, otherwise update it.
4143 if (NewG->isZero()) {
4144 if (IsScaledReg) {
4145 F.Scale = 0;
4146 F.ScaledReg = nullptr;
4147 } else
4148 F.deleteBaseReg(F.BaseRegs[Idx]);
4149 F.canonicalize(*L);
4150 } else if (IsScaledReg)
4151 F.ScaledReg = NewG;
4152 else
4153 F.BaseRegs[Idx] = NewG;
4154
4155 (void)InsertFormula(LU, LUIdx, F);
4156 }
4157 };
4158
4159 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4160
4161 // With constant offsets and constant steps, we can generate pre-inc
4162 // accesses by having the offset equal the step. So, for access #0 with a
4163 // step of 8, we generate a G - 8 base which would require the first access
4164 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4165 // for itself and hopefully becomes the base for other accesses. This means
4166  // that a single pre-indexed access can be generated to become the new
4167 // base pointer for each iteration of the loop, resulting in no extra add/sub
4168 // instructions for pointer updating.
4169 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4170 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
4171 if (auto *StepRec =
4172 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
4173 const APInt &StepInt = StepRec->getAPInt();
4174 int64_t Step = StepInt.isNegative() ?
4175 StepInt.getSExtValue() : StepInt.getZExtValue();
4176
4177 for (Immediate Offset : Worklist) {
4178 if (Offset.isFixed()) {
4179 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4180 GenerateOffset(G, Offset);
4181 }
4182 }
4183 }
4184 }
4185 }
4186 for (Immediate Offset : Worklist)
4187 GenerateOffset(G, Offset);
4188
4189 Immediate Imm = ExtractImmediate(G, SE);
4190 if (G->isZero() || Imm.isZero() ||
4191 !Base.BaseOffset.isCompatibleImmediate(Imm))
4192 return;
4193 Formula F = Base;
4194 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4195 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4196 return;
4197 if (IsScaledReg) {
4198 F.ScaledReg = G;
4199 } else {
4200 F.BaseRegs[Idx] = G;
4201    // We may generate a non-canonical Formula if G is a recurrence register
4202    // related to the current loop while F.ScaledReg is not.
4203 F.canonicalize(*L);
4204 }
4205 (void)InsertFormula(LU, LUIdx, F);
4206}
4207
4208/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4209void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4210 Formula Base) {
4211 // TODO: For now, just add the min and max offset, because it usually isn't
4212  // worthwhile looking at everything in between.
4213  SmallVector<Immediate, 2> Worklist;
4214  Worklist.push_back(LU.MinOffset);
4215 if (LU.MaxOffset != LU.MinOffset)
4216 Worklist.push_back(LU.MaxOffset);
4217
4218 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4219 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4220 if (Base.Scale == 1)
4221 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4222 /* IsScaledReg */ true);
4223}
4224
4225/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4226/// == y -> x*c == y*c.
4227void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4228 Formula Base) {
4229 if (LU.Kind != LSRUse::ICmpZero) return;
4230
4231 // Determine the integer type for the base formula.
4232 Type *IntTy = Base.getType();
4233 if (!IntTy) return;
4234 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4235
4236 // Don't do this if there is more than one offset.
4237 if (LU.MinOffset != LU.MaxOffset) return;
4238
4239  // Check if the transformation is valid. It is illegal to multiply a pointer.
4240 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4241 return;
4242 for (const SCEV *BaseReg : Base.BaseRegs)
4243 if (BaseReg->getType()->isPointerTy())
4244 return;
4245 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4246
4247 // Check each interesting stride.
4248 for (int64_t Factor : Factors) {
4249 // Check that Factor can be represented by IntTy
4250 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4251 continue;
4252 // Check that the multiplication doesn't overflow.
4253 if (Base.BaseOffset.isMin() && Factor == -1)
4254 continue;
4255 // Not supporting scalable immediates.
4256 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4257 continue;
4258 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4259 assert(Factor != 0 && "Zero factor not expected!");
4260 if (NewBaseOffset.getFixedValue() / Factor !=
4261 Base.BaseOffset.getFixedValue())
4262 continue;
4263 // If the offset will be truncated at this use, check that it is in bounds.
4264 if (!IntTy->isPointerTy() &&
4265 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4266 continue;
4267
4268 // Check that multiplying with the use offset doesn't overflow.
4269 Immediate Offset = LU.MinOffset;
4270 if (Offset.isMin() && Factor == -1)
4271 continue;
4272 Offset = Offset.mulUnsigned(Factor);
4273 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4274 continue;
4275 // If the offset will be truncated at this use, check that it is in bounds.
4276 if (!IntTy->isPointerTy() &&
4277 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4278 continue;
4279
4280 Formula F = Base;
4281 F.BaseOffset = NewBaseOffset;
4282
4283 // Check that this scale is legal.
4284 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4285 continue;
4286
4287 // Compensate for the use having MinOffset built into it.
4288 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4289
4290 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4291
4292 // Check that multiplying with each base register doesn't overflow.
4293 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4294 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4295 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4296 goto next;
4297 }
4298
4299 // Check that multiplying with the scaled register doesn't overflow.
4300 if (F.ScaledReg) {
4301 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4302 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4303 continue;
4304 }
4305
4306 // Check that multiplying with the unfolded offset doesn't overflow.
4307 if (F.UnfoldedOffset.isNonZero()) {
4308 if (F.UnfoldedOffset.isMin() && Factor == -1)
4309 continue;
4310 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4311 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4312 Base.UnfoldedOffset.getFixedValue())
4313 continue;
4314 // If the offset will be truncated, check that it is in bounds.
4315      if (!ConstantInt::isValueValidForType(
4316              IntTy, F.UnfoldedOffset.getFixedValue()))
4317 continue;
4318 }
4319
4320 // If we make it here and it's legal, add it.
4321 (void)InsertFormula(LU, LUIdx, F);
4322 next:;
4323 }
4324}
4325
4326/// Generate stride factor reuse formulae by making use of scaled-offset address
4327/// modes, for example.
4328void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4329 // Determine the integer type for the base formula.
4330 Type *IntTy = Base.getType();
4331 if (!IntTy) return;
4332
4333 // If this Formula already has a scaled register, we can't add another one.
4334 // Try to unscale the formula to generate a better scale.
4335 if (Base.Scale != 0 && !Base.unscale())
4336 return;
4337
4338  assert(Base.Scale == 0 && "unscale did not do its job!");
4339
4340 // Check each interesting stride.
4341 for (int64_t Factor : Factors) {
4342 Base.Scale = Factor;
4343 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4344 // Check whether this scale is going to be legal.
4345 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4346 Base)) {
4347      // As a special case, treat out-of-loop Basic users as Special users.
4348 // TODO: Reconsider this special case.
4349 if (LU.Kind == LSRUse::Basic &&
4350 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4351 LU.AccessTy, Base) &&
4352 LU.AllFixupsOutsideLoop)
4353 LU.Kind = LSRUse::Special;
4354 else
4355 continue;
4356 }
4357 // For an ICmpZero, negating a solitary base register won't lead to
4358 // new solutions.
4359 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4360 Base.BaseOffset.isZero() && !Base.BaseGV)
4361 continue;
4362 // For each addrec base reg, if its loop is current loop, apply the scale.
4363 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4364 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4365 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4366 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4367 if (FactorS->isZero())
4368 continue;
4369 // Divide out the factor, ignoring high bits, since we'll be
4370 // scaling the value back up in the end.
4371 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4372 if (!Quotient->isZero()) {
4373 // TODO: This could be optimized to avoid all the copying.
4374 Formula F = Base;
4375 F.ScaledReg = Quotient;
4376 F.deleteBaseReg(F.BaseRegs[i]);
4377 // The canonical representation of 1*reg is reg, which is already in
4378 // Base. In that case, do not try to insert the formula, it will be
4379 // rejected anyway.
4380 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4381 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4382 continue;
4383 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4384 // non canonical Formula with ScaledReg's loop not being L.
4385 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4386 F.canonicalize(*L);
4387 (void)InsertFormula(LU, LUIdx, F);
4388 }
4389 }
4390 }
4391 }
4392}
4393
4394/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4395/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4396/// perform the extension/truncate and normalize again, as the normalized form
4397/// can result in folds that are not valid in the post-inc use contexts. The
4398/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4399static const SCEV *
4400getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4401                                   const SCEV *Expr, Type *ToTy,
4402 ScalarEvolution &SE) {
4403 const SCEV *Result = nullptr;
4404 for (auto &L : Loops) {
4405 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4406 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4407 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4408 if (!New || (Result && New != Result))
4409 return nullptr;
4410 Result = New;
4411 }
4412
4413 assert(Result && "failed to create expression");
4414 return Result;
4415}
4416
4417/// Generate reuse formulae from different IV types.
4418void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4419 // Don't bother truncating symbolic values.
4420 if (Base.BaseGV) return;
4421
4422 // Determine the integer type for the base formula.
4423 Type *DstTy = Base.getType();
4424 if (!DstTy) return;
4425 if (DstTy->isPointerTy())
4426 return;
4427
4428 // It is invalid to extend a pointer type so exit early if ScaledReg or
4429 // any of the BaseRegs are pointers.
4430 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4431 return;
4432 if (any_of(Base.BaseRegs,
4433 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4434 return;
4435
4436  SmallVector<PostIncLoopSet> Loops;
4437  for (auto &LF : LU.Fixups)
4438 Loops.push_back(LF.PostIncLoops);
4439
4440 for (Type *SrcTy : Types) {
4441 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4442 Formula F = Base;
4443
4444      // Sometimes SCEV is able to prove zero during an ext transform. This may
4445      // happen if SCEV did not do all possible transforms while creating the
4446      // initial node (maybe due to depth limitations), but it can do them while
4447      // taking the ext.
4448 if (F.ScaledReg) {
4449 const SCEV *NewScaledReg =
4450 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4451 if (!NewScaledReg || NewScaledReg->isZero())
4452 continue;
4453 F.ScaledReg = NewScaledReg;
4454 }
4455 bool HasZeroBaseReg = false;
4456 for (const SCEV *&BaseReg : F.BaseRegs) {
4457 const SCEV *NewBaseReg =
4458 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4459 if (!NewBaseReg || NewBaseReg->isZero()) {
4460 HasZeroBaseReg = true;
4461 break;
4462 }
4463 BaseReg = NewBaseReg;
4464 }
4465 if (HasZeroBaseReg)
4466 continue;
4467
4468 // TODO: This assumes we've done basic processing on all uses and
4469 // have an idea what the register usage is.
4470 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4471 continue;
4472
4473 F.canonicalize(*L);
4474 (void)InsertFormula(LU, LUIdx, F);
4475 }
4476 }
4477}
4478
4479namespace {
4480
4481/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4482/// modifications so that the search phase doesn't have to worry about the data
4483/// structures moving underneath it.
4484struct WorkItem {
4485 size_t LUIdx;
4486 Immediate Imm;
4487 const SCEV *OrigReg;
4488
4489 WorkItem(size_t LI, Immediate I, const SCEV *R)
4490 : LUIdx(LI), Imm(I), OrigReg(R) {}
4491
4492 void print(raw_ostream &OS) const;
4493 void dump() const;
4494};
4495
4496} // end anonymous namespace
4497
4498#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4499void WorkItem::print(raw_ostream &OS) const {
4500 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4501 << " , add offset " << Imm;
4502}
4503
4504LLVM_DUMP_METHOD void WorkItem::dump() const {
4505 print(errs()); errs() << '\n';
4506}
4507#endif
4508
4509/// Look for registers which are a constant distance apart and try to form reuse
4510/// opportunities between them.
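/// For example, if some formulae use the registers {%base,+,4}<%L> and
/// {(%base + 256),+,4}<%L>, the two differ only by the constant 256, so uses
/// of one can often be rewritten as the other plus an immediate offset,
/// freeing a register.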
4511void LSRInstance::GenerateCrossUseConstantOffsets() {
4512 // Group the registers by their value without any added constant offset.
4513 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4514
4515  DenseMap<const SCEV *, ImmMapTy> Map;
4516  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4517  SmallVector<const SCEV *, 8> Sequence;
4518  for (const SCEV *Use : RegUses) {
4519 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4520 Immediate Imm = ExtractImmediate(Reg, SE);
4521 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4522 if (Pair.second)
4523 Sequence.push_back(Reg);
4524 Pair.first->second.insert(std::make_pair(Imm, Use));
4525 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4526 }
4527
4528 // Now examine each set of registers with the same base value. Build up
4529 // a list of work to do and do the work in a separate step so that we're
4530 // not adding formulae and register counts while we're searching.
4531 SmallVector<WorkItem, 32> WorkItems;
4532 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4533 UniqueItems;
4534 for (const SCEV *Reg : Sequence) {
4535 const ImmMapTy &Imms = Map.find(Reg)->second;
4536
4537 // It's not worthwhile looking for reuse if there's only one offset.
4538 if (Imms.size() == 1)
4539 continue;
4540
4541 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4542 for (const auto &Entry
4543 : Imms) dbgs()
4544 << ' ' << Entry.first;
4545 dbgs() << '\n');
4546
4547 // Examine each offset.
4548 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4549 J != JE; ++J) {
4550 const SCEV *OrigReg = J->second;
4551
4552 Immediate JImm = J->first;
4553 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4554
4555 if (!isa<SCEVConstant>(OrigReg) &&
4556 UsedByIndicesMap[Reg].count() == 1) {
4557 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4558 << '\n');
4559 continue;
4560 }
4561
4562      // Conservatively examine offsets between this orig reg and a few selected
4563      // other orig regs.
4564 Immediate First = Imms.begin()->first;
4565 Immediate Last = std::prev(Imms.end())->first;
4566 if (!First.isCompatibleImmediate(Last)) {
4567 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4568 << "\n");
4569 continue;
4570 }
4571 // Only scalable if both terms are scalable, or if one is scalable and
4572 // the other is 0.
4573 bool Scalable = First.isScalable() || Last.isScalable();
4574 int64_t FI = First.getKnownMinValue();
4575 int64_t LI = Last.getKnownMinValue();
4576 // Compute (First + Last) / 2 without overflow using the fact that
4577      // First + Last = 2 * (First & Last) + (First ^ Last).
4578 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4579 // If the result is negative and FI is odd and LI even (or vice versa),
4580 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4581 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
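      // Worked example of the two steps above: FI = 6, LI = 9 gives
      // (6 & 9) + ((6 ^ 9) >> 1) = 0 + 7 = 7, while FI = -3, LI = 2 first
      // gives 0 + (-1) = -1, after which the correction term evaluates to 1,
      // yielding 0 (-0.5 rounded towards zero rather than towards -inf).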
4582 ImmMapTy::const_iterator OtherImms[] = {
4583 Imms.begin(), std::prev(Imms.end()),
4584 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4585 for (const auto &M : OtherImms) {
4586 if (M == J || M == JE) continue;
4587 if (!JImm.isCompatibleImmediate(M->first))
4588 continue;
4589
4590 // Compute the difference between the two.
4591 Immediate Imm = JImm.subUnsigned(M->first);
4592 for (unsigned LUIdx : UsedByIndices.set_bits())
4593 // Make a memo of this use, offset, and register tuple.
4594 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4595 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4596 }
4597 }
4598 }
4599
4600 Map.clear();
4601 Sequence.clear();
4602 UsedByIndicesMap.clear();
4603 UniqueItems.clear();
4604
4605 // Now iterate through the worklist and add new formulae.
4606 for (const WorkItem &WI : WorkItems) {
4607 size_t LUIdx = WI.LUIdx;
4608 LSRUse &LU = Uses[LUIdx];
4609 Immediate Imm = WI.Imm;
4610 const SCEV *OrigReg = WI.OrigReg;
4611
4612 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4613 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4614 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4615
4616 // TODO: Use a more targeted data structure.
4617 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4618 Formula F = LU.Formulae[L];
4619 // FIXME: The code for the scaled and unscaled registers looks
4620 // very similar but slightly different. Investigate if they
4621 // could be merged. That way, we would not have to unscale the
4622 // Formula.
4623 F.unscale();
4624 // Use the immediate in the scaled register.
4625 if (F.ScaledReg == OrigReg) {
4626 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4627 continue;
4628 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4629 // Don't create 50 + reg(-50).
4630 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4631 if (F.referencesReg(S))
4632 continue;
4633 Formula NewF = F;
4634 NewF.BaseOffset = Offset;
4635 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4636 NewF))
4637 continue;
4638 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4639
4640 // If the new scale is a constant in a register, and adding the constant
4641 // value to the immediate would produce a value closer to zero than the
4642 // immediate itself, then the formula isn't worthwhile.
4643 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4644 // FIXME: Do we need to do something for scalable immediates here?
4645 // A scalable SCEV won't be constant, but we might still have
4646 // something in the offset? Bail out for now to be safe.
4647 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4648 continue;
4649 if (C->getValue()->isNegative() !=
4650 (NewF.BaseOffset.isLessThanZero()) &&
4651 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4652 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4653 continue;
4654 }
4655
4656 // OK, looks good.
4657 NewF.canonicalize(*this->L);
4658 (void)InsertFormula(LU, LUIdx, NewF);
4659 } else {
4660 // Use the immediate in a base register.
4661 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4662 const SCEV *BaseReg = F.BaseRegs[N];
4663 if (BaseReg != OrigReg)
4664 continue;
4665 Formula NewF = F;
4666 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4667 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4668 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4669 continue;
4670 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4671 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4672 LU.Kind, LU.AccessTy, NewF)) {
4673 if (AMK == TTI::AMK_PostIndexed &&
4674 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4675 continue;
4676 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4677 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4678 continue;
4679 NewF = F;
4680 NewF.UnfoldedOffset = NewUnfoldedOffset;
4681 }
4682 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4683
4684 // If the new formula has a constant in a register, and adding the
4685 // constant value to the immediate would produce a value closer to
4686 // zero than the immediate itself, then the formula isn't worthwhile.
4687 for (const SCEV *NewReg : NewF.BaseRegs)
4688 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4689 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4690 goto skip_formula;
4691 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4692 .abs()
4693 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4694 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4695 .countr_zero() >=
4696 (unsigned)llvm::countr_zero<uint64_t>(
4697 NewF.BaseOffset.getFixedValue()))
4698 goto skip_formula;
4699 }
4700
4701 // Ok, looks good.
4702 NewF.canonicalize(*this->L);
4703 (void)InsertFormula(LU, LUIdx, NewF);
4704 break;
4705 skip_formula:;
4706 }
4707 }
4708 }
4709 }
4710}
4711
4712/// Generate formulae for each use.
4713void
4714LSRInstance::GenerateAllReuseFormulae() {
4715 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4716 // queries are more precise.
4717 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4718 LSRUse &LU = Uses[LUIdx];
4719 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4720 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4721 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4722 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4723 }
4724 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4725 LSRUse &LU = Uses[LUIdx];
4726 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4727 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4728 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4730 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4731 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4732 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4733 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4734 }
4735 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4736 LSRUse &LU = Uses[LUIdx];
4737 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4738 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4739 }
4740
4741 GenerateCrossUseConstantOffsets();
4742
4743 LLVM_DEBUG(dbgs() << "\n"
4744 "After generating reuse formulae:\n";
4745 print_uses(dbgs()));
4746}
4747
4748/// If there are multiple formulae with the same set of registers used
4749/// by other uses, pick the best one and delete the others.
4750void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4751 DenseSet<const SCEV *> VisitedRegs;
4752 SmallPtrSet<const SCEV *, 16> Regs;
4753 SmallPtrSet<const SCEV *, 16> LoserRegs;
4754#ifndef NDEBUG
4755 bool ChangedFormulae = false;
4756#endif
4757
4758 // Collect the best formula for each unique set of shared registers. This
4759 // is reset for each use.
4760 using BestFormulaeTy =
4761 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4762
4763 BestFormulaeTy BestFormulae;
4764
4765 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4766 LSRUse &LU = Uses[LUIdx];
4767 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4768 dbgs() << '\n');
4769
4770 bool Any = false;
4771 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4772 FIdx != NumForms; ++FIdx) {
4773 Formula &F = LU.Formulae[FIdx];
4774
4775 // Some formulas are instant losers. For example, they may depend on
4776 // nonexistent AddRecs from other loops. These need to be filtered
4777 // immediately, otherwise heuristics could choose them over others leading
4778 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4779 // avoids the need to recompute this information across formulae using the
4780 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4781 // the corresponding bad register from the Regs set.
4782 Cost CostF(L, SE, TTI, AMK);
4783 Regs.clear();
4784 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4785 if (CostF.isLoser()) {
4786 // During initial formula generation, undesirable formulae are generated
4787 // by uses within other loops that have some non-trivial address mode or
4788 // use the postinc form of the IV. LSR needs to provide these formulae
4789 // as the basis of rediscovering the desired formula that uses an AddRec
4790 // corresponding to the existing phi. Once all formulae have been
4791 // generated, these initial losers may be pruned.
4792 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4793 dbgs() << "\n");
4794 }
4795 else {
4796 SmallVector<const SCEV *, 4> Key;
4797 for (const SCEV *Reg : F.BaseRegs) {
4798 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4799 Key.push_back(Reg);
4800 }
4801 if (F.ScaledReg &&
4802 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4803 Key.push_back(F.ScaledReg);
4804 // Unstable sort by host order ok, because this is only used for
4805 // uniquifying.
4806 llvm::sort(Key);
4807
4808 std::pair<BestFormulaeTy::const_iterator, bool> P =
4809 BestFormulae.insert(std::make_pair(Key, FIdx));
4810 if (P.second)
4811 continue;
4812
4813 Formula &Best = LU.Formulae[P.first->second];
4814
4815 Cost CostBest(L, SE, TTI, AMK);
4816 Regs.clear();
4817 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4818 if (CostF.isLess(CostBest))
4819 std::swap(F, Best);
4820 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4821 dbgs() << "\n"
4822 " in favor of formula ";
4823 Best.print(dbgs()); dbgs() << '\n');
4824 }
4825#ifndef NDEBUG
4826 ChangedFormulae = true;
4827#endif
4828 LU.DeleteFormula(F);
4829 --FIdx;
4830 --NumForms;
4831 Any = true;
4832 }
4833
4834 // Now that we've filtered out some formulae, recompute the Regs set.
4835 if (Any)
4836 LU.RecomputeRegs(LUIdx, RegUses);
4837
4838 // Reset this to prepare for the next use.
4839 BestFormulae.clear();
4840 }
4841
4842 LLVM_DEBUG(if (ChangedFormulae) {
4843 dbgs() << "\n"
4844 "After filtering out undesirable candidates:\n";
4845 print_uses(dbgs());
4846 });
4847}
4848
4849/// Estimate the worst-case number of solutions the solver might have to
4850 /// consider. It almost never considers this many solutions because it prunes the
4851/// search space, but the pruning isn't always sufficient.
4852size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4853 size_t Power = 1;
4854 for (const LSRUse &LU : Uses) {
4855 size_t FSize = LU.Formulae.size();
4856 if (FSize >= ComplexityLimit) {
4857 Power = ComplexityLimit;
4858 break;
4859 }
4860 Power *= FSize;
4861 if (Power >= ComplexityLimit)
4862 break;
4863 }
4864 return Power;
4865}
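// For intuition (illustrative numbers, not from a real compilation): the
// estimate is the product of per-use formula counts, clamped at
// ComplexityLimit. Five uses with six formulae each give a worst case of
// 6^5 = 7776 candidate solutions, so the narrowing heuristics below start
// firing once this product reaches the limit.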
4866
4867/// When one formula uses a superset of the registers of another formula, it
4868/// won't help reduce register pressure (though it may not necessarily hurt
4869/// register pressure); remove it to simplify the system.
4870void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4871 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4872 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4873
4874 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4875 "which use a superset of registers used by other "
4876 "formulae.\n");
4877
4878 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4879 LSRUse &LU = Uses[LUIdx];
4880 bool Any = false;
4881 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4882 Formula &F = LU.Formulae[i];
4883 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4884 continue;
4885 // Look for a formula with a constant or GV in a register. If the use
4886 // also has a formula with that same value in an immediate field,
4887 // delete the one that uses a register.
4888 for (SmallVectorImpl<const SCEV *>::const_iterator
4889 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4890 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4891 Formula NewF = F;
4892 //FIXME: Formulas should store bitwidth to do wrapping properly.
4893 // See PR41034.
4894 NewF.BaseOffset =
4895 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4896 (uint64_t)C->getValue()->getSExtValue());
4897 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4898 (I - F.BaseRegs.begin()));
4899 if (LU.HasFormulaWithSameRegs(NewF)) {
4900 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4901 dbgs() << '\n');
4902 LU.DeleteFormula(F);
4903 --i;
4904 --e;
4905 Any = true;
4906 break;
4907 }
4908 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4909 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4910 if (!F.BaseGV) {
4911 Formula NewF = F;
4912 NewF.BaseGV = GV;
4913 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4914 (I - F.BaseRegs.begin()));
4915 if (LU.HasFormulaWithSameRegs(NewF)) {
4916 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4917 dbgs() << '\n');
4918 LU.DeleteFormula(F);
4919 --i;
4920 --e;
4921 Any = true;
4922 break;
4923 }
4924 }
4925 }
4926 }
4927 }
4928 if (Any)
4929 LU.RecomputeRegs(LUIdx, RegUses);
4930 }
4931
4932 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4933 }
4934}
4935
4936/// When there are many registers for expressions like A, A+1, A+2, etc.,
4937/// allocate a single register for them.
4938void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4939 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4940 return;
4941
4942 LLVM_DEBUG(
4943 dbgs() << "The search space is too complex.\n"
4944 "Narrowing the search space by assuming that uses separated "
4945 "by a constant offset will use the same registers.\n");
4946
4947 // This is especially useful for unrolled loops.
4948
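// For intuition (hypothetical example): in a loop body unrolled by four, the
// addresses A, A+1, A+2 and A+3 would otherwise each want their own register;
// folding the +1/+2/+3 into the uses' immediate offsets lets all four share
// the single register holding A.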
4949 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4950 LSRUse &LU = Uses[LUIdx];
4951 for (const Formula &F : LU.Formulae) {
4952 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4953 continue;
4954
4955 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4956 if (!LUThatHas)
4957 continue;
4958
4959 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4960 LU.Kind, LU.AccessTy))
4961 continue;
4962
4963 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4964
4965 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4966
4967 // Transfer the fixups of LU to LUThatHas.
4968 for (LSRFixup &Fixup : LU.Fixups) {
4969 Fixup.Offset += F.BaseOffset;
4970 LUThatHas->pushFixup(Fixup);
4971 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4972 }
4973
4974 // Delete formulae from the new use which are no longer legal.
4975 bool Any = false;
4976 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4977 Formula &F = LUThatHas->Formulae[i];
4978 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4979 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4980 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4981 LUThatHas->DeleteFormula(F);
4982 --i;
4983 --e;
4984 Any = true;
4985 }
4986 }
4987
4988 if (Any)
4989 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4990
4991 // Delete the old use.
4992 DeleteUse(LU, LUIdx);
4993 --LUIdx;
4994 --NumUses;
4995 break;
4996 }
4997 }
4998
4999 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5000}
5001
5002/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5003/// we've done more filtering, as it may be able to find more formulae to
5004/// eliminate.
5005void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5006 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5007 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5008
5009 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5010 "undesirable dedicated registers.\n");
5011
5012 FilterOutUndesirableDedicatedRegisters();
5013
5014 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5015 }
5016}
5017
5018 /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5019 /// pick the best one and delete the others.
5020 /// This narrowing heuristic keeps as many formulae with different
5021 /// Scale and ScaledReg pairs as possible while narrowing the search space.
5022 /// The benefit is that it is more likely to find a better solution
5023 /// from a formulae set with more Scale and ScaledReg variations than from
5024 /// a formulae set with the same Scale and ScaledReg. The winner-picking
5025 /// register heuristic will often keep the formulae with the same Scale and
5026 /// ScaledReg and filter out the others, and we want to avoid that if possible.
5027void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5028 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5029 return;
5030
5031 LLVM_DEBUG(
5032 dbgs() << "The search space is too complex.\n"
5033 "Narrowing the search space by choosing the best Formula "
5034 "from the Formulae with the same Scale and ScaledReg.\n");
5035
5036 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5037 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5038
5039 BestFormulaeTy BestFormulae;
5040#ifndef NDEBUG
5041 bool ChangedFormulae = false;
5042#endif
5043 DenseSet<const SCEV *> VisitedRegs;
5044 SmallPtrSet<const SCEV *, 16> Regs;
5045
5046 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5047 LSRUse &LU = Uses[LUIdx];
5048 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5049 dbgs() << '\n');
5050
5051 // Return true if Formula FA is better than Formula FB.
5052 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5053 // First we will try to choose the Formula with fewer new registers.
5054 // For a register used by current Formula, the more the register is
5055 // shared among LSRUses, the less we increase the register number
5056 // counter of the formula.
5057 size_t FARegNum = 0;
5058 for (const SCEV *Reg : FA.BaseRegs) {
5059 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5060 FARegNum += (NumUses - UsedByIndices.count() + 1);
5061 }
5062 size_t FBRegNum = 0;
5063 for (const SCEV *Reg : FB.BaseRegs) {
5064 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5065 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5066 }
5067 if (FARegNum != FBRegNum)
5068 return FARegNum < FBRegNum;
5069
5070 // If the new register numbers are the same, choose the Formula with
5071 // less Cost.
5072 Cost CostFA(L, SE, TTI, AMK);
5073 Cost CostFB(L, SE, TTI, AMK);
5074 Regs.clear();
5075 CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
5076 Regs.clear();
5077 CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
5078 return CostFA.isLess(CostFB);
5079 };
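// For intuition (illustrative numbers): the weighting above treats a register
// as cheaper the more LSRUses already share it. With NumUses = 4, a register
// used by three uses contributes 4 - 3 + 1 = 2, while a register private to
// this use contributes 4 - 1 + 1 = 4, so formulae built from widely shared
// registers win the first comparison.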
5080
5081 bool Any = false;
5082 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5083 ++FIdx) {
5084 Formula &F = LU.Formulae[FIdx];
5085 if (!F.ScaledReg)
5086 continue;
5087 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5088 if (P.second)
5089 continue;
5090
5091 Formula &Best = LU.Formulae[P.first->second];
5092 if (IsBetterThan(F, Best))
5093 std::swap(F, Best);
5094 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5095 dbgs() << "\n"
5096 " in favor of formula ";
5097 Best.print(dbgs()); dbgs() << '\n');
5098#ifndef NDEBUG
5099 ChangedFormulae = true;
5100#endif
5101 LU.DeleteFormula(F);
5102 --FIdx;
5103 --NumForms;
5104 Any = true;
5105 }
5106 if (Any)
5107 LU.RecomputeRegs(LUIdx, RegUses);
5108
5109 // Reset this to prepare for the next use.
5110 BestFormulae.clear();
5111 }
5112
5113 LLVM_DEBUG(if (ChangedFormulae) {
5114 dbgs() << "\n"
5115 "After filtering out undesirable candidates:\n";
5116 print_uses(dbgs());
5117 });
5118}
5119
5120 /// If we are over the complexity limit, for uses that prefer post-inc
5121 /// addressing keep only the formulae with the fewest registers.
5122void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5123 if (AMK != TTI::AMK_PostIndexed)
5124 return;
5125 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5126 return;
5127
5128 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5129 "Narrowing the search space by choosing the lowest "
5130 "register Formula for PostInc Uses.\n");
5131
5132 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5133 LSRUse &LU = Uses[LUIdx];
5134
5135 if (LU.Kind != LSRUse::Address)
5136 continue;
5137 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5138 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5139 continue;
5140
5141 size_t MinRegs = std::numeric_limits<size_t>::max();
5142 for (const Formula &F : LU.Formulae)
5143 MinRegs = std::min(F.getNumRegs(), MinRegs);
5144
5145 bool Any = false;
5146 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5147 ++FIdx) {
5148 Formula &F = LU.Formulae[FIdx];
5149 if (F.getNumRegs() > MinRegs) {
5150 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5151 dbgs() << "\n");
5152 LU.DeleteFormula(F);
5153 --FIdx;
5154 --NumForms;
5155 Any = true;
5156 }
5157 }
5158 if (Any)
5159 LU.RecomputeRegs(LUIdx, RegUses);
5160
5161 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5162 break;
5163 }
5164
5165 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5166}
5167
5168 /// This function deletes formulas with a high expected register count.
5169 /// Assuming we don't know the value of each formula (all inefficient
5170 /// ones have already been deleted), compute the probability of each
5171 /// register not being selected.
5172/// For example,
5173/// Use1:
5174/// reg(a) + reg({0,+,1})
5175/// reg(a) + reg({-1,+,1}) + 1
5176/// reg({a,+,1})
5177/// Use2:
5178/// reg(b) + reg({0,+,1})
5179/// reg(b) + reg({-1,+,1}) + 1
5180/// reg({b,+,1})
5181/// Use3:
5182/// reg(c) + reg(b) + reg({0,+,1})
5183/// reg(c) + reg({b,+,1})
5184///
5185/// Probability of not selecting
5186/// Use1 Use2 Use3
5187/// reg(a) (1/3) * 1 * 1
5188/// reg(b) 1 * (1/3) * (1/2)
5189/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5190/// reg({-1,+,1}) (2/3) * (2/3) * 1
5191/// reg({a,+,1}) (2/3) * 1 * 1
5192/// reg({b,+,1}) 1 * (2/3) * (2/3)
5193/// reg(c) 1 * 1 * 0
5194///
5195 /// Now compute the expected register count for each formula:
5196 /// Note that for each use we exclude the probability of not selecting for
5197 /// that use. For example, for Use1 the probability for reg(a) would be just
5198 /// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
5199/// Use1:
5200/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5201/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5202/// reg({a,+,1}) 1
5203/// Use2:
5204/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5205/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5206/// reg({b,+,1}) 2/3
5207/// Use3:
5208/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5209/// reg(c) + reg({b,+,1}) 1 + 2/3
5210void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5211 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5212 return;
5213 // Ok, we have too many formulae on our hands to conveniently handle.
5214 // Use a rough heuristic to thin out the list.
5215
5216 // Set of Regs which will be 100% used in the final solution.
5217 // Used in each formula of a solution (in example above this is reg(c)).
5218 // We can skip them in calculations.
5219 SmallPtrSet<const SCEV *, 4> UniqRegs;
5220 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5221
5222 // Map each register to its probability of not being selected.
5223 DenseMap <const SCEV *, float> RegNumMap;
5224 for (const SCEV *Reg : RegUses) {
5225 if (UniqRegs.count(Reg))
5226 continue;
5227 float PNotSel = 1;
5228 for (const LSRUse &LU : Uses) {
5229 if (!LU.Regs.count(Reg))
5230 continue;
5231 float P = LU.getNotSelectedProbability(Reg);
5232 if (P != 0.0)
5233 PNotSel *= P;
5234 else
5235 UniqRegs.insert(Reg);
5236 }
5237 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5238 }
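// Note (annotation): RegNumMap[Reg] now holds the product of "not selected"
// probabilities over all uses that might pick Reg. The per-formula expectation
// computed below divides that product by the current use's own factor, so each
// use only weighs the other uses' choices, matching the worked example in the
// function comment above.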
5239
5240 LLVM_DEBUG(
5241 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5242
5243 // Delete formulas whose expected register count is high.
5244 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5245 LSRUse &LU = Uses[LUIdx];
5246 // If nothing to delete - continue.
5247 if (LU.Formulae.size() < 2)
5248 continue;
5249 // This is a temporary solution to test performance. Float should be
5250 // replaced with a rounding-independent type (based on integers) to avoid
5251 // different results for different target builds.
5252 float FMinRegNum = LU.Formulae[0].getNumRegs();
5253 float FMinARegNum = LU.Formulae[0].getNumRegs();
5254 size_t MinIdx = 0;
5255 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5256 Formula &F = LU.Formulae[i];
5257 float FRegNum = 0;
5258 float FARegNum = 0;
5259 for (const SCEV *BaseReg : F.BaseRegs) {
5260 if (UniqRegs.count(BaseReg))
5261 continue;
5262 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5263 if (isa<SCEVAddRecExpr>(BaseReg))
5264 FARegNum +=
5265 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5266 }
5267 if (const SCEV *ScaledReg = F.ScaledReg) {
5268 if (!UniqRegs.count(ScaledReg)) {
5269 FRegNum +=
5270 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5271 if (isa<SCEVAddRecExpr>(ScaledReg))
5272 FARegNum +=
5273 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5274 }
5275 }
5276 if (FMinRegNum > FRegNum ||
5277 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5278 FMinRegNum = FRegNum;
5279 FMinARegNum = FARegNum;
5280 MinIdx = i;
5281 }
5282 }
5283 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5284 dbgs() << " with min reg num " << FMinRegNum << '\n');
5285 if (MinIdx != 0)
5286 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5287 while (LU.Formulae.size() != 1) {
5288 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5289 dbgs() << '\n');
5290 LU.Formulae.pop_back();
5291 }
5292 LU.RecomputeRegs(LUIdx, RegUses);
5293 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5294 Formula &F = LU.Formulae[0];
5295 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5296 // When we choose the formula, the regs become unique.
5297 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5298 if (F.ScaledReg)
5299 UniqRegs.insert(F.ScaledReg);
5300 }
5301 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5302}
5303
5304 // Check if Best and Reg are SCEVs separated by a constant amount C, and if
5305 // so, whether the addressing offset +C would be legal where the negative
5306 // offset -C is not.
5307 static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5308 ScalarEvolution &SE, const SCEV *Best,
5309 const SCEV *Reg,
5310 MemAccessTy AccessType) {
5311 if (Best->getType() != Reg->getType() ||
5312 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5313 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5314 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5315 return false;
5316 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5317 if (!Diff)
5318 return false;
5319
5320 return TTI.isLegalAddressingMode(
5321 AccessType.MemTy, /*BaseGV=*/nullptr,
5322 /*BaseOffset=*/Diff->getSExtValue(),
5323 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5324 !TTI.isLegalAddressingMode(
5325 AccessType.MemTy, /*BaseGV=*/nullptr,
5326 /*BaseOffset=*/-Diff->getSExtValue(),
5327 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5328}
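// For intuition (hypothetical example): with Best = {(4 + %x),+,4} and
// Reg = {%x,+,4} the constant difference is 4; if the target folds [reg + 4]
// but not [reg - 4] into its addressing modes, this helper returns true and
// the caller prefers Reg as the shared base register.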
5329
5330/// Pick a register which seems likely to be profitable, and then in any use
5331/// which has any reference to that register, delete all formulae which do not
5332/// reference that register.
5333void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5334 // With all other options exhausted, loop until the system is simple
5335 // enough to handle.
5336 SmallPtrSet<const SCEV *, 4> Taken;
5337 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5338 // Ok, we have too many formulae on our hands to conveniently handle.
5339 // Use a rough heuristic to thin out the list.
5340 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5341
5342 // Pick the register which is used by the most LSRUses, which is likely
5343 // to be a good reuse register candidate.
5344 const SCEV *Best = nullptr;
5345 unsigned BestNum = 0;
5346 for (const SCEV *Reg : RegUses) {
5347 if (Taken.count(Reg))
5348 continue;
5349 if (!Best) {
5350 Best = Reg;
5351 BestNum = RegUses.getUsedByIndices(Reg).count();
5352 } else {
5353 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5354 if (Count > BestNum) {
5355 Best = Reg;
5356 BestNum = Count;
5357 }
5358
5359 // If the scores are the same, but the Reg is simpler for the target
5360 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5361 // handle +C but not -C), opt for the simpler formula.
5362 if (Count == BestNum) {
5363 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5364 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5365 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5366 Uses[LUIdx].AccessTy)) {
5367 Best = Reg;
5368 BestNum = Count;
5369 }
5370 }
5371 }
5372 }
5373 assert(Best && "Failed to find best LSRUse candidate");
5374
5375 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5376 << " will yield profitable reuse.\n");
5377 Taken.insert(Best);
5378
5379 // In any use with formulae which references this register, delete formulae
5380 // which don't reference it.
5381 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5382 LSRUse &LU = Uses[LUIdx];
5383 if (!LU.Regs.count(Best)) continue;
5384
5385 bool Any = false;
5386 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5387 Formula &F = LU.Formulae[i];
5388 if (!F.referencesReg(Best)) {
5389 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5390 LU.DeleteFormula(F);
5391 --e;
5392 --i;
5393 Any = true;
5394 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5395 continue;
5396 }
5397 }
5398
5399 if (Any)
5400 LU.RecomputeRegs(LUIdx, RegUses);
5401 }
5402
5403 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5404 }
5405}
5406
5407/// If there are an extraordinary number of formulae to choose from, use some
5408/// rough heuristics to prune down the number of formulae. This keeps the main
5409/// solver from taking an extraordinary amount of time in some worst-case
5410/// scenarios.
5411void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5412 NarrowSearchSpaceByDetectingSupersets();
5413 NarrowSearchSpaceByCollapsingUnrolledCode();
5414 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5415 if (FilterSameScaledReg)
5416 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5417 NarrowSearchSpaceByFilterPostInc();
5418 if (LSRExpNarrow)
5419 NarrowSearchSpaceByDeletingCostlyFormulas();
5420 else
5421 NarrowSearchSpaceByPickingWinnerRegs();
5422}
5423
5424/// This is the recursive solver.
5425void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5426 Cost &SolutionCost,
5427 SmallVectorImpl<const Formula *> &Workspace,
5428 const Cost &CurCost,
5429 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5430 DenseSet<const SCEV *> &VisitedRegs) const {
5431 // Some ideas:
5432 // - prune more:
5433 // - use more aggressive filtering
5434 // - sort the formula so that the most profitable solutions are found first
5435 // - sort the uses too
5436 // - search faster:
5437 // - don't compute a cost, and then compare. compare while computing a cost
5438 // and bail early.
5439 // - track register sets with SmallBitVector
5440
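// Note (annotation): the recursion below is a branch-and-bound search.
// Uses[Workspace.size()] is the next use to decide; each of its formulae is
// costed on top of CurCost, branches whose running cost already exceeds
// SolutionCost are pruned, and a complete assignment that beats SolutionCost
// becomes the new best Solution.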
5441 const LSRUse &LU = Uses[Workspace.size()];
5442
5443 // If this use references any register that's already a part of the
5444 // in-progress solution, consider it a requirement that a formula must
5445 // reference that register in order to be considered. This prunes out
5446 // unprofitable searching.
5447 SmallSetVector<const SCEV *, 4> ReqRegs;
5448 for (const SCEV *S : CurRegs)
5449 if (LU.Regs.count(S))
5450 ReqRegs.insert(S);
5451
5452 SmallPtrSet<const SCEV *, 16> NewRegs;
5453 Cost NewCost(L, SE, TTI, AMK);
5454 for (const Formula &F : LU.Formulae) {
5455 // Ignore formulae which may not be ideal in terms of register reuse of
5456 // ReqRegs. The formula should use all required registers before
5457 // introducing new ones.
5458 // This can sometimes (notably when trying to favour postinc) lead to
5459 // sub-optimal decisions. In those cases it is best left to the cost
5460 // modelling to get right.
5461 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5462 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5463 for (const SCEV *Reg : ReqRegs) {
5464 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5465 is_contained(F.BaseRegs, Reg)) {
5466 --NumReqRegsToFind;
5467 if (NumReqRegsToFind == 0)
5468 break;
5469 }
5470 }
5471 if (NumReqRegsToFind != 0) {
5472 // If none of the formulae satisfied the required registers, then we could
5473 // clear ReqRegs and try again. Currently, we simply give up in this case.
5474 continue;
5475 }
5476 }
5477
5478 // Evaluate the cost of the current formula. If it's already worse than
5479 // the current best, prune the search at that point.
5480 NewCost = CurCost;
5481 NewRegs = CurRegs;
5482 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5483 if (NewCost.isLess(SolutionCost)) {
5484 Workspace.push_back(&F);
5485 if (Workspace.size() != Uses.size()) {
5486 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5487 NewRegs, VisitedRegs);
5488 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5489 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5490 } else {
5491 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5492 dbgs() << ".\nRegs:\n";
5493 for (const SCEV *S : NewRegs) dbgs()
5494 << "- " << *S << "\n";
5495 dbgs() << '\n');
5496
5497 SolutionCost = NewCost;
5498 Solution = Workspace;
5499 }
5500 Workspace.pop_back();
5501 }
5502 }
5503}
5504
5505/// Choose one formula from each use. Return the results in the given Solution
5506/// vector.
5507void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5508 SmallVector<const Formula *, 8> Workspace;
5509 Cost SolutionCost(L, SE, TTI, AMK);
5510 SolutionCost.Lose();
5511 Cost CurCost(L, SE, TTI, AMK);
5512 SmallPtrSet<const SCEV *, 16> CurRegs;
5513 DenseSet<const SCEV *> VisitedRegs;
5514 Workspace.reserve(Uses.size());
5515
5516 // SolveRecurse does all the work.
5517 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5518 CurRegs, VisitedRegs);
5519 if (Solution.empty()) {
5520 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5521 return;
5522 }
5523
5524 // Ok, we've now made all our decisions.
5525 LLVM_DEBUG(dbgs() << "\n"
5526 "The chosen solution requires ";
5527 SolutionCost.print(dbgs()); dbgs() << ":\n";
5528 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5529 dbgs() << " ";
5530 Uses[i].print(dbgs());
5531 dbgs() << "\n"
5532 " ";
5533 Solution[i]->print(dbgs());
5534 dbgs() << '\n';
5535 });
5536
5537 assert(Solution.size() == Uses.size() && "Malformed solution!");
5538
5539 const bool EnableDropUnprofitableSolution = [&] {
5540 switch (AllowDropSolutionIfLessProfitable) {
5541 case cl::BOU_TRUE:
5542 return true;
5543 case cl::BOU_FALSE:
5544 return false;
5545 case cl::BOU_UNSET:
5546 return TTI.shouldDropLSRSolutionIfLessProfitable();
5547 }
5548 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5549 }();
5550
5551 if (BaselineCost.isLess(SolutionCost)) {
5552 if (!EnableDropUnprofitableSolution)
5553 LLVM_DEBUG(
5554 dbgs() << "Baseline is more profitable than chosen solution, "
5555 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5556 else {
5557 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5558 "solution, dropping LSR solution.\n";);
5559 Solution.clear();
5560 }
5561 }
5562}
5563
5564 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
5565 /// far as we can go while still being dominated by the input positions. This
5566 /// helps canonicalize the insert position, which encourages sharing.
5567 BasicBlock::iterator
5568 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5569 const SmallVectorImpl<Instruction *> &Inputs)
5570 const {
5571 Instruction *Tentative = &*IP;
5572 while (true) {
5573 bool AllDominate = true;
5574 Instruction *BetterPos = nullptr;
5575 // Don't bother attempting to insert before a catchswitch; its basic block
5576 // cannot have other non-PHI instructions.
5577 if (isa<CatchSwitchInst>(Tentative))
5578 return IP;
5579
5580 for (Instruction *Inst : Inputs) {
5581 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5582 AllDominate = false;
5583 break;
5584 }
5585 // Attempt to find an insert position in the middle of the block,
5586 // instead of at the end, so that it can be used for other expansions.
5587 if (Tentative->getParent() == Inst->getParent() &&
5588 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5589 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5590 }
5591 if (!AllDominate)
5592 break;
5593 if (BetterPos)
5594 IP = BetterPos->getIterator();
5595 else
5596 IP = Tentative->getIterator();
5597
5598 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5599 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5600
5601 BasicBlock *IDom;
5602 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5603 if (!Rung) return IP;
5604 Rung = Rung->getIDom();
5605 if (!Rung) return IP;
5606 IDom = Rung->getBlock();
5607
5608 // Don't climb into a loop though.
5609 const Loop *IDomLoop = LI.getLoopFor(IDom);
5610 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5611 if (IDomDepth <= IPLoopDepth &&
5612 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5613 break;
5614 }
5615
5616 Tentative = IDom->getTerminator();
5617 }
5618
5619 return IP;
5620}
5621
5622/// Determine an input position which will be dominated by the operands and
5623/// which will dominate the result.
5624BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5625 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5626 // Collect some instructions which must be dominated by the
5627 // expanding replacement. These must be dominated by any operands that
5628 // will be required in the expansion.
5629 SmallVector<Instruction *, 4> Inputs;
5630 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5631 Inputs.push_back(I);
5632 if (LU.Kind == LSRUse::ICmpZero)
5633 if (Instruction *I =
5634 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5635 Inputs.push_back(I);
5636 if (LF.PostIncLoops.count(L)) {
5637 if (LF.isUseFullyOutsideLoop(L))
5638 Inputs.push_back(L->getLoopLatch()->getTerminator());
5639 else
5640 Inputs.push_back(IVIncInsertPos);
5641 }
5642 // The expansion must also be dominated by the increment positions of any
5643 // loops for which it is using post-inc mode.
5644 for (const Loop *PIL : LF.PostIncLoops) {
5645 if (PIL == L) continue;
5646
5647 // Be dominated by the loop exit.
5648 SmallVector<BasicBlock *, 4> ExitingBlocks;
5649 PIL->getExitingBlocks(ExitingBlocks);
5650 if (!ExitingBlocks.empty()) {
5651 BasicBlock *BB = ExitingBlocks[0];
5652 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5653 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5654 Inputs.push_back(BB->getTerminator());
5655 }
5656 }
5657
5658 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5659 && !isa<DbgInfoIntrinsic>(LowestIP) &&
5660 "Insertion point must be a normal instruction");
5661
5662 // Then, climb up the immediate dominator tree as far as we can go while
5663 // still being dominated by the input positions.
5664 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5665
5666 // Don't insert instructions before PHI nodes.
5667 while (isa<PHINode>(IP)) ++IP;
5668
5669 // Ignore landingpad instructions.
5670 while (IP->isEHPad()) ++IP;
5671
5672 // Ignore debug intrinsics.
5673 while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5674
5675 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5676 // IP consistent across expansions and allows the previously inserted
5677 // instructions to be reused by subsequent expansion.
5678 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5679 ++IP;
5680
5681 return IP;
5682}
5683
5684/// Emit instructions for the leading candidate expression for this LSRUse (this
5685/// is called "expanding").
5686Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5687 const Formula &F, BasicBlock::iterator IP,
5688 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5689 if (LU.RigidFormula)
5690 return LF.OperandValToReplace;
5691
5692 // Determine an input position which will be dominated by the operands and
5693 // which will dominate the result.
5694 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5695 Rewriter.setInsertPoint(&*IP);
5696
5697 // Inform the Rewriter if we have a post-increment use, so that it can
5698 // perform an advantageous expansion.
5699 Rewriter.setPostInc(LF.PostIncLoops);
5700
5701 // This is the type that the user actually needs.
5702 Type *OpTy = LF.OperandValToReplace->getType();
5703 // This will be the type that we'll initially expand to.
5704 Type *Ty = F.getType();
5705 if (!Ty)
5706 // No type known; just expand directly to the ultimate type.
5707 Ty = OpTy;
5708 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5709 // Expand directly to the ultimate type if it's the right size.
5710 Ty = OpTy;
5711 // This is the type to do integer arithmetic in.
5712 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5713
5714 // Build up a list of operands to add together to form the full base.
5715 SmallVector<const SCEV *, 8> Ops;
5716
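// Note (annotation): the value assembled below is essentially
//   BaseReg1 + ... + BaseRegN + Scale * ScaledReg + BaseGV + BaseOffset
//     + UnfoldedOffset
// with each piece expanded through SCEVExpander; the intermediate flushes of
// Ops only exist to keep SCEVExpander from hoisting address-mode components
// away from the use.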
5717 // Expand the BaseRegs portion.
5718 for (const SCEV *Reg : F.BaseRegs) {
5719 assert(!Reg->isZero() && "Zero allocated in a base register!");
5720
5721 // If we're expanding for a post-inc user, make the post-inc adjustment.
5722 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5723 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5724 }
5725
5726 // Expand the ScaledReg portion.
5727 Value *ICmpScaledV = nullptr;
5728 if (F.Scale != 0) {
5729 const SCEV *ScaledS = F.ScaledReg;
5730
5731 // If we're expanding for a post-inc user, make the post-inc adjustment.
5732 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5733 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5734
5735 if (LU.Kind == LSRUse::ICmpZero) {
5736 // Expand ScaledReg as if it were part of the base regs.
5737 if (F.Scale == 1)
5738 Ops.push_back(
5739 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5740 else {
5741 // An interesting way of "folding" with an icmp is to use a negated
5742 // scale, which we'll implement by inserting it into the other operand
5743 // of the icmp.
5744 assert(F.Scale == -1 &&
5745 "The only scale supported by ICmpZero uses is -1!");
5746 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5747 }
5748 } else {
5749 // Otherwise just expand the scaled register and an explicit scale,
5750 // which is expected to be matched as part of the address.
5751
5752 // Flush the operand list to suppress SCEVExpander hoisting of address
5753 // modes, unless the addressing mode will not be folded.
5754 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5755 isAMCompletelyFolded(TTI, LU, F)) {
5756 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5757 Ops.clear();
5758 Ops.push_back(SE.getUnknown(FullV));
5759 }
5760 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5761 if (F.Scale != 1)
5762 ScaledS =
5763 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5764 Ops.push_back(ScaledS);
5765 }
5766 }
5767
5768 // Expand the GV portion.
5769 if (F.BaseGV) {
5770 // Flush the operand list to suppress SCEVExpander hoisting.
5771 if (!Ops.empty()) {
5772 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5773 Ops.clear();
5774 Ops.push_back(SE.getUnknown(FullV));
5775 }
5776 Ops.push_back(SE.getUnknown(F.BaseGV));
5777 }
5778
5779 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5780 // unfolded offsets. LSR assumes they both live next to their uses.
5781 if (!Ops.empty()) {
5782 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5783 Ops.clear();
5784 Ops.push_back(SE.getUnknown(FullV));
5785 }
5786
5787 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5788 // out at this point, or should we generate a SCEV adding together mixed
5789 // offsets?
5790 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5791 "Expanding mismatched offsets\n");
5792 // Expand the immediate portion.
5793 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5794 if (Offset.isNonZero()) {
5795 if (LU.Kind == LSRUse::ICmpZero) {
5796 // The other interesting way of "folding" with an ICmpZero is to use a
5797 // negated immediate.
5798 if (!ICmpScaledV)
5799 ICmpScaledV =
5800 ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5801 else {
5802 Ops.push_back(SE.getUnknown(ICmpScaledV));
5803 ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5804 }
5805 } else {
5806 // Just add the immediate values. These again are expected to be matched
5807 // as part of the address.
5808 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5809 }
5810 }
5811
5812 // Expand the unfolded offset portion.
5813 Immediate UnfoldedOffset = F.UnfoldedOffset;
5814 if (UnfoldedOffset.isNonZero()) {
5815 // Just add the immediate values.
5816 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5817 }
5818
5819 // Emit instructions summing all the operands.
5820 const SCEV *FullS = Ops.empty() ?
5821 SE.getConstant(IntTy, 0) :
5822 SE.getAddExpr(Ops);
5823 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5824
5825 // We're done expanding now, so reset the rewriter.
5826 Rewriter.clearPostInc();
5827
5828 // An ICmpZero Formula represents an ICmp which we're handling as a
5829 // comparison against zero. Now that we've expanded an expression for that
5830 // form, update the ICmp's other operand.
5831 if (LU.Kind == LSRUse::ICmpZero) {
5832 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5833 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5834 DeadInsts.emplace_back(OperandIsInstr);
5835 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5836 "a scale at the same time!");
5837 if (F.Scale == -1) {
5838 if (ICmpScaledV->getType() != OpTy) {
5839 Instruction *Cast = CastInst::Create(
5840 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5841 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5842 ICmpScaledV = Cast;
5843 }
5844 CI->setOperand(1, ICmpScaledV);
5845 } else {
5846 // A scale of 1 means that the scale has been expanded as part of the
5847 // base regs.
5848 assert((F.Scale == 0 || F.Scale == 1) &&
5849 "ICmp does not support folding a global value and "
5850 "a scale at the same time!");
5851 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5852 -(uint64_t)Offset.getFixedValue());
5853 if (C->getType() != OpTy) {
5854 C = ConstantFoldCastOperand(
5855 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5856 CI->getDataLayout());
5857 assert(C && "Cast of ConstantInt should have folded");
5858 }
5859
5860 CI->setOperand(1, C);
5861 }
5862 }
5863
5864 return FullV;
5865}
5866
5867/// Helper for Rewrite. PHI nodes are special because the use of their operands
5868/// effectively happens in their predecessor blocks, so the expression may need
5869/// to be expanded in multiple places.
5870void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
5871 const LSRFixup &LF, const Formula &F,
5872 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5873 DenseMap<BasicBlock *, Value *> Inserted;
5874
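// For intuition (hypothetical IR): given a PHI such as
//   %p = phi [ %v, %pred1 ], [ %v, %pred2 ]
// the replacement for %v must be materialized at the end of %pred1 and %pred2
// rather than next to the PHI, which is why the loop below expands once per
// incoming block and caches the result in Inserted.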
5875 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5876 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5877 bool needUpdateFixups = false;
5878 BasicBlock *BB = PN->getIncomingBlock(i);
5879
5880 // If this is a critical edge, split the edge so that we do not insert
5881 // the code on all predecessor/successor paths. We do this unless this
5882 // is the canonical backedge for this loop, which complicates post-inc
5883 // users.
5884 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5885 !isa<IndirectBrInst>(BB->getTerminator()) &&
5886 !isa<CatchSwitchInst>(BB->getTerminator())) {
5887 BasicBlock *Parent = PN->getParent();
5888 Loop *PNLoop = LI.getLoopFor(Parent);
5889 if (!PNLoop || Parent != PNLoop->getHeader()) {
5890 // Split the critical edge.
5891 BasicBlock *NewBB = nullptr;
5892 if (!Parent->isLandingPad()) {
5893 NewBB =
5894 SplitCriticalEdge(BB, Parent,
5895 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5896 .setMergeIdenticalEdges()
5897 .setKeepOneInputPHIs());
5898 } else {
5899 SmallVector<BasicBlock*, 2> NewBBs;
5900 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5901 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5902 NewBB = NewBBs[0];
5903 }
5904 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5905 // phi predecessors are identical. The simple thing to do is skip
5906 // splitting in this case rather than complicate the API.
5907 if (NewBB) {
5908 // If PN is outside of the loop and BB is in the loop, we want to
5909 // move the block to be immediately before the PHI block, not
5910 // immediately after BB.
5911 if (L->contains(BB) && !L->contains(PN))
5912 NewBB->moveBefore(PN->getParent());
5913
5914 // Splitting the edge can reduce the number of PHI entries we have.
5915 e = PN->getNumIncomingValues();
5916 BB = NewBB;
5917 i = PN->getBasicBlockIndex(BB);
5918
5919 needUpdateFixups = true;
5920 }
5921 }
5922 }
5923
5924 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5925 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5926 if (!Pair.second)
5927 PN->setIncomingValue(i, Pair.first->second);
5928 else {
5929 Value *FullV =
5930 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5931
5932 // If this is reuse-by-noop-cast, insert the noop cast.
5933 Type *OpTy = LF.OperandValToReplace->getType();
5934 if (FullV->getType() != OpTy)
5935 FullV = CastInst::Create(
5936 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5937 LF.OperandValToReplace->getType(), "tmp",
5938 BB->getTerminator()->getIterator());
5939
5940 // If the incoming block for this value is not in the loop, it means the
5941 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5942 // the inserted value.
5943 if (auto *I = dyn_cast<Instruction>(FullV))
5944 if (L->contains(I) && !L->contains(BB))
5945 InsertedNonLCSSAInsts.insert(I);
5946
5947 PN->setIncomingValue(i, FullV);
5948 Pair.first->second = FullV;
5949 }
5950
5951 // If LSR split a critical edge and the phi node has other pending
5952 // fixup operands, we need to update those pending fixups. Otherwise
5953 // formulae will not be implemented completely and some instructions
5954 // will not be eliminated.
5955 if (needUpdateFixups) {
5956 for (LSRUse &LU : Uses)
5957 for (LSRFixup &Fixup : LU.Fixups)
5958 // If the fixup is supposed to rewrite some operand in the phi
5959 // that was just updated, it may already have been moved to
5960 // another phi node. Such a fixup requires an update.
5961 if (Fixup.UserInst == PN) {
5962 // Check if the operand we try to replace still exists in the
5963 // original phi.
5964 bool foundInOriginalPHI = false;
5965 for (const auto &val : PN->incoming_values())
5966 if (val == Fixup.OperandValToReplace) {
5967 foundInOriginalPHI = true;
5968 break;
5969 }
5970
5971 // If the fixup operand was found in the original PHI - nothing to do.
5972 if (foundInOriginalPHI)
5973 continue;
5974
5975 // Otherwise it might have been moved to another PHI and requires an
5976 // update. If the fixup operand is not found in any of the incoming
5977 // blocks, that means we have already rewritten it - nothing to do.
5978 for (const auto &Block : PN->blocks())
5979 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5980 ++I) {
5981 PHINode *NewPN = cast<PHINode>(I);
5982 for (const auto &val : NewPN->incoming_values())
5983 if (val == Fixup.OperandValToReplace)
5984 Fixup.UserInst = NewPN;
5985 }
5986 }
5987 }
5988 }
5989}
5990
5991/// Emit instructions for the leading candidate expression for this LSRUse (this
5992/// is called "expanding"), and update the UserInst to reference the newly
5993/// expanded value.
5994void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5995 const Formula &F,
5996 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5997 // First, find an insertion point that dominates UserInst. For PHI nodes,
5998 // find the nearest block which dominates all the relevant uses.
5999 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6000 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6001 } else {
6002 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6003
6004 // If this is reuse-by-noop-cast, insert the noop cast.
6005 Type *OpTy = LF.OperandValToReplace->getType();
6006 if (FullV->getType() != OpTy) {
6007 Instruction *Cast =
6008 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6009 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6010 FullV = Cast;
6011 }
6012
6013 // Update the user. ICmpZero is handled specially here (for now) because
6014 // Expand may have updated one of the operands of the icmp already, and
6015 // its new value may happen to be equal to LF.OperandValToReplace, in
6016 // which case doing replaceUsesOfWith leads to replacing both operands
6017 // with the same value. TODO: Reorganize this.
6018 if (LU.Kind == LSRUse::ICmpZero)
6019 LF.UserInst->setOperand(0, FullV);
6020 else
6021 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6022 }
6023
6024 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6025 DeadInsts.emplace_back(OperandIsInstr);
6026}
6027
6028 // Try to hoist the IVInc to the loop header if all IVInc users are in
6029 // the loop header. This helps the backend generate post-indexed loads and
6030 // stores when the latch block is different from the loop header block.
6031static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6032 const LSRUse &LU, Instruction *IVIncInsertPos,
6033 Loop *L) {
6034 if (LU.Kind != LSRUse::Address)
6035 return false;
6036
6037 // For now this code does the conservative optimization and only works for
6038 // the header block. Later we can hoist the IVInc to the block that
6039 // post-dominates all users.
6040 BasicBlock *LHeader = L->getHeader();
6041 if (IVIncInsertPos->getParent() == LHeader)
6042 return false;
6043
6044 if (!Fixup.OperandValToReplace ||
6045 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6046 Instruction *UI = cast<Instruction>(U);
6047 return UI->getParent() != LHeader;
6048 }))
6049 return false;
6050
6051 Instruction *I = Fixup.UserInst;
6052 Type *Ty = I->getType();
6053 return Ty->isIntegerTy() &&
6054 ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6055 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
6056}
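// For intuition (hypothetical example, target-dependent): with post-indexed
// addressing, a header-block pattern like
//   %v = load i64, ptr %addr
//   %addr.next = getelementptr i8, ptr %addr, i64 8
// can be selected as a single post-indexed load that updates the base register
// by +8 after the access, once the increment sits in the same block as the
// memory user.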
6057
6058/// Rewrite all the fixup locations with new values, following the chosen
6059/// solution.
6060void LSRInstance::ImplementSolution(
6061 const SmallVectorImpl<const Formula *> &Solution) {
6062 // Keep track of instructions we may have made dead, so that
6063 // we can remove them after we are done working.
6064 SmallVector<WeakTrackingVH, 16> DeadInsts;
6065
6066 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6067 for (const IVChain &Chain : IVChainVec) {
6068 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6069 Rewriter.setChainedPhi(PN);
6070 }
6071
6072 // Expand the new value definitions and update the users.
6073 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6074 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6075 Instruction *InsertPos =
6076 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6077 ? L->getHeader()->getTerminator()
6078 : IVIncInsertPos;
6079 Rewriter.setIVIncInsertPos(L, InsertPos);
6080 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6081 Changed = true;
6082 }
6083
6084 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6085 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6086
6087 for (const IVChain &Chain : IVChainVec) {
6088 GenerateIVChain(Chain, DeadInsts);
6089 Changed = true;
6090 }
6091
6092 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6093 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6094 ScalarEvolutionIVs.push_back(IV);
6095
6096 // Clean up after ourselves. This must be done before deleting any
6097 // instructions.
6098 Rewriter.clear();
6099
6100 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6101 &TLI, MSSAU);
6102
6103 // In our cost analysis above, we assume that each addrec consumes exactly
6104 // one register, and arrange to have increments inserted just before the
6105 // latch to maximize the chance this is true. However, if we reused
6106 // existing IVs, we now need to move the increments to match our
6107 // expectations. Otherwise, our cost modeling results in us having chosen
6108 // a non-optimal result for the actual schedule. (And yes, this
6109 // scheduling decision does impact later codegen.)
6110 for (PHINode &PN : L->getHeader()->phis()) {
6111 BinaryOperator *BO = nullptr;
6112 Value *Start = nullptr, *Step = nullptr;
6113 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6114 continue;
6115
6116 switch (BO->getOpcode()) {
6117 case Instruction::Sub:
6118 if (BO->getOperand(0) != &PN)
6119 // sub is non-commutative - match handling elsewhere in LSR
6120 continue;
6121 break;
6122 case Instruction::Add:
6123 break;
6124 default:
6125 continue;
6126 };
6127
6128 if (!isa<Constant>(Step))
6129 // If not a constant step, might increase register pressure
6130 // (We assume constants have been canonicalized to RHS)
6131 continue;
6132
6133 if (BO->getParent() == IVIncInsertPos->getParent())
6134 // Only bother moving across blocks. Isel can handle block local case.
6135 continue;
6136
6137 // Can we legally schedule inc at the desired point?
6138 if (!llvm::all_of(BO->uses(),
6139 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6140 continue;
6141 BO->moveBefore(IVIncInsertPos);
6142 Changed = true;
6143 }
6144
6145
6146}
6147
6148LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6149 DominatorTree &DT, LoopInfo &LI,
6150 const TargetTransformInfo &TTI, AssumptionCache &AC,
6151 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6152 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6153 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6154 ? PreferredAddresingMode
6155 : TTI.getPreferredAddressingMode(L, &SE)),
6156 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6157 BaselineCost(L, SE, TTI, AMK) {
6158 // If LoopSimplify form is not available, stay out of trouble.
6159 if (!L->isLoopSimplifyForm())
6160 return;
6161
6162 // If there's no interesting work to be done, bail early.
6163 if (IU.empty()) return;
6164
6165 // If there's too much analysis to be done, bail early. We won't be able to
6166 // model the problem anyway.
6167 unsigned NumUsers = 0;
6168 for (const IVStrideUse &U : IU) {
6169 if (++NumUsers > MaxIVUsers) {
6170 (void)U;
6171 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6172 << "\n");
6173 return;
6174 }
6175 // Bail out if we have a PHI on an EHPad that gets a value from a
6176 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6177 // no good place to stick any instructions.
6178 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6179 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
6180 if (isa<FuncletPadInst>(FirstNonPHI) ||
6181 isa<CatchSwitchInst>(FirstNonPHI))
6182 for (BasicBlock *PredBB : PN->blocks())
6183 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
6184 return;
6185 }
6186 }
6187
6188 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6189 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6190 dbgs() << ":\n");
6191
6192 // Configure SCEVExpander already now, so the correct mode is used for
6193 // isSafeToExpand() checks.
6194#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6195 Rewriter.setDebugType(DEBUG_TYPE);
6196#endif
6197 Rewriter.disableCanonicalMode();
6198 Rewriter.enableLSRMode();
6199
6200 // First, perform some low-level loop optimizations.
6201 OptimizeShadowIV();
6202 OptimizeLoopTermCond();
6203
6204 // If loop preparation eliminates all interesting IV users, bail.
6205 if (IU.empty()) return;
6206
6207 // Skip nested loops until we can model them better with formulae.
6208 if (!L->isInnermost()) {
6209 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6210 return;
6211 }
6212
6213 // Start collecting data and preparing for the solver.
6214 // If number of registers is not the major cost, we cannot benefit from the
6215 // current profitable chain optimization which is based on number of
6216 // registers.
6217 // FIXME: add profitable chain optimization for other kinds of major cost,
6218 // for example the number of instructions.
6219 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6220 CollectChains();
6221 CollectInterestingTypesAndFactors();
6222 CollectFixupsAndInitialFormulae();
6223 CollectLoopInvariantFixupsAndFormulae();
6224
6225 if (Uses.empty())
6226 return;
6227
6228 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6229 print_uses(dbgs()));
6230 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6231 BaselineCost.print(dbgs()); dbgs() << "\n");
6232
6233 // Now use the reuse data to generate a bunch of interesting ways
6234 // to formulate the values needed for the uses.
6235 GenerateAllReuseFormulae();
6236
6237 FilterOutUndesirableDedicatedRegisters();
6238 NarrowSearchSpaceUsingHeuristics();
6239
6240 SmallVector<const Formula *, 8> Solution;
6241 Solve(Solution);
6242
6243 // Release memory that is no longer needed.
6244 Factors.clear();
6245 Types.clear();
6246 RegUses.clear();
6247
6248 if (Solution.empty())
6249 return;
6250
6251#ifndef NDEBUG
6252 // Formulae should be legal.
6253 for (const LSRUse &LU : Uses) {
6254 for (const Formula &F : LU.Formulae)
6255 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6256 F) && "Illegal formula generated!");
6257 };
6258#endif
6259
6260 // Now that we've decided what we want, make it so.
6261 ImplementSolution(Solution);
6262}
6263
6264#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6265void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6266 if (Factors.empty() && Types.empty()) return;
6267
6268 OS << "LSR has identified the following interesting factors and types: ";
6269 bool First = true;
6270
6271 for (int64_t Factor : Factors) {
6272 if (!First) OS << ", ";
6273 First = false;
6274 OS << '*' << Factor;
6275 }
6276
6277 for (Type *Ty : Types) {
6278 if (!First) OS << ", ";
6279 First = false;
6280 OS << '(' << *Ty << ')';
6281 }
6282 OS << '\n';
6283}
6284
6285void LSRInstance::print_fixups(raw_ostream &OS) const {
6286 OS << "LSR is examining the following fixup sites:\n";
6287 for (const LSRUse &LU : Uses)
6288 for (const LSRFixup &LF : LU.Fixups) {
6289 OS << " ";
6290 LF.print(OS);
6291 OS << '\n';
6292 }
6293}
6294
6295void LSRInstance::print_uses(raw_ostream &OS) const {
6296 OS << "LSR is examining the following uses:\n";
6297 for (const LSRUse &LU : Uses) {
6298 OS << " ";
6299 LU.print(OS);
6300 OS << '\n';
6301 for (const Formula &F : LU.Formulae) {
6302 OS << " ";
6303 F.print(OS);
6304 OS << '\n';
6305 }
6306 }
6307}
6308
6309void LSRInstance::print(raw_ostream &OS) const {
6310 print_factors_and_types(OS);
6311 print_fixups(OS);
6312 print_uses(OS);
6313}
6314
6315LLVM_DUMP_METHOD void LSRInstance::dump() const {
6316 print(errs()); errs() << '\n';
6317}
6318#endif
6319
6320namespace {
6321
6322class LoopStrengthReduce : public LoopPass {
6323public:
6324 static char ID; // Pass ID, replacement for typeid
6325
6326 LoopStrengthReduce();
6327
6328private:
6329 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6330 void getAnalysisUsage(AnalysisUsage &AU) const override;
6331};
6332
6333} // end anonymous namespace
6334
6335LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6336 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6337}
6338
6339void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6340 // We split critical edges, so we change the CFG. However, we do update
6341 // many analyses if they are around.
6343
6353 // Requiring LoopSimplify a second time here prevents IVUsers from running
6354 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6360}
6361
6362namespace {
6363
6364/// Enables more convenient iteration over a DWARF expression vector.
6366ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6371 return {Begin, End};
6372}
6373
6374struct SCEVDbgValueBuilder {
6375 SCEVDbgValueBuilder() = default;
6376 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6377
6378 void clone(const SCEVDbgValueBuilder &Base) {
6379 LocationOps = Base.LocationOps;
6380 Expr = Base.Expr;
6381 }
6382
6383 void clear() {
6384 LocationOps.clear();
6385 Expr.clear();
6386 }
6387
6388 /// The DIExpression as we translate the SCEV.
6389 SmallVector<uint64_t, 6> Expr;
6390 /// The location ops of the DIExpression.
6391 SmallVector<Value *, 2> LocationOps;
6392
6393 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6394 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6395
6396 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6397 /// in the set of values referenced by the expression.
6398 void pushLocation(llvm::Value *V) {
6399 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6400 auto *It = llvm::find(LocationOps, V);
6401 unsigned ArgIndex = 0;
6402 if (It != LocationOps.end()) {
6403 ArgIndex = std::distance(LocationOps.begin(), It);
6404 } else {
6405 ArgIndex = LocationOps.size();
6406 LocationOps.push_back(V);
6407 }
6408 Expr.push_back(ArgIndex);
6409 }
6410
6411 void pushValue(const SCEVUnknown *U) {
6412 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6413 pushLocation(V);
6414 }
6415
6416 bool pushConst(const SCEVConstant *C) {
6417 if (C->getAPInt().getSignificantBits() > 64)
6418 return false;
6419 Expr.push_back(llvm::dwarf::DW_OP_consts);
6420 Expr.push_back(C->getAPInt().getSExtValue());
6421 return true;
6422 }
6423
6424 // Iterating the expression as DWARF ops is convenient when updating
6425 // DWARF_OP_LLVM_args.
6427 return ToDwarfOpIter(Expr);
6428 }
6429
6430 /// Several SCEV types are sequences of the same arithmetic operator applied
6431 /// to constants and values that may be extended or truncated.
6432 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6433 uint64_t DwarfOp) {
6434 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6435 "Expected arithmetic SCEV type");
6436 bool Success = true;
6437 unsigned EmitOperator = 0;
6438 for (const auto &Op : CommExpr->operands()) {
6439 Success &= pushSCEV(Op);
6440
6441 if (EmitOperator >= 1)
6442 pushOperator(DwarfOp);
6443 ++EmitOperator;
6444 }
6445 return Success;
6446 }
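// Example (illustrative sketch, not part of this pass): the loop above appends
// the operator after every operand except the first, producing the postfix
// (DWARF stack machine) form, e.g. an add of three operands becomes
// "a b DW_OP_plus c DW_OP_plus". A stand-alone sketch of that emission order,
// using std::string tokens in place of DWARF opcodes:
// \code
//   #include <string>
//   #include <vector>
//   std::vector<std::string> emitCommutative(const std::vector<std::string> &Ops,
//                                            const std::string &Op) {
//     std::vector<std::string> Out;
//     for (size_t I = 0; I < Ops.size(); ++I) {
//       Out.push_back(Ops[I]);
//       if (I >= 1)        // mirrors "EmitOperator >= 1" above
//         Out.push_back(Op);
//     }
//     return Out;          // {"a","b","c"}, "+"  ->  {"a","b","+","c","+"}
//   }
// \endcode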
6447
6448 // TODO: Identify and omit noop casts.
6449 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6450 const llvm::SCEV *Inner = C->getOperand(0);
6451 const llvm::Type *Type = C->getType();
6452 uint64_t ToWidth = Type->getIntegerBitWidth();
6453 bool Success = pushSCEV(Inner);
6454 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6455 IsSigned ? llvm::dwarf::DW_ATE_signed
6456 : llvm::dwarf::DW_ATE_unsigned};
6457 for (const auto &Op : CastOps)
6458 pushOperator(Op);
6459 return Success;
6460 }
6461
6462 // TODO: MinMax - although these haven't been encountered in the test suite.
6463 bool pushSCEV(const llvm::SCEV *S) {
6464 bool Success = true;
6465 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6466 Success &= pushConst(StartInt);
6467
6468 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6469 if (!U->getValue())
6470 return false;
6471 pushLocation(U->getValue());
6472
6473 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6474 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6475
6476 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6477 Success &= pushSCEV(UDiv->getLHS());
6478 Success &= pushSCEV(UDiv->getRHS());
6479 pushOperator(llvm::dwarf::DW_OP_div);
6480
6481 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6482 // Assert if a new and unknown SCEVCastExpr type is encountered.
6483 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6484 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6485 "Unexpected cast type in SCEV.");
6486 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6487
6488 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6489 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6490
6491 } else if (isa<SCEVAddRecExpr>(S)) {
6492 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6493 // unsupported.
6494 return false;
6495
6496 } else {
6497 return false;
6498 }
6499 return Success;
6500 }
6501
6502 /// Return true if the combination of arithmetic operator and underlying
6503 /// SCEV constant value is an identity function.
6504 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6505 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6506 if (C->getAPInt().getSignificantBits() > 64)
6507 return false;
6508 int64_t I = C->getAPInt().getSExtValue();
6509 switch (Op) {
6510 case llvm::dwarf::DW_OP_plus:
6511 case llvm::dwarf::DW_OP_minus:
6512 return I == 0;
6513 case llvm::dwarf::DW_OP_mul:
6514 case llvm::dwarf::DW_OP_div:
6515 return I == 1;
6516 }
6517 }
6518 return false;
6519 }
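// Example (illustrative sketch, not part of this pass): adding or subtracting
// a constant 0, and multiplying or dividing by a constant 1, leave the value
// unchanged, so the builder skips emitting those ops. A minimal stand-alone
// version of the same rule (the enum names are hypothetical):
// \code
//   #include <cstdint>
//   enum class ArithOp { Plus, Minus, Mul, Div };
//   bool isIdentity(ArithOp Op, int64_t C) {
//     switch (Op) {
//     case ArithOp::Plus:
//     case ArithOp::Minus: return C == 0;
//     case ArithOp::Mul:
//     case ArithOp::Div:   return C == 1;
//     }
//     return false;
//   }
//   // isIdentity(ArithOp::Plus, 0) == true, isIdentity(ArithOp::Mul, 4) == false
// \endcode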
6520
6521 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6522 /// builder's expression stack. The stack should already contain an
6523 /// expression for the iteration count, so that it can be multiplied by
6524 /// the stride and added to the start.
6525 /// Components of the expression are omitted if they are an identity function.
6526 /// Chain (non-affine) SCEVs are not supported.
6527 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6528 assert(SAR.isAffine() && "Expected affine SCEV");
6529 // TODO: Is this check needed?
6530 if (isa<SCEVAddRecExpr>(SAR.getStart()))
6531 return false;
6532
6533 const SCEV *Start = SAR.getStart();
6534 const SCEV *Stride = SAR.getStepRecurrence(SE);
6535
6536 // Skip pushing arithmetic noops.
6537 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6538 if (!pushSCEV(Stride))
6539 return false;
6540 pushOperator(llvm::dwarf::DW_OP_mul);
6541 }
6542 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6543 if (!pushSCEV(Start))
6544 return false;
6545 pushOperator(llvm::dwarf::DW_OP_plus);
6546 }
6547 return true;
6548 }
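// Example (illustrative sketch, not part of this pass): for an affine SCEV
// {start,+,stride}, the ops emitted above evaluate, on top of an iteration
// count already on the expression stack, to count * stride + start. With
// hypothetical numbers:
// \code
//   #include <cstdint>
//   // Stack effect of: <count>, stride, DW_OP_mul, start, DW_OP_plus.
//   int64_t valueAtIteration(int64_t Count, int64_t Start, int64_t Stride) {
//     return Count * Stride + Start; // e.g. {8,+,4} at iteration 3 -> 20
//   }
// \endcode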
6549
6550 /// Create an expression that is an offset from a value (usually the IV).
6551 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6552 pushLocation(OffsetValue);
6553 DIExpression::appendOffset(Expr, Offset);
6554 LLVM_DEBUG(
6555 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6556 << std::to_string(Offset) << "\n");
6557 }
6558
6559 /// Combine a translation of the SCEV and the IV to create an expression that
6560 /// recovers a location's value.
6561 /// Returns true if an expression was created.
6562 bool createIterCountExpr(const SCEV *S,
6563 const SCEVDbgValueBuilder &IterationCount,
6564 ScalarEvolution &SE) {
6565 // SCEVs for SSA values are most frequently of the form
6566 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6567 // This is because %a is a PHI node that is not the IV. However, these
6568 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6569 // so it's not expected that this point will be reached.
6570 if (!isa<SCEVAddRecExpr>(S))
6571 return false;
6572
6573 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6574 << '\n');
6575
6576 const auto *Rec = cast<SCEVAddRecExpr>(S);
6577 if (!Rec->isAffine())
6578 return false;
6579
6580 if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6581 return false;
6582
6583 // Initialise a new builder with the iteration count expression. In
6584 // combination with the value's SCEV this enables recovery.
6585 clone(IterationCount);
6586 if (!SCEVToValueExpr(*Rec, SE))
6587 return false;
6588
6589 return true;
6590 }
6591
6592 /// Convert the SCEV of the LSR-generated induction variable into a
6593 /// DIExpression that recovers the loop's iteration count. The stack should
6594 /// already contain the induction variable's location; the emitted ops
6595 /// subtract the start and divide by the stride.
6596 /// Components of the expression are omitted if they are an identity function.
6597 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6598 ScalarEvolution &SE) {
6599 assert(SAR.isAffine() && "Expected affine SCEV");
6600 if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6601 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6602 << SAR << '\n');
6603 return false;
6604 }
6605 const SCEV *Start = SAR.getStart();
6606 const SCEV *Stride = SAR.getStepRecurrence(SE);
6607
6608 // Skip pushing arithmetic noops.
6609 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6610 if (!pushSCEV(Start))
6611 return false;
6612 pushOperator(llvm::dwarf::DW_OP_minus);
6613 }
6614 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6615 if (!pushSCEV(Stride))
6616 return false;
6617 pushOperator(llvm::dwarf::DW_OP_div);
6618 }
6619 return true;
6620 }
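// Example (illustrative sketch, not part of this pass): this is the inverse of
// SCEVToValueExpr. Starting from the induction variable's current value
// (pushed earlier via pushLocation), the emitted ops recover the iteration
// count as (IV - start) / stride. With hypothetical numbers:
// \code
//   #include <cstdint>
//   // Stack effect of: <IV>, start, DW_OP_minus, stride, DW_OP_div.
//   int64_t iterationCount(int64_t IV, int64_t Start, int64_t Stride) {
//     return (IV - Start) / Stride; // e.g. IV = 20 for {8,+,4} -> (20-8)/4 = 3
//   }
// \endcode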
6621
6622 // Append the current expression and locations to a location list and an
6623 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6624 // the locations already present in the destination list.
6625 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6626 SmallVectorImpl<Value *> &DestLocations) {
6627 assert(!DestLocations.empty() &&
6628 "Expected the locations vector to contain the IV");
6629 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6630 // modified to account for the locations already in the destination vector.
6631 // All builders contain the IV as the first location op.
6632 assert(!LocationOps.empty() &&
6633 "Expected the location ops to contain the IV.");
6634 // DestIndexMap[n] contains the index in DestLocations for the nth
6635 // location in this SCEVDbgValueBuilder.
6636 SmallVector<uint64_t, 2> DestIndexMap;
6637 for (const auto &Op : LocationOps) {
6638 auto It = find(DestLocations, Op);
6639 if (It != DestLocations.end()) {
6640 // Location already exists in DestLocations, reuse existing ArgIndex.
6641 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6642 continue;
6643 }
6644 // Location is not in DestLocations, add it.
6645 DestIndexMap.push_back(DestLocations.size());
6646 DestLocations.push_back(Op);
6647 }
6648
6649 for (const auto &Op : expr_ops()) {
6650 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6651 Op.appendToVector(DestExpr);
6652 continue;
6653 }
6654
6655 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6656 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6657 // DestIndexMap[n] contains its new index in DestLocations.
6658 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6659 DestExpr.push_back(NewIndex);
6660 }
6661 }
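// Example (illustrative sketch, not part of this pass): when one builder's
// expression is appended to another's, each DW_OP_LLVM_arg index must be
// rewritten against the merged location list. A stand-alone sketch of the
// remapping step with hypothetical location names:
// \code
//   #include <algorithm>
//   #include <cstdint>
//   #include <string>
//   #include <vector>
//   // Merge Src locations into Dest and return, for each Src index, its new
//   // index in Dest; e.g. Dest = {"%iv", "%a"}, Src = {"%iv", "%b"} -> {0, 2}.
//   std::vector<uint64_t> remapArgIndexes(std::vector<std::string> &Dest,
//                                         const std::vector<std::string> &Src) {
//     std::vector<uint64_t> Map;
//     for (const std::string &Loc : Src) {
//       auto It = std::find(Dest.begin(), Dest.end(), Loc);
//       if (It == Dest.end()) {
//         Dest.push_back(Loc);
//         It = Dest.end() - 1;
//       }
//       Map.push_back(static_cast<uint64_t>(It - Dest.begin()));
//     }
//     return Map;
//   }
// \endcode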
6662};
6663
6664/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6665/// and DIExpression.
6666struct DVIRecoveryRec {
6667 DVIRecoveryRec(DbgValueInst *DbgValue)
6668 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6669 HadLocationArgList(false) {}
6670 DVIRecoveryRec(DbgVariableRecord *DVR)
6671 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6672
6673 PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6674 DIExpression *Expr;
6675 bool HadLocationArgList;
6676 SmallVector<WeakVH, 2> LocationOps;
6677 SmallVector<const llvm::SCEV *, 2> SCEVs;
6678 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6679
6680 void clear() {
6681 for (auto &RE : RecoveryExprs)
6682 RE.reset();
6683 RecoveryExprs.clear();
6684 }
6685
6686 ~DVIRecoveryRec() { clear(); }
6687};
6688} // namespace
6689
6690/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6691/// This helps in determining if a DIArglist is necessary or can be omitted from
6692/// the dbg.value.
6693static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6694 auto expr_ops = ToDwarfOpIter(Expr);
6695 unsigned Count = 0;
6696 for (auto Op : expr_ops)
6697 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6698 Count++;
6699 return Count;
6700}
6701
6702/// Overwrites DVI with the location and Ops as the DIExpression. This will
6703/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6704/// because a DIArglist is not created for the first argument of the dbg.value.
6705template <typename T>
6706static void updateDVIWithLocation(T &DbgVal, Value *Location,
6707 SmallVectorImpl<uint64_t> &Ops) {
6708 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6709 "contain any DW_OP_llvm_arg operands.");
6710 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6711 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6713}
6714
6715/// Overwrite DVI with locations placed into a DIArglist.
6716template <typename T>
6717static void updateDVIWithLocations(T &DbgVal,
6718 SmallVectorImpl<Value *> &Locations,
6719 SmallVectorImpl<uint64_t> &Ops) {
6720 assert(numLLVMArgOps(Ops) != 0 &&
6721 "Expected expression that references DIArglist locations using "
6722 "DW_OP_llvm_arg operands.");
6723 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6724 for (Value *V : Locations)
6725 MetadataLocs.push_back(ValueAsMetadata::get(V));
6726 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6727 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6728 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6729}
6730
6731/// Write the new expression and new location ops for the dbg.value. If
6732/// possible, reduce the size of the dbg.value intrinsic by omitting the
6733/// DIArglist. It can be omitted if:
6734/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6735/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6736static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6737 SmallVectorImpl<Value *> &NewLocationOps,
6738 SmallVectorImpl<uint64_t> &NewExpr) {
6739 auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6740 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6741 if (NumLLVMArgs == 0) {
6742 // Location assumed to be on the stack.
6743 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6744 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6745 // There is only a single DW_OP_llvm_arg at the start of the expression,
6746 // so it can be omitted along with DIArglist.
6747 assert(NewExpr[1] == 0 &&
6748 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6749 llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6750 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6751 } else {
6752 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6753 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6754 }
6755
6756 // If the DIExpression was previously empty then add the stack terminator.
6757 // Non-empty expressions have only had elements inserted into them and so
6758 // the terminator should already be present e.g. stack_value or fragment.
6759 DIExpression *SalvageExpr = DbgVal->getExpression();
6760 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6761 SalvageExpr =
6762 DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6763 DbgVal->setExpression(SalvageExpr);
6764 }
6765 };
6766 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6767 UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6768 else
6769 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6770}
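// Example (illustrative sketch, not part of this pass): when the rebuilt
// expression starts with a lone "DW_OP_LLVM_arg, 0" and only one location is
// referenced, that prefix and the DIArgList are both dropped and the single
// location is attached directly. A stand-alone sketch of the shortening step
// (the opcode names below are symbolic placeholders):
// \code
//   #include <cstdint>
//   #include <vector>
//   std::vector<uint64_t> dropLeadingArg(const std::vector<uint64_t> &Expr) {
//     // {LLVM_arg, 0, plus_uconst, 8} -> {plus_uconst, 8}
//     return std::vector<uint64_t>(Expr.begin() + 2, Expr.end());
//   }
// \endcode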
6771
6772/// Cached location ops may be erased during LSR, in which case a poison is
6773/// required when restoring from the cache. The type of that location is no
6774/// longer available, so just use int8. The poison will be replaced by one or
6775/// more locations later when a SCEVDbgValueBuilder selects alternative
6776/// locations to use for the salvage.
6777static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6778 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6779}
6780
6781/// Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
6782static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6783 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6784 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6785 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6786 assert(DVIRec.Expr && "Expected an expression");
6787 DbgVal->setExpression(DVIRec.Expr);
6788
6789 // Even a single location-op may be inside a DIArgList and referenced with
6790 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6791 if (!DVIRec.HadLocationArgList) {
6792 assert(DVIRec.LocationOps.size() == 1 &&
6793 "Unexpected number of location ops.");
6794 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6795 // this case was not present before, so force the location back to a
6796 // single uncontained Value.
6797 Value *CachedValue =
6798 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6799 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6800 } else {
6801 SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6802 for (WeakVH VH : DVIRec.LocationOps) {
6803 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6804 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6805 }
6806 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6807 DbgVal->setRawLocation(
6808 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6809 }
6810 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6811 };
6812 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6813 RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6814 else
6815 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6816}
6817
6818static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6819 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6820 const SCEV *SCEVInductionVar,
6821 SCEVDbgValueBuilder IterCountExpr) {
6822
6823 if (isa<DbgValueInst *>(DVIRec.DbgRef)
6824 ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6825 : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6826 return false;
6827
6828 // LSR may have caused several changes to the dbg.value in the failed salvage
6829 // attempt. So restore the DIExpression, the location ops and also the
6830 // location ops format, which is always DIArglist for multiple ops, but only
6831 // sometimes for a single op.
6832 restorePreTransformState(DVIRec);
6833
6834 // LocationOpIndexMap[i] will store the post-LSR location index of
6835 // the non-optimised out location at pre-LSR index i.
6836 SmallVector<int64_t, 2> LocationOpIndexMap;
6837 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6838 SmallVector<Value *, 2> NewLocationOps;
6839 NewLocationOps.push_back(LSRInductionVar);
6840
6841 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6842 WeakVH VH = DVIRec.LocationOps[i];
6843 // Place the locations not optimised out in the list first, avoiding
6844 // inserts later. The map is used to update the DIExpression's
6845 // DW_OP_LLVM_arg arguments as the expression is updated.
6846 if (VH && !isa<UndefValue>(VH)) {
6847 NewLocationOps.push_back(VH);
6848 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6849 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6850 << " now at index " << LocationOpIndexMap[i] << "\n");
6851 continue;
6852 }
6853
6854 // It's possible that a value referred to in the SCEV may have been
6855 // optimised out by LSR.
6856 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6857 SE.containsUndefs(DVIRec.SCEVs[i])) {
6858 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6859 << " refers to a location that is now undef or erased. "
6860 "Salvage abandoned.\n");
6861 return false;
6862 }
6863
6864 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6865 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6866
6867 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6868 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6869
6870 // Create an offset-based salvage expression if possible, as it requires
6871 // fewer DWARF ops than an iteration count-based expression.
6872 if (std::optional<APInt> Offset =
6873 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6874 if (Offset->getSignificantBits() <= 64)
6875 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6876 else
6877 return false;
6878 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6879 SE))
6880 return false;
6881 }
6882
6883 // Merge the DbgValueBuilder generated expressions and the original
6884 // DIExpression, place the result into a new vector.
6885 SmallVector<uint64_t, 64> NewExpr;
6886 if (DVIRec.Expr->getNumElements() == 0) {
6887 assert(DVIRec.RecoveryExprs.size() == 1 &&
6888 "Expected only a single recovery expression for an empty "
6889 "DIExpression.");
6890 assert(DVIRec.RecoveryExprs[0] &&
6891 "Expected a SCEVDbgSalvageBuilder for location 0");
6892 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6893 B->appendToVectors(NewExpr, NewLocationOps);
6894 }
6895 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6896 // Most Ops needn't be updated.
6897 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6898 Op.appendToVector(NewExpr);
6899 continue;
6900 }
6901
6902 uint64_t LocationArgIndex = Op.getArg(0);
6903 SCEVDbgValueBuilder *DbgBuilder =
6904 DVIRec.RecoveryExprs[LocationArgIndex].get();
6905 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6906 // optimise it away; just translate the argument to the updated
6907 // location index.
6908 if (!DbgBuilder) {
6909 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6910 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6911 "Expected a positive index for the location-op position.");
6912 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6913 continue;
6914 }
6915 // The location has a recovery expression.
6916 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6917 }
6918
6919 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6920 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6921 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6922 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6923 else
6924 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6925 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6926 return true;
6927}
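// Example (illustrative sketch, not part of this pass): the cheap, offset-based
// salvage path above applies when the dead location's SCEV and the surviving
// IV's SCEV differ by a constant. Two affine recurrences with the same stride
// differ by the difference of their starts at every iteration, so the dead
// value can be described as "IV + constant". With hypothetical numbers:
// \code
//   #include <cstdint>
//   #include <optional>
//   std::optional<int64_t> constantDifference(int64_t Start1, int64_t Stride1,
//                                             int64_t Start2, int64_t Stride2) {
//     if (Stride1 != Stride2)
//       return std::nullopt;   // offset would vary per iteration
//     return Start1 - Start2;  // e.g. {32,+,4} vs. {8,+,4} -> 24
//   }
// \endcode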
6928
6929/// Obtain an expression for the iteration count, then attempt to salvage the
6930/// dbg.value intrinsics.
6931static void DbgRewriteSalvageableDVIs(
6932 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6933 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6934 if (DVIToUpdate.empty())
6935 return;
6936
6937 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6938 assert(SCEVInductionVar &&
6939 "Anticipated a SCEV for the post-LSR induction variable");
6940
6941 if (const SCEVAddRecExpr *IVAddRec =
6942 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6943 if (!IVAddRec->isAffine())
6944 return;
6945
6946 // Prevent translation using excessive resources.
6947 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6948 return;
6949
6950 // The iteration count is required to recover location values.
6951 SCEVDbgValueBuilder IterCountExpr;
6952 IterCountExpr.pushLocation(LSRInductionVar);
6953 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6954 return;
6955
6956 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6957 << '\n');
6958
6959 for (auto &DVIRec : DVIToUpdate) {
6960 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6961 IterCountExpr);
6962 }
6963 }
6964}
6965
6966/// Identify and cache salvageable DVI locations and expressions along with the
6967/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6968/// caching and salvaging.
6969static void DbgGatherSalvagableDVI(
6970 Loop *L, ScalarEvolution &SE,
6971 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6972 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6973 for (const auto &B : L->getBlocks()) {
6974 for (auto &I : *B) {
6975 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6976 // Ensure that the dbg.value is not cached if any location op is
6977 // undef.
6978 if (DbgVal->isKillLocation())
6979 return false;
6980
6981 // Check that the location op SCEVs are suitable for translation to
6982 // DIExpression.
6983 const auto &HasTranslatableLocationOps =
6984 [&](const auto *DbgValToTranslate) -> bool {
6985 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6986 if (!LocOp)
6987 return false;
6988
6989 if (!SE.isSCEVable(LocOp->getType()))
6990 return false;
6991
6992 const SCEV *S = SE.getSCEV(LocOp);
6993 if (SE.containsUndefs(S))
6994 return false;
6995 }
6996 return true;
6997 };
6998
6999 if (!HasTranslatableLocationOps(DbgVal))
7000 return false;
7001
7002 std::unique_ptr<DVIRecoveryRec> NewRec =
7003 std::make_unique<DVIRecoveryRec>(DbgVal);
7004 // Each location Op may need a SCEVDbgValueBuilder in order to recover
7005 // it. Pre-allocating a vector will enable quick lookups of the builder
7006 // later during the salvage.
7007 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
7008 for (const auto LocOp : DbgVal->location_ops()) {
7009 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7010 NewRec->LocationOps.push_back(LocOp);
7011 NewRec->HadLocationArgList = DbgVal->hasArgList();
7012 }
7013 SalvageableDVISCEVs.push_back(std::move(NewRec));
7014 return true;
7015 };
7016 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
7017 if (DVR.isDbgValue() || DVR.isDbgAssign())
7018 ProcessDbgValue(&DVR);
7019 }
7020 auto DVI = dyn_cast<DbgValueInst>(&I);
7021 if (!DVI)
7022 continue;
7023 if (ProcessDbgValue(DVI))
7024 DVIHandles.insert(DVI);
7025 }
7026 }
7027}
7028
7029/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7030/// any PHI from the loop header is usable, but may have less chance of
7031/// surviving subsequent transforms.
7032static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
7033 const LSRInstance &LSR) {
7034
7035 auto IsSuitableIV = [&](PHINode *P) {
7036 if (!SE.isSCEVable(P->getType()))
7037 return false;
7038 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7039 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7040 return false;
7041 };
7042
7043 // For now, just pick the first IV that was generated and inserted by
7044 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7045 // by subsequent transforms.
7046 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7047 if (!IV)
7048 continue;
7049
7050 // There should only be PHI node IVs.
7051 PHINode *P = cast<PHINode>(&*IV);
7052
7053 if (IsSuitableIV(P))
7054 return P;
7055 }
7056
7057 for (PHINode &P : L.getHeader()->phis()) {
7058 if (IsSuitableIV(&P))
7059 return &P;
7060 }
7061 return nullptr;
7062}
7063
7064static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7065 DominatorTree &DT, LoopInfo &LI,
7066 const TargetTransformInfo &TTI,
7067 AssumptionCache &AC, TargetLibraryInfo &TLI,
7068 MemorySSA *MSSA) {
7069
7070 // Debug preservation - before we start removing anything, identify which DVIs
7071 // meet the salvageable criteria and store their DIExpression and SCEVs.
7072 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7073 SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
7074 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7075
7076 bool Changed = false;
7077 std::unique_ptr<MemorySSAUpdater> MSSAU;
7078 if (MSSA)
7079 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7080
7081 // Run the main LSR transformation.
7082 const LSRInstance &Reducer =
7083 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7084 Changed |= Reducer.getChanged();
7085
7086 // Remove any extra phis created by processing inner loops.
7087 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7088 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7089 SmallVector<WeakTrackingVH, 16> DeadInsts;
7090 const DataLayout &DL = L->getHeader()->getDataLayout();
7091 SCEVExpander Rewriter(SE, DL, "lsr", false);
7092#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7093 Rewriter.setDebugType(DEBUG_TYPE);
7094#endif
7095 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7096 Rewriter.clear();
7097 if (numFolded) {
7098 Changed = true;
7099 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7100 MSSAU.get());
7101 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7102 }
7103 }
7104 // LSR may at times remove all uses of an induction variable from a loop.
7105 // The only remaining use is the PHI in the exit block.
7106 // When this is the case, if the exit value of the IV can be calculated using
7107 // SCEV, we can replace the exit block PHI with the final value of the IV and
7108 // skip the updates in each loop iteration.
7109 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7110 SmallVector<WeakTrackingVH, 16> DeadInsts;
7111 const DataLayout &DL = L->getHeader()->getDataLayout();
7112 SCEVExpander Rewriter(SE, DL, "lsr", true);
7113 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7114 UnusedIndVarInLoop, DeadInsts);
7115 Rewriter.clear();
7116 if (Rewrites) {
7117 Changed = true;
7118 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7119 MSSAU.get());
7120 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7121 }
7122 }
7123
7124 if (SalvageableDVIRecords.empty())
7125 return Changed;
7126
7127 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7128 // expressions composed using the derived iteration count.
7129 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7130 for (const auto &L : LI) {
7131 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7132 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7133 else {
7134 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7135 "could not be identified.\n");
7136 }
7137 }
7138
7139 for (auto &Rec : SalvageableDVIRecords)
7140 Rec->clear();
7141 SalvageableDVIRecords.clear();
7142 DVIHandles.clear();
7143 return Changed;
7144}
7145
7146bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7147 if (skipLoop(L))
7148 return false;
7149
7150 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7151 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7152 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7153 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7154 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7155 *L->getHeader()->getParent());
7156 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7157 *L->getHeader()->getParent());
7158 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7159 *L->getHeader()->getParent());
7160 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7161 MemorySSA *MSSA = nullptr;
7162 if (MSSAAnalysis)
7163 MSSA = &MSSAAnalysis->getMSSA();
7164 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7165}
7166
7167PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7168 LoopStandardAnalysisResults &AR,
7169 LPMUpdater &) {
7170 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7171 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7172 return PreservedAnalyses::all();
7173
7174 auto PA = getLoopPassPreservedAnalyses();
7175 if (AR.MSSA)
7176 PA.preserve<MemorySSAAnalysis>();
7177 return PA;
7178}
7179
7180char LoopStrengthReduce::ID = 0;
7181
7182INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7183 "Loop Strength Reduction", false, false)
7189INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7190INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7191 "Loop Strength Reduction", false, false)
7192
7193Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1315
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:533
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs, SmallSet< AssertingVH< DbgValueInst >, 2 > &DVIHandles)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:261
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class recording the (high level) value of a variable.
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1520
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:329
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1618
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1511
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1710
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:270
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Value handle that asserts if the Value is deleted.
Definition: ValueHandle.h:264
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:501
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:704
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:677
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
BinaryOps getOpcode() const
Definition: InstrTypes.h:370
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Conditional or Unconditional Branch instruction.
bool isUnconditional() const
Value * getCondition() const
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1597
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:126
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:163
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
This is an important base class in LLVM.
Definition: Constant.h:42
static DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
This represents the llvm.dbg.value instruction.
Record of a variable value-assignment, aka a non instruction representation of the dbg....
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
Definition: Dominators.cpp:344
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2697
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:367
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:142
bool empty() const
Definition: IVUsers.h:147
void print(raw_ostream &OS) const
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:833
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:94
Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:472
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:76
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:176
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:593
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:985
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.

static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
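A hedged sketch of the PHINode calls listed above (function and value names are illustrative): build a two-operand PHI at the start of a loop header and query it.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include <cassert>
using namespace llvm;
static PHINode *createIVPhi(Type *Ty, BasicBlock *Header, BasicBlock *Preheader,
                            Value *Start, BasicBlock *Latch, Value *Next) {
  // Reserve two incoming edges: one from the preheader, one from the latch.
  PHINode *PN = PHINode::Create(Ty, /*NumReservedValues=*/2, "lsr.iv",
                                Header->begin());
  PN->addIncoming(Start, Preheader);
  PN->addIncoming(Next, Latch);
  assert(PN->getNumIncomingValues() == 2 &&
         PN->getIncomingBlock(0) == Preheader);
  return PN;
}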
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PointerIntPair - This class implements a pair of a pointer and small integer.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all analyses.
Definition: Analysis.h:117
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getVScale(Type *Ty)
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUnknown(Value *V)
std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimized out and is now a nullptr.
LLVMContext & getContext() const
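A sketch (assumed standalone helper) tying together several of the ScalarEvolution entry points above: classify a value as an affine, unit-stride recurrence in loop L.
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;
static bool isUnitStrideIV(ScalarEvolution &SE, Value *V, const Loop *L) {
  if (!SE.isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE.getSCEV(V);                  // full SCEV expression for V
  const auto *AR = dyn_cast<SCEVAddRecExpr>(S);
  if (!AR || AR->getLoop() != L || !AR->isAffine())
    return false;
  return AR->getStepRecurrence(SE)->isOne();      // step is the constant 1
}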
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
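Minimal sketch of the iteration interface on the bit vector above; the function name is illustrative only.
#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;
static unsigned sumSetIndices(const SmallBitVector &BV) {
  unsigned Sum = 0;
  // Equivalent to an explicit find_first()/find_next() loop.
  for (unsigned Idx : BV.set_bits())
    Sum += Idx;
  return Sum;
}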
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
void clear()
Definition: SmallSet.h:204
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
iterator erase(const_iterator CI)
Definition: SmallVector.h:737
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:578
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:805
typename SuperClass::iterator iterator
Definition: SmallVector.h:577
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition: TypeSize.h:44
An instruction for storing to memory.
Definition: Instructions.h:292
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
bool isProfitableLSRChainElement(Instruction *I) const
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAddScalableImmediate(int64_t Imm) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isNumRegsMajorCostOfLSR() const
Return true if the major cost of LSR is the number of registers.
@ MIM_PostInc
Post-incrementing.
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
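A hedged sketch of one of the TargetTransformInfo hooks above (the helper name is illustrative): ask whether base + Offset + Scale*Index is a legal addressing mode for an access of type AccessTy.
#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;
static bool canFoldScaledAddress(const TargetTransformInfo &TTI, Type *AccessTy,
                                 int64_t Offset, int64_t Scale,
                                 unsigned AddrSpace) {
  // No base GlobalValue, a base register, plus the given scale and offset.
  return TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr, Offset,
                                   /*HasBaseReg=*/true, Scale, AddrSpace);
}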
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static IntegerType * getInt8Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
This class represents a cast from unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:233
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
op_iterator op_end()
Definition: User.h:282
static ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:501
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
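Sketch of the Value use-list API above (the helper is hypothetical): redirect all uses of Old to New when the types agree.
#include "llvm/IR/Value.h"
using namespace llvm;
static bool tryReplaceAllUses(Value *Old, Value *New) {
  if (Old == New || Old->getType() != New->getType())
    return false;
  Old->replaceAllUsesWith(New);   // every user now refers to New
  return true;
}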
A Value handle that may be null.
Definition: ValueHandle.h:144
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Key
PAL metadata keys.
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Reg
All possible values of the reg field in the ModR/M byte.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:147
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:143
constexpr double e
Definition: MathExtras.h:47
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:235
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:480
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
char & LoopSimplifyID
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2082
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
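Hedged sketch of matchSimpleRecurrence above (the wrapper is hypothetical): recover the binary op, start value, and step of a phi of the form iv = phi [Start], [iv op Step], and test for an add recurrence.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
static bool isAddRecurrencePhi(const PHINode *PN) {
  BinaryOperator *BO = nullptr;
  Value *Start = nullptr, *Step = nullptr;
  return matchSimpleRecurrence(PN, BO, Start, Step) &&
         BO->getOpcode() == Instruction::Add;
}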
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
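Tiny sketch relating countr_zero and Log2_32 above: for a power-of-two scale, both give the shift amount (the function name is illustrative).
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
static unsigned shiftForScale(uint32_t Scale) {
  assert(Scale != 0 && (Scale & (Scale - 1)) == 0 && "expected a power of two");
  unsigned Shift = llvm::Log2_32(Scale);
  assert(Shift == (unsigned)llvm::countr_zero(Scale));
  return Shift;
}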
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
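A small sketch of the range wrappers above (all_of, find, and friends) over a SmallVector; purely illustrative.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;
static bool allStrictlyPositive(const SmallVectorImpl<int> &Vals) {
  return all_of(Vals, [](int V) { return V > 0; });
}
static bool containsValue(const SmallVectorImpl<int> &Vals, int X) {
  return find(Vals, X) != Vals.end();
}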
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
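Sketch of the normalization pair above (assumed helper): normalize an expression for post-increment use in Loops and, when that succeeds, map it back with denormalizeForPostIncUse.
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
using namespace llvm;
static const SCEV *normalizeThenDenormalize(const SCEV *S,
                                            const PostIncLoopSet &Loops,
                                            ScalarEvolution &SE) {
  const SCEV *N = normalizeForPostIncUse(S, Loops, SE);
  return N ? denormalizeForPostIncUse(N, Loops, SE) : nullptr;
}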
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
Pass * createLoopStrengthReducePass()
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:561
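Sketch of the permissive dead-code sweep above, in the way LSR-style rewrites typically use it: accumulate WeakTrackingVH handles while rewriting, then delete whatever became trivially dead (the wrapper name is illustrative).
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
static bool sweepDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  // Null handles and values that still have uses are silently skipped.
  return RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
}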
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:325
void initializeLoopStrengthReducePass(PassRegistry &)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1549
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
@ UnusedIndVarInLoop
Definition: LoopUtils.h:482
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Option class for critical edge splitting.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.