LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs strength reduction on array references inside loops that
14// use the loop induction variable as one of their components. It rewrites
15// expressions to take advantage of scaled-index addressing modes available on
16// the target, and it performs a variety of other optimizations related to
17// loop induction variables.
18//
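// As a schematic illustration (the C-level names below are invented for
// exposition, and 4-byte elements are assumed), strength reduction replaces
// a per-iteration multiply in the address computation with a simple pointer
// increment:
//
//   for (i = 0; i < n; ++i)             for (p = A; p != A + n; ++p)
//     sum += A[i];   /* A + i*4 */  =>    sum += *p;   /* p advances by 4 */
//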
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
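//
// Continuing the example above (names are illustrative), an LCSSA phi in the
// exit block that takes %i.next from the latch is one such out-of-loop
// post-increment user; it observes the value of the induction variable after
// the final increment:
//
// exit:
//   %i.lcssa = phi [ %i.next, %latch ]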
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/PassManager.h"
100#include "llvm/IR/Type.h"
101#include "llvm/IR/Use.h"
102#include "llvm/IR/User.h"
103#include "llvm/IR/Value.h"
104#include "llvm/IR/ValueHandle.h"
106#include "llvm/Pass.h"
107#include "llvm/Support/Casting.h"
110#include "llvm/Support/Debug.h"
120#include <algorithm>
121#include <cassert>
122#include <cstddef>
123#include <cstdint>
124#include <iterator>
125#include <limits>
126#include <map>
127#include <numeric>
128#include <optional>
129#include <utility>
130
131using namespace llvm;
132
133#define DEBUG_TYPE "loop-reduce"
134
135/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
136/// bail out. This threshold is far beyond the number of users that LSR can
137/// conceivably solve, so it should not affect generated code, but catches the
138/// worst cases before LSR burns too much compile time and stack space.
139static const unsigned MaxIVUsers = 200;
140
141/// Limit the size of expression that SCEV-based salvaging will attempt to
142/// translate into a DIExpression.
143/// Choose a maximum size such that debuginfo is not excessively increased and
144/// the salvaging is not too expensive for the compiler.
145static const unsigned MaxSCEVSalvageExpressionSize = 64;
146
147// Cleanup congruent phis after LSR phi expansion.
148static cl::opt<bool> EnablePhiElim(
149 "enable-lsr-phielim", cl::Hidden, cl::init(true),
150 cl::desc("Enable LSR phi elimination"));
151
152// The flag adds instruction count to solutions cost comparison.
153static cl::opt<bool> InsnsCost(
154 "lsr-insns-cost", cl::Hidden, cl::init(true),
155 cl::desc("Add instruction count to a LSR cost model"));
156
157// Flag to choose how to narrow a complex LSR solution.
158static cl::opt<bool> LSRExpNarrow(
159 "lsr-exp-narrow", cl::Hidden, cl::init(false),
160 cl::desc("Narrow LSR complex solution using"
161 " expectation of registers number"));
162
163// Flag to narrow search space by filtering non-optimal formulae with
164// the same ScaledReg and Scale.
165static cl::opt<bool> FilterSameScaledReg(
166 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168 " with the same ScaledReg and Scale"));
169
170static cl::opt<TTI::AddressingModeKind> PreferredAddressingMode(
171 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 cl::values(clEnumValN(TTI::AMK_None,
174 "none",
175 "Don't prefer any addressing mode"),
176 clEnumValN(TTI::AMK_PreIndexed,
177 "preindexed",
178 "Prefer pre-indexed addressing mode"),
179 clEnumValN(TTI::AMK_PostIndexed,
180 "postindexed",
181 "Prefer post-indexed addressing mode")));
182
183static cl::opt<unsigned> ComplexityLimit(
184 "lsr-complexity-limit", cl::Hidden,
185 cl::init(std::numeric_limits<uint16_t>::max()),
186 cl::desc("LSR search space complexity limit"));
187
188static cl::opt<unsigned> SetupCostDepthLimit(
189 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190 cl::desc("The limit on recursion depth for LSRs setup cost"));
191
192static cl::opt<cl::boolOrDefault> AllowDropSolutionIfLessProfitable(
193 "lsr-drop-solution", cl::Hidden,
194 cl::desc("Attempt to drop solution if it is less profitable"));
195
196static cl::opt<bool> EnableVScaleImmediates(
197 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
198 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
199
200static cl::opt<bool> DropScaledForVScale(
201 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
202 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
203
204#ifndef NDEBUG
205// Stress test IV chain generation.
206static cl::opt<bool> StressIVChain(
207 "stress-ivchain", cl::Hidden, cl::init(false),
208 cl::desc("Stress test LSR IV chains"));
209#else
210static bool StressIVChain = false;
211#endif
212
213namespace {
214
215struct MemAccessTy {
216 /// Used in situations where the accessed memory type is unknown.
217 static const unsigned UnknownAddressSpace =
218 std::numeric_limits<unsigned>::max();
219
220 Type *MemTy = nullptr;
221 unsigned AddrSpace = UnknownAddressSpace;
222
223 MemAccessTy() = default;
224 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
225
226 bool operator==(MemAccessTy Other) const {
227 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
228 }
229
230 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
231
232 static MemAccessTy getUnknown(LLVMContext &Ctx,
233 unsigned AS = UnknownAddressSpace) {
234 return MemAccessTy(Type::getVoidTy(Ctx), AS);
235 }
236
237 Type *getType() { return MemTy; }
238};
239
240/// This class holds data which is used to order reuse candidates.
241class RegSortData {
242public:
243 /// This represents the set of LSRUse indices which reference
244 /// a particular register.
245 SmallBitVector UsedByIndices;
246
247 void print(raw_ostream &OS) const;
248 void dump() const;
249};
250
251// An offset from an address that is either scalable or fixed. Used for
252// per-target optimizations of addressing modes.
253class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
254 constexpr Immediate(ScalarTy MinVal, bool Scalable)
255 : FixedOrScalableQuantity(MinVal, Scalable) {}
256
257 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
258 : FixedOrScalableQuantity(V) {}
259
260public:
261 constexpr Immediate() = delete;
262
263 static constexpr Immediate getFixed(ScalarTy MinVal) {
264 return {MinVal, false};
265 }
266 static constexpr Immediate getScalable(ScalarTy MinVal) {
267 return {MinVal, true};
268 }
269 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
270 return {MinVal, Scalable};
271 }
272 static constexpr Immediate getZero() { return {0, false}; }
273 static constexpr Immediate getFixedMin() {
274 return {std::numeric_limits<int64_t>::min(), false};
275 }
276 static constexpr Immediate getFixedMax() {
277 return {std::numeric_limits<int64_t>::max(), false};
278 }
279 static constexpr Immediate getScalableMin() {
280 return {std::numeric_limits<int64_t>::min(), true};
281 }
282 static constexpr Immediate getScalableMax() {
283 return {std::numeric_limits<int64_t>::max(), true};
284 }
285
286 constexpr bool isLessThanZero() const { return Quantity < 0; }
287
288 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
289
290 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
291 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
292 }
293
294 constexpr bool isMin() const {
295 return Quantity == std::numeric_limits<ScalarTy>::min();
296 }
297
298 constexpr bool isMax() const {
299 return Quantity == std::numeric_limits<ScalarTy>::max();
300 }
301
302 // Arithmetic 'operators' that cast to unsigned types first.
303 constexpr Immediate addUnsigned(const Immediate &RHS) const {
304 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
305 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
306 return {Value, Scalable || RHS.isScalable()};
307 }
308
309 constexpr Immediate subUnsigned(const Immediate &RHS) const {
310 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
311 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
312 return {Value, Scalable || RHS.isScalable()};
313 }
314
315 // Scale the quantity by a constant without caring about runtime scalability.
316 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
317 ScalarTy Value = (uint64_t)Quantity * RHS;
318 return {Value, Scalable};
319 }
320
321 // Helpers for generating SCEVs with vscale terms where needed.
322 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
323 const SCEV *S = SE.getConstant(Ty, Quantity);
324 if (Scalable)
325 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
326 return S;
327 }
328
329 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
330 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
331 if (Scalable)
332 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
333 return NegS;
334 }
335
336 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
337 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
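// A brief usage sketch (variable names are illustrative): fixed and scalable
// offsets are deliberately kept apart, and only compatible immediates may be
// combined:
//
//   Immediate Fixed = Immediate::getFixed(16);        // 16
//   Immediate Scaled = Immediate::getScalable(4);     // 4 * vscale
//   Fixed.isCompatibleImmediate(Scaled);              // false: neither is zero
//   Scaled.addUnsigned(Immediate::getScalable(8));    // 12 * vscale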
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
390 SmallVector<const SCEV *, 16> RegSequence;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
403 using iterator = SmallVectorImpl<const SCEV *>::iterator;
404 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair =
417 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
418 RegSortData &RSD = Pair.first->second;
419 if (Pair.second)
420 RegSequence.push_back(Reg);
421 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
422 RSD.UsedByIndices.set(LUIdx);
423}
424
425void
426RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
427 RegUsesTy::iterator It = RegUsesMap.find(Reg);
428 assert(It != RegUsesMap.end());
429 RegSortData &RSD = It->second;
430 assert(RSD.UsedByIndices.size() > LUIdx);
431 RSD.UsedByIndices.reset(LUIdx);
432}
433
434void
435RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
436 assert(LUIdx <= LastLUIdx);
437
438 // Update RegUses. The data structure is not optimized for this purpose;
439 // we must iterate through it and update each of the bit vectors.
440 for (auto &Pair : RegUsesMap) {
441 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
442 if (LUIdx < UsedByIndices.size())
443 UsedByIndices[LUIdx] =
444 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
445 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
446 }
447}
448
449bool
450RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
451 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
452 if (I == RegUsesMap.end())
453 return false;
454 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
455 int i = UsedByIndices.find_first();
456 if (i == -1) return false;
457 if ((size_t)i != LUIdx) return true;
458 return UsedByIndices.find_next(i) != -1;
459}
460
461const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
462 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
463 assert(I != RegUsesMap.end() && "Unknown register!");
464 return I->second.UsedByIndices;
465}
466
467void RegUseTracker::clear() {
468 RegUsesMap.clear();
469 RegSequence.clear();
470}
471
472namespace {
473
474/// This class holds information that describes a formula for computing a value
475/// that satisfies a use. It may include broken-out immediates and scaled registers.
476struct Formula {
477 /// Global base address used for complex addressing.
478 GlobalValue *BaseGV = nullptr;
479
480 /// Base offset for complex addressing.
481 Immediate BaseOffset = Immediate::getZero();
482
483 /// Whether any complex addressing has a base register.
484 bool HasBaseReg = false;
485
486 /// The scale of any complex addressing.
487 int64_t Scale = 0;
488
489 /// The list of "base" registers for this use. When this is non-empty, the
490 /// canonical representation of a formula requires that:
491 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
492 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
493 /// 3. The reg containing the recurrent expr related to the current loop in
494 /// the formula should be put in the ScaledReg.
495 /// #1 enforces that the scaled register is always used when at least two
496 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
497 /// #2 enforces that 1 * reg is reg.
498 /// #3 ensures invariant regs with respect to the current loop can be
499 /// combined together in LSR codegen.
500 /// This invariant can be temporarily broken while building a formula.
501 /// However, every formula inserted into the LSRInstance must be in canonical
502 /// form.
503 SmallVector<const SCEV *, 4> BaseRegs;
504
505 /// The 'scaled' register for this use. This should be non-null when Scale is
506 /// not zero.
507 const SCEV *ScaledReg = nullptr;
508
509 /// An additional constant offset which is added near the use. This requires a
510 /// temporary register, but the offset itself can live in an add immediate
511 /// field rather than a register.
512 Immediate UnfoldedOffset = Immediate::getZero();
513
514 Formula() = default;
515
516 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
517
518 bool isCanonical(const Loop &L) const;
519
520 void canonicalize(const Loop &L);
521
522 bool unscale();
523
524 bool hasZeroEnd() const;
525
526 size_t getNumRegs() const;
527 Type *getType() const;
528
529 void deleteBaseReg(const SCEV *&S);
530
531 bool referencesReg(const SCEV *S) const;
532 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
533 const RegUseTracker &RegUses) const;
534
535 void print(raw_ostream &OS) const;
536 void dump() const;
537};
538
539} // end anonymous namespace
540
541/// Recursion helper for initialMatch.
542static void DoInitialMatch(const SCEV *S, Loop *L,
543 SmallVectorImpl<const SCEV *> &Good,
544 SmallVectorImpl<const SCEV *> &Bad,
545 ScalarEvolution &SE) {
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
561 if (!AR->getStart()->isZero() && AR->isAffine()) {
562 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
563 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
564 AR->getStepRecurrence(SE),
565 // FIXME: AR->getNoWrapFlags()
566 AR->getLoop(), SCEV::FlagAnyWrap),
567 L, Good, Bad, SE);
568 return;
569 }
570
571 // Handle a multiplication by -1 (negation) if it didn't fold.
572 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
573 if (Mul->getOperand(0)->isAllOnesValue()) {
574 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
575 const SCEV *NewMul = SE.getMulExpr(Ops);
576
577 SmallVector<const SCEV *, 4> MyGood;
578 SmallVector<const SCEV *, 4> MyBad;
579 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
580 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
581 SE.getEffectiveSCEVType(NewMul->getType())));
582 for (const SCEV *S : MyGood)
583 Good.push_back(SE.getMulExpr(NegOne, S));
584 for (const SCEV *S : MyBad)
585 Bad.push_back(SE.getMulExpr(NegOne, S));
586 return;
587 }
588
589 // Ok, we can't do anything interesting. Just stuff the whole thing into a
590 // register and hope for the best.
591 Bad.push_back(S);
592}
593
594/// Incorporate loop-variant parts of S into this Formula, attempting to keep
595/// all loop-invariant and loop-computable values in a single base register.
596void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
597 SmallVector<const SCEV *, 4> Good;
598 SmallVector<const SCEV *, 4> Bad;
599 DoInitialMatch(S, L, Good, Bad, SE);
600 if (!Good.empty()) {
601 const SCEV *Sum = SE.getAddExpr(Good);
602 if (!Sum->isZero())
603 BaseRegs.push_back(Sum);
604 HasBaseReg = true;
605 }
606 if (!Bad.empty()) {
607 const SCEV *Sum = SE.getAddExpr(Bad);
608 if (!Sum->isZero())
609 BaseRegs.push_back(Sum);
610 HasBaseReg = true;
611 }
612 canonicalize(*L);
613}
614
615static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
616 return SCEVExprContains(S, [&L](const SCEV *S) {
617 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
618 });
619}
620
621/// Check whether or not this formula satisfies the canonical
622/// representation.
623/// \see Formula::BaseRegs.
624bool Formula::isCanonical(const Loop &L) const {
625 if (!ScaledReg)
626 return BaseRegs.size() <= 1;
627
628 if (Scale != 1)
629 return true;
630
631 if (Scale == 1 && BaseRegs.empty())
632 return false;
633
634 if (containsAddRecDependentOnLoop(ScaledReg, L))
635 return true;
636
637 // If ScaledReg is not a recurrent expr, or it is one but for a loop other
638 // than the current one, while BaseRegs contains a recurrent expr reg related
639 // to the current loop, we want to swap the reg in BaseRegs with ScaledReg.
640 return none_of(BaseRegs, [&L](const SCEV *S) {
641 return containsAddRecDependentOnLoop(S, L);
642 });
643}
644
645/// Helper method to morph a formula into its canonical representation.
646/// \see Formula::BaseRegs.
647/// Every formula having more than one base register, must use the ScaledReg
648/// field. Otherwise, we would have to do special cases everywhere in LSR
649/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
650/// On the other hand, 1*reg should be canonicalized into reg.
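/// For example (schematic): a formula built as reg1 + reg2 + reg3, where reg3
/// is the addrec of the current loop, canonicalizes to
///   BaseRegs = {reg1, reg2}, ScaledReg = reg3, Scale = 1,
/// while a formula of just 1*reg canonicalizes to
///   BaseRegs = {reg}, ScaledReg = nullptr, Scale = 0.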
651void Formula::canonicalize(const Loop &L) {
652 if (isCanonical(L))
653 return;
654
655 if (BaseRegs.empty()) {
656 // No base reg? Use scale reg with scale = 1 as such.
657 assert(ScaledReg && "Expected 1*reg => reg");
658 assert(Scale == 1 && "Expected 1*reg => reg");
659 BaseRegs.push_back(ScaledReg);
660 Scale = 0;
661 ScaledReg = nullptr;
662 return;
663 }
664
665 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
666 if (!ScaledReg) {
667 ScaledReg = BaseRegs.pop_back_val();
668 Scale = 1;
669 }
670
671 // If ScaledReg is an invariant with respect to L, find the reg from
672 // BaseRegs containing the recurrent expr related to Loop L. Swap the
673 // reg with ScaledReg.
674 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
675 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
676 return containsAddRecDependentOnLoop(S, L);
677 });
678 if (I != BaseRegs.end())
679 std::swap(ScaledReg, *I);
680 }
681 assert(isCanonical(L) && "Failed to canonicalize?");
682}
683
684/// Get rid of the scale in the formula.
685/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
686/// \return true if it was possible to get rid of the scale, false otherwise.
687/// \note After this operation the formula may not be in the canonical form.
688bool Formula::unscale() {
689 if (Scale != 1)
690 return false;
691 Scale = 0;
692 BaseRegs.push_back(ScaledReg);
693 ScaledReg = nullptr;
694 return true;
695}
696
697bool Formula::hasZeroEnd() const {
698 if (UnfoldedOffset || BaseOffset)
699 return false;
700 if (BaseRegs.size() != 1 || ScaledReg)
701 return false;
702 return true;
703}
704
705/// Return the total number of register operands used by this formula. This does
706/// not include register uses implied by non-constant addrec strides.
707size_t Formula::getNumRegs() const {
708 return !!ScaledReg + BaseRegs.size();
709}
710
711/// Return the type of this formula, if it has one, or null otherwise. This type
712/// is meaningless except for the bit size.
713Type *Formula::getType() const {
714 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
715 ScaledReg ? ScaledReg->getType() :
716 BaseGV ? BaseGV->getType() :
717 nullptr;
718}
719
720/// Delete the given base reg from the BaseRegs list.
721void Formula::deleteBaseReg(const SCEV *&S) {
722 if (&S != &BaseRegs.back())
723 std::swap(S, BaseRegs.back());
724 BaseRegs.pop_back();
725}
726
727/// Test if this formula references the given register.
728bool Formula::referencesReg(const SCEV *S) const {
729 return S == ScaledReg || is_contained(BaseRegs, S);
730}
731
732/// Test whether this formula uses registers which are used by uses other than
733/// the use with the given index.
734bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
735 const RegUseTracker &RegUses) const {
736 if (ScaledReg)
737 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
738 return true;
739 for (const SCEV *BaseReg : BaseRegs)
740 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
741 return true;
742 return false;
743}
744
745#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
746void Formula::print(raw_ostream &OS) const {
747 bool First = true;
748 if (BaseGV) {
749 if (!First) OS << " + "; else First = false;
750 BaseGV->printAsOperand(OS, /*PrintType=*/false);
751 }
752 if (BaseOffset.isNonZero()) {
753 if (!First) OS << " + "; else First = false;
754 OS << BaseOffset;
755 }
756 for (const SCEV *BaseReg : BaseRegs) {
757 if (!First) OS << " + "; else First = false;
758 OS << "reg(" << *BaseReg << ')';
759 }
760 if (HasBaseReg && BaseRegs.empty()) {
761 if (!First) OS << " + "; else First = false;
762 OS << "**error: HasBaseReg**";
763 } else if (!HasBaseReg && !BaseRegs.empty()) {
764 if (!First) OS << " + "; else First = false;
765 OS << "**error: !HasBaseReg**";
766 }
767 if (Scale != 0) {
768 if (!First) OS << " + "; else First = false;
769 OS << Scale << "*reg(";
770 if (ScaledReg)
771 OS << *ScaledReg;
772 else
773 OS << "<unknown>";
774 OS << ')';
775 }
776 if (UnfoldedOffset.isNonZero()) {
777 if (!First) OS << " + ";
778 OS << "imm(" << UnfoldedOffset << ')';
779 }
780}
781
782LLVM_DUMP_METHOD void Formula::dump() const {
783 print(errs()); errs() << '\n';
784}
785#endif
786
787/// Return true if the given addrec can be sign-extended without changing its
788/// value.
789static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
790 Type *WideTy =
791 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
792 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
793}
794
795/// Return true if the given add can be sign-extended without changing its
796/// value.
797static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
798 Type *WideTy =
799 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
800 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
801}
802
803/// Return true if the given mul can be sign-extended without changing its
804/// value.
805static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(),
808 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
809 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
810}
811
812/// Return an expression for LHS /s RHS, if it can be determined and if the
813/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
814/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
815/// the multiplication may overflow, which is useful when the result will be
816/// used in a context where the most significant bits are ignored.
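/// For example, 12 /s 4 yields the constant 3, while 13 /s 4 yields null since
/// the remainder is nonzero; when the addrec is known to survive sign
/// extension, {8,+,4}<%L> /s 4 yields {2,+,1}<%L>. With IgnoreSignificantBits
/// set, (X * Y) /s Y simplifies to X even though the multiply might overflow.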
817static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
818 ScalarEvolution &SE,
819 bool IgnoreSignificantBits = false) {
820 // Handle the trivial case, which works for any SCEV type.
821 if (LHS == RHS)
822 return SE.getConstant(LHS->getType(), 1);
823
824 // Handle a few RHS special cases.
825 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
826 if (RC) {
827 const APInt &RA = RC->getAPInt();
828 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
829 // some folding.
830 if (RA.isAllOnes()) {
831 if (LHS->getType()->isPointerTy())
832 return nullptr;
833 return SE.getMulExpr(LHS, RC);
834 }
835 // Handle x /s 1 as x.
836 if (RA == 1)
837 return LHS;
838 }
839
840 // Check for a division of a constant by a constant.
841 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
842 if (!RC)
843 return nullptr;
844 const APInt &LA = C->getAPInt();
845 const APInt &RA = RC->getAPInt();
846 if (LA.srem(RA) != 0)
847 return nullptr;
848 return SE.getConstant(LA.sdiv(RA));
849 }
850
851 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
852 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
853 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
854 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
855 IgnoreSignificantBits);
856 if (!Step) return nullptr;
857 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
858 IgnoreSignificantBits);
859 if (!Start) return nullptr;
860 // FlagNW is independent of the start value, step direction, and is
861 // preserved with smaller magnitude steps.
862 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
863 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
864 }
865 return nullptr;
866 }
867
868 // Distribute the sdiv over add operands, if the add doesn't overflow.
869 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
870 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
871 SmallVector<const SCEV *, 8> Ops;
872 for (const SCEV *S : Add->operands()) {
873 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
874 if (!Op) return nullptr;
875 Ops.push_back(Op);
876 }
877 return SE.getAddExpr(Ops);
878 }
879 return nullptr;
880 }
881
882 // Check for a multiply operand that we can pull RHS out of.
883 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
884 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
885 // Handle special case C1*X*Y /s C2*X*Y.
886 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
887 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
888 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
889 const SCEVConstant *RC =
890 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
891 if (LC && RC) {
892 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
893 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
894 if (LOps == ROps)
895 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
896 }
897 }
898 }
899
900 SmallVector<const SCEV *, 4> Ops;
901 bool Found = false;
902 for (const SCEV *S : Mul->operands()) {
903 if (!Found)
904 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
905 IgnoreSignificantBits)) {
906 S = Q;
907 Found = true;
908 }
909 Ops.push_back(S);
910 }
911 return Found ? SE.getMulExpr(Ops) : nullptr;
912 }
913 return nullptr;
914 }
915
916 // Otherwise we don't know.
917 return nullptr;
918}
919
920/// If S involves the addition of a constant integer value, return that integer
921/// value, and mutate S to point to a new SCEV with that value excluded.
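/// For example, given S = (8 + %x) this returns the fixed immediate 8 and
/// rewrites S to %x; given S = {4,+,1}<%L> it returns 4 and rewrites S to
/// {0,+,1}<%L>; given S = (4 * vscale) it returns the scalable immediate 4.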
922static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
923 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
924 if (C->getAPInt().getSignificantBits() <= 64) {
925 S = SE.getConstant(C->getType(), 0);
926 return Immediate::getFixed(C->getValue()->getSExtValue());
927 }
928 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
929 SmallVector<const SCEV *, 8> NewOps(Add->operands());
930 Immediate Result = ExtractImmediate(NewOps.front(), SE);
931 if (Result.isNonZero())
932 S = SE.getAddExpr(NewOps);
933 return Result;
934 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
935 SmallVector<const SCEV *, 8> NewOps(AR->operands());
936 Immediate Result = ExtractImmediate(NewOps.front(), SE);
937 if (Result.isNonZero())
938 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
939 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
940 SCEV::FlagAnyWrap);
941 return Result;
942 } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
943 if (EnableVScaleImmediates && M->getNumOperands() == 2) {
944 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
945 if (isa<SCEVVScale>(M->getOperand(1))) {
946 S = SE.getConstant(M->getType(), 0);
947 return Immediate::getScalable(C->getValue()->getSExtValue());
948 }
949 }
950 }
951 return Immediate::getZero();
952}
953
954/// If S involves the addition of a GlobalValue address, return that symbol, and
955/// mutate S to point to a new SCEV with that value excluded.
956static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
957 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
958 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
959 S = SE.getConstant(GV->getType(), 0);
960 return GV;
961 }
962 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
963 SmallVector<const SCEV *, 8> NewOps(Add->operands());
964 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
965 if (Result)
966 S = SE.getAddExpr(NewOps);
967 return Result;
968 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
969 SmallVector<const SCEV *, 8> NewOps(AR->operands());
970 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
971 if (Result)
972 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
973 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
974 SCEV::FlagAnyWrap);
975 return Result;
976 }
977 return nullptr;
978}
979
980/// Returns true if the specified instruction is using the specified value as an
981/// address.
982static bool isAddressUse(const TargetTransformInfo &TTI,
983 Instruction *Inst, Value *OperandVal) {
984 bool isAddress = isa<LoadInst>(Inst);
985 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
986 if (SI->getPointerOperand() == OperandVal)
987 isAddress = true;
988 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
989 // Addressing modes can also be folded into prefetches and a variety
990 // of intrinsics.
991 switch (II->getIntrinsicID()) {
992 case Intrinsic::memset:
993 case Intrinsic::prefetch:
994 case Intrinsic::masked_load:
995 if (II->getArgOperand(0) == OperandVal)
996 isAddress = true;
997 break;
998 case Intrinsic::masked_store:
999 if (II->getArgOperand(1) == OperandVal)
1000 isAddress = true;
1001 break;
1002 case Intrinsic::memmove:
1003 case Intrinsic::memcpy:
1004 if (II->getArgOperand(0) == OperandVal ||
1005 II->getArgOperand(1) == OperandVal)
1006 isAddress = true;
1007 break;
1008 default: {
1009 MemIntrinsicInfo IntrInfo;
1010 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1011 if (IntrInfo.PtrVal == OperandVal)
1012 isAddress = true;
1013 }
1014 }
1015 }
1016 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1017 if (RMW->getPointerOperand() == OperandVal)
1018 isAddress = true;
1019 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1020 if (CmpX->getPointerOperand() == OperandVal)
1021 isAddress = true;
1022 }
1023 return isAddress;
1024}
1025
1026/// Return the type of the memory being accessed.
1027static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1028 Instruction *Inst, Value *OperandVal) {
1029 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1030
1031 // First get the type of memory being accessed.
1032 if (Type *Ty = Inst->getAccessType())
1033 AccessTy.MemTy = Ty;
1034
1035 // Then get the pointer address space.
1036 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1037 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1038 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1039 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1040 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1041 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1042 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1043 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1044 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1045 switch (II->getIntrinsicID()) {
1046 case Intrinsic::prefetch:
1047 case Intrinsic::memset:
1048 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1049 AccessTy.MemTy = OperandVal->getType();
1050 break;
1051 case Intrinsic::memmove:
1052 case Intrinsic::memcpy:
1053 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1054 AccessTy.MemTy = OperandVal->getType();
1055 break;
1056 case Intrinsic::masked_load:
1057 AccessTy.AddrSpace =
1058 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1059 break;
1060 case Intrinsic::masked_store:
1061 AccessTy.AddrSpace =
1062 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1063 break;
1064 default: {
1065 MemIntrinsicInfo IntrInfo;
1066 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1067 AccessTy.AddrSpace
1068 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1069 }
1070
1071 break;
1072 }
1073 }
1074 }
1075
1076 return AccessTy;
1077}
1078
1079/// Return true if this AddRec is already a phi in its loop.
1080static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1081 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1082 if (SE.isSCEVable(PN.getType()) &&
1083 (SE.getEffectiveSCEVType(PN.getType()) ==
1084 SE.getEffectiveSCEVType(AR->getType())) &&
1085 SE.getSCEV(&PN) == AR)
1086 return true;
1087 }
1088 return false;
1089}
1090
1091/// Check if expanding this expression is likely to incur significant cost. This
1092/// is tricky because SCEV doesn't track which expressions are actually computed
1093/// by the current IR.
1094///
1095/// We currently allow expansion of IV increments that involve adds,
1096/// multiplication by constants, and AddRecs from existing phis.
1097///
1098/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1099/// obvious multiple of the UDivExpr.
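/// For example (schematic), multiplication by a constant such as (4 * %x) is
/// considered cheap, {%base,+,4}<%L> is cheap when that addrec is already a
/// phi in the loop, and (%a * %b) is cheap only when an existing multiply
/// already computes it; udiv and min/max expressions are treated as high cost.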
1100static bool isHighCostExpansion(const SCEV *S,
1101 SmallPtrSetImpl<const SCEV *> &Processed,
1102 ScalarEvolution &SE) {
1103 // Zero/One operand expressions
1104 switch (S->getSCEVType()) {
1105 case scUnknown:
1106 case scConstant:
1107 case scVScale:
1108 return false;
1109 case scTruncate:
1110 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1111 Processed, SE);
1112 case scZeroExtend:
1113 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1114 Processed, SE);
1115 case scSignExtend:
1116 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1117 Processed, SE);
1118 default:
1119 break;
1120 }
1121
1122 if (!Processed.insert(S).second)
1123 return false;
1124
1125 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1126 for (const SCEV *S : Add->operands()) {
1127 if (isHighCostExpansion(S, Processed, SE))
1128 return true;
1129 }
1130 return false;
1131 }
1132
1133 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1134 if (Mul->getNumOperands() == 2) {
1135 // Multiplication by a constant is ok
1136 if (isa<SCEVConstant>(Mul->getOperand(0)))
1137 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1138
1139 // If we have the value of one operand, check if an existing
1140 // multiplication already generates this expression.
1141 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1142 Value *UVal = U->getValue();
1143 for (User *UR : UVal->users()) {
1144 // If U is a constant, it may be used by a ConstantExpr.
1145 Instruction *UI = dyn_cast<Instruction>(UR);
1146 if (UI && UI->getOpcode() == Instruction::Mul &&
1147 SE.isSCEVable(UI->getType())) {
1148 return SE.getSCEV(UI) == Mul;
1149 }
1150 }
1151 }
1152 }
1153 }
1154
1155 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1156 if (isExistingPhi(AR, SE))
1157 return false;
1158 }
1159
1160 // For now, consider any other type of expression (div/mul/min/max) high cost.
1161 return true;
1162}
1163
1164namespace {
1165
1166class LSRUse;
1167
1168} // end anonymous namespace
1169
1170/// Check if the addressing mode defined by \p F is completely
1171/// folded in \p LU at isel time.
1172/// This includes address-mode folding and special icmp tricks.
1173/// This function returns true if \p LU can accommodate what \p F
1174/// defines and up to 1 base + 1 scaled + offset.
1175/// In other words, if \p F has several base registers, this function may
1176/// still return true. Therefore, users still need to account for
1177/// additional base registers and/or unfolded offsets to derive an
1178/// accurate cost model.
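/// For example, on a target with [base + index*scale + imm] addressing modes
/// (such as many x86 forms), a formula like reg1 + 4*reg2 + 16 for an Address
/// use can fold entirely into the memory operand, whereas a target with only
/// [base + imm] addressing would need extra adds for the scaled register.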
1179static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1180 const LSRUse &LU, const Formula &F);
1181
1182// Get the cost of the scaling factor used in F for LU.
1183static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1184 const LSRUse &LU, const Formula &F,
1185 const Loop &L);
1186
1187namespace {
1188
1189/// This class is used to measure and compare candidate formulae.
1190class Cost {
1191 const Loop *L = nullptr;
1192 ScalarEvolution *SE = nullptr;
1193 const TargetTransformInfo *TTI = nullptr;
1194 TargetTransformInfo::LSRCost C;
1195 TTI::AddressingModeKind AMK = TTI::AMK_None;
1196
1197public:
1198 Cost() = delete;
1199 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1200 TTI::AddressingModeKind AMK) :
1201 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1202 C.Insns = 0;
1203 C.NumRegs = 0;
1204 C.AddRecCost = 0;
1205 C.NumIVMuls = 0;
1206 C.NumBaseAdds = 0;
1207 C.ImmCost = 0;
1208 C.SetupCost = 0;
1209 C.ScaleCost = 0;
1210 }
1211
1212 bool isLess(const Cost &Other) const;
1213
1214 void Lose();
1215
1216#ifndef NDEBUG
1217 // Once any of the metrics loses, they must all remain losers.
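 // That is, either no metric has saturated (their bitwise OR is not all ones),
 // or they have all saturated together (their bitwise AND is all ones).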
1218 bool isValid() {
1219 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1220 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1221 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1222 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1223 }
1224#endif
1225
1226 bool isLoser() {
1227 assert(isValid() && "invalid cost");
1228 return C.NumRegs == ~0u;
1229 }
1230
1231 void RateFormula(const Formula &F,
1232 SmallPtrSetImpl<const SCEV *> &Regs,
1233 const DenseSet<const SCEV *> &VisitedRegs,
1234 const LSRUse &LU,
1235 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1236
1237 void print(raw_ostream &OS) const;
1238 void dump() const;
1239
1240private:
1241 void RateRegister(const Formula &F, const SCEV *Reg,
1242 SmallPtrSetImpl<const SCEV *> &Regs);
1243 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1244 SmallPtrSetImpl<const SCEV *> &Regs,
1245 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1246};
1247
1248/// An operand value in an instruction which is to be replaced with some
1249/// equivalent, possibly strength-reduced, replacement.
1250struct LSRFixup {
1251 /// The instruction which will be updated.
1252 Instruction *UserInst = nullptr;
1253
1254 /// The operand of the instruction which will be replaced. The operand may be
1255 /// used more than once; every instance will be replaced.
1256 Value *OperandValToReplace = nullptr;
1257
1258 /// If this user is to use the post-incremented value of an induction
1259 /// variable, this set is non-empty and holds the loops associated with the
1260 /// induction variable.
1261 PostIncLoopSet PostIncLoops;
1262
1263 /// A constant offset to be added to the LSRUse expression. This allows
1264 /// multiple fixups to share the same LSRUse with different offsets, for
1265 /// example in an unrolled loop.
1266 Immediate Offset = Immediate::getZero();
1267
1268 LSRFixup() = default;
1269
1270 bool isUseFullyOutsideLoop(const Loop *L) const;
1271
1272 void print(raw_ostream &OS) const;
1273 void dump() const;
1274};
1275
1276/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1277/// SmallVectors of const SCEV*.
1278struct UniquifierDenseMapInfo {
1279 static SmallVector<const SCEV *, 4> getEmptyKey() {
1280 SmallVector<const SCEV *, 4> V;
1281 V.push_back(reinterpret_cast<const SCEV *>(-1));
1282 return V;
1283 }
1284
1285 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1286 SmallVector<const SCEV *, 4> V;
1287 V.push_back(reinterpret_cast<const SCEV *>(-2));
1288 return V;
1289 }
1290
1291 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1292 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1293 }
1294
1295 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1296 const SmallVector<const SCEV *, 4> &RHS) {
1297 return LHS == RHS;
1298 }
1299};
1300
1301/// This class holds the state that LSR keeps for each use in IVUsers, as well
1302/// as uses invented by LSR itself. It includes information about what kinds of
1303/// things can be folded into the user, information about the user itself, and
1304/// information about how the use may be satisfied. TODO: Represent multiple
1305/// users of the same expression in common?
1306class LSRUse {
1307 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1308
1309public:
1310 /// An enum for a kind of use, indicating what types of scaled and immediate
1311 /// operands it might support.
1312 enum KindType {
1313 Basic, ///< A normal use, with no folding.
1314 Special, ///< A special case of basic, allowing -1 scales.
1315 Address, ///< An address use; folding according to TargetLowering
1316 ICmpZero ///< An equality icmp with both operands folded into one.
1317 // TODO: Add a generic icmp too?
1318 };
1319
1320 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1321
1322 KindType Kind;
1323 MemAccessTy AccessTy;
1324
1325 /// The list of operands which are to be replaced.
1326 SmallVector<LSRFixup, 8> Fixups;
1327
1328 /// Keep track of the min and max offsets of the fixups.
1329 Immediate MinOffset = Immediate::getFixedMax();
1330 Immediate MaxOffset = Immediate::getFixedMin();
1331
1332 /// This records whether all of the fixups using this LSRUse are outside of
1333 /// the loop, in which case some special-case heuristics may be used.
1334 bool AllFixupsOutsideLoop = true;
1335
1336 /// RigidFormula is set to true to guarantee that this use will be associated
1337 /// with a single formula--the one that initially matched. Some SCEV
1338 /// expressions cannot be expanded. This allows LSR to consider the registers
1339 /// used by those expressions without the need to expand them later after
1340 /// changing the formula.
1341 bool RigidFormula = false;
1342
1343 /// This records the widest use type for any fixup using this
1344 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1345 /// fixup widths to be equivalent, because the narrower one may be relying on
1346 /// the implicit truncation to truncate away bogus bits.
1347 Type *WidestFixupType = nullptr;
1348
1349 /// A list of ways to build a value that can satisfy this user. After the
1350 /// list is populated, one of these is selected heuristically and used to
1351 /// formulate a replacement for OperandValToReplace in UserInst.
1352 SmallVector<Formula, 12> Formulae;
1353
1354 /// The set of register candidates used by all formulae in this LSRUse.
1355 SmallPtrSet<const SCEV *, 4> Regs;
1356
1357 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1358
1359 LSRFixup &getNewFixup() {
1360 Fixups.push_back(LSRFixup());
1361 return Fixups.back();
1362 }
1363
1364 void pushFixup(LSRFixup &f) {
1365 Fixups.push_back(f);
1366 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1367 MaxOffset = f.Offset;
1368 if (Immediate::isKnownLT(f.Offset, MinOffset))
1369 MinOffset = f.Offset;
1370 }
1371
1372 bool HasFormulaWithSameRegs(const Formula &F) const;
1373 float getNotSelectedProbability(const SCEV *Reg) const;
1374 bool InsertFormula(const Formula &F, const Loop &L);
1375 void DeleteFormula(Formula &F);
1376 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1377
1378 void print(raw_ostream &OS) const;
1379 void dump() const;
1380};
1381
1382} // end anonymous namespace
1383
1384static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1385 LSRUse::KindType Kind, MemAccessTy AccessTy,
1386 GlobalValue *BaseGV, Immediate BaseOffset,
1387 bool HasBaseReg, int64_t Scale,
1388 Instruction *Fixup = nullptr);
1389
1390static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1391 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1392 return 1;
1393 if (Depth == 0)
1394 return 0;
1395 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1396 return getSetupCost(S->getStart(), Depth - 1);
1397 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1398 return getSetupCost(S->getOperand(), Depth - 1);
1399 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1400 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1401 [&](unsigned i, const SCEV *Reg) {
1402 return i + getSetupCost(Reg, Depth - 1);
1403 });
1404 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1405 return getSetupCost(S->getLHS(), Depth - 1) +
1406 getSetupCost(S->getRHS(), Depth - 1);
1407 return 0;
1408}
1409
1410/// Tally up interesting quantities from the given register.
1411void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1412 SmallPtrSetImpl<const SCEV *> &Regs) {
1413 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1414 // If this is an addrec for another loop, it should be an invariant
1415 // with respect to L since L is the innermost loop (at least
1416 // for now LSR only handles innermost loops).
1417 if (AR->getLoop() != L) {
1418 // If the AddRec already exists, consider its register free and leave it alone.
1419 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1420 return;
1421
1422 // It is bad to allow LSR for current loop to add induction variables
1423 // for its sibling loops.
1424 if (!AR->getLoop()->contains(L)) {
1425 Lose();
1426 return;
1427 }
1428
1429 // Otherwise, it will be an invariant with respect to Loop L.
1430 ++C.NumRegs;
1431 return;
1432 }
1433
1434 unsigned LoopCost = 1;
1435 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1436 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1437
1438 // If the step size matches the base offset, we could use pre-indexed
1439 // addressing.
1440 if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
1441 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1442 if (Step->getAPInt() == F.BaseOffset.getFixedValue())
1443 LoopCost = 0;
1444 } else if (AMK == TTI::AMK_PostIndexed) {
1445 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1446 if (isa<SCEVConstant>(LoopStep)) {
1447 const SCEV *LoopStart = AR->getStart();
1448 if (!isa<SCEVConstant>(LoopStart) &&
1449 SE->isLoopInvariant(LoopStart, L))
1450 LoopCost = 0;
1451 }
1452 }
1453 }
1454 C.AddRecCost += LoopCost;
1455
1456 // Add the step value register, if it needs one.
1457 // TODO: The non-affine case isn't precisely modeled here.
1458 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1459 if (!Regs.count(AR->getOperand(1))) {
1460 RateRegister(F, AR->getOperand(1), Regs);
1461 if (isLoser())
1462 return;
1463 }
1464 }
1465 }
1466 ++C.NumRegs;
1467
1468 // Rough heuristic; favor registers which don't require extra setup
1469 // instructions in the preheader.
1470 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1471 // Ensure we don't, even with the recursion limit, produce invalid costs.
1472 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1473
1474 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1475 SE->hasComputableLoopEvolution(Reg, L);
1476}
1477
1478/// Record this register in the set. If we haven't seen it before, rate
1479/// it. Optional LoserRegs provides a way to declare any formula that refers to
1480/// one of those regs an instant loser.
1481void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1482 SmallPtrSetImpl<const SCEV *> &Regs,
1483 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1484 if (LoserRegs && LoserRegs->count(Reg)) {
1485 Lose();
1486 return;
1487 }
1488 if (Regs.insert(Reg).second) {
1489 RateRegister(F, Reg, Regs);
1490 if (LoserRegs && isLoser())
1491 LoserRegs->insert(Reg);
1492 }
1493}
1494
1495void Cost::RateFormula(const Formula &F,
1496 SmallPtrSetImpl<const SCEV *> &Regs,
1497 const DenseSet<const SCEV *> &VisitedRegs,
1498 const LSRUse &LU,
1499 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1500 if (isLoser())
1501 return;
1502 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1503 // Tally up the registers.
1504 unsigned PrevAddRecCost = C.AddRecCost;
1505 unsigned PrevNumRegs = C.NumRegs;
1506 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1507 if (const SCEV *ScaledReg = F.ScaledReg) {
1508 if (VisitedRegs.count(ScaledReg)) {
1509 Lose();
1510 return;
1511 }
1512 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1513 if (isLoser())
1514 return;
1515 }
1516 for (const SCEV *BaseReg : F.BaseRegs) {
1517 if (VisitedRegs.count(BaseReg)) {
1518 Lose();
1519 return;
1520 }
1521 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1522 if (isLoser())
1523 return;
1524 }
1525
1526 // Determine how many (unfolded) adds we'll need inside the loop.
1527 size_t NumBaseParts = F.getNumRegs();
1528 if (NumBaseParts > 1)
1529 // Do not count the base and a possible second register if the target
1530 // allows folding 2 registers.
1531 C.NumBaseAdds +=
1532 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1533 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1534
1535 // Accumulate non-free scaling amounts.
1536 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1537
1538 // Tally up the non-zero immediates.
1539 for (const LSRFixup &Fixup : LU.Fixups) {
1540 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1541 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1542 if (F.BaseGV)
1543 C.ImmCost += 64; // Handle symbolic values conservatively.
1544 // TODO: This should probably be the pointer size.
1545 else if (Offset.isNonZero())
1546 C.ImmCost +=
1547 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1548
1549 // Check with target if this offset with this instruction is
1550 // specifically not supported.
1551 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1552 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1553 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1554 C.NumBaseAdds++;
1555 } else {
1556 // Incompatible immediate type; increase the cost to avoid using this formula.
1557 C.ImmCost += 2048;
1558 }
1559 }
1560
1561 // If we don't count instruction cost, exit here.
1562 if (!InsnsCost) {
1563 assert(isValid() && "invalid cost");
1564 return;
1565 }
1566
1567 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1568 // an additional instruction (at least a fill).
1569 // TODO: Need to distinguish register classes?
1570 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1571 TTI->getRegisterClassForType(false, F.getType())) - 1;
1572 if (C.NumRegs > TTIRegNum) {
1573 // The cost already exceeded TTIRegNum, so only newly added registers can
1574 // add new instructions.
1575 if (PrevNumRegs > TTIRegNum)
1576 C.Insns += (C.NumRegs - PrevNumRegs);
1577 else
1578 C.Insns += (C.NumRegs - TTIRegNum);
1579 }
1580
1581 // If an ICmpZero formula does not end with 0, it cannot be replaced by just
1582 // an add or sub. We'll need to compare the final result of the AddRec.
1583 // That means we'll need an additional instruction. But if the target can
1584 // macro-fuse a compare with a branch, don't count this extra instruction.
1585 // For -10 + {0, +, 1}:
1586 // i = i + 1;
1587 // cmp i, 10
1588 //
1589 // For {-10, +, 1}:
1590 // i = i + 1;
1591 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1592 !TTI->canMacroFuseCmp())
1593 C.Insns++;
1594 // Each new AddRec adds 1 instruction to calculation.
1595 C.Insns += (C.AddRecCost - PrevAddRecCost);
1596
1597 // BaseAdds adds instructions for unfolded registers.
1598 if (LU.Kind != LSRUse::ICmpZero)
1599 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1600 assert(isValid() && "invalid cost");
1601}
1602
1603/// Set this cost to a losing value.
1604void Cost::Lose() {
1605 C.Insns = std::numeric_limits<unsigned>::max();
1606 C.NumRegs = std::numeric_limits<unsigned>::max();
1607 C.AddRecCost = std::numeric_limits<unsigned>::max();
1608 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1609 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1610 C.ImmCost = std::numeric_limits<unsigned>::max();
1611 C.SetupCost = std::numeric_limits<unsigned>::max();
1612 C.ScaleCost = std::numeric_limits<unsigned>::max();
1613}
1614
1615/// Choose the lower cost.
1616bool Cost::isLess(const Cost &Other) const {
1617 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1618 C.Insns != Other.C.Insns)
1619 return C.Insns < Other.C.Insns;
1620 return TTI->isLSRCostLess(C, Other.C);
1621}
1622
1623#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1624void Cost::print(raw_ostream &OS) const {
1625 if (InsnsCost)
1626 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1627 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1628 if (C.AddRecCost != 0)
1629 OS << ", with addrec cost " << C.AddRecCost;
1630 if (C.NumIVMuls != 0)
1631 OS << ", plus " << C.NumIVMuls << " IV mul"
1632 << (C.NumIVMuls == 1 ? "" : "s");
1633 if (C.NumBaseAdds != 0)
1634 OS << ", plus " << C.NumBaseAdds << " base add"
1635 << (C.NumBaseAdds == 1 ? "" : "s");
1636 if (C.ScaleCost != 0)
1637 OS << ", plus " << C.ScaleCost << " scale cost";
1638 if (C.ImmCost != 0)
1639 OS << ", plus " << C.ImmCost << " imm cost";
1640 if (C.SetupCost != 0)
1641 OS << ", plus " << C.SetupCost << " setup cost";
1642}
1643
1644LLVM_DUMP_METHOD void Cost::dump() const {
1645 print(errs()); errs() << '\n';
1646}
1647#endif
1648
1649/// Test whether this fixup always uses its value outside of the given loop.
1650bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1651 // PHI nodes use their value in their incoming blocks.
1652 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1653 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1654 if (PN->getIncomingValue(i) == OperandValToReplace &&
1655 L->contains(PN->getIncomingBlock(i)))
1656 return false;
1657 return true;
1658 }
1659
1660 return !L->contains(UserInst);
1661}
1662
1663#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1664void LSRFixup::print(raw_ostream &OS) const {
1665 OS << "UserInst=";
1666 // Store is common and interesting enough to be worth special-casing.
1667 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1668 OS << "store ";
1669 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1670 } else if (UserInst->getType()->isVoidTy())
1671 OS << UserInst->getOpcodeName();
1672 else
1673 UserInst->printAsOperand(OS, /*PrintType=*/false);
1674
1675 OS << ", OperandValToReplace=";
1676 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1677
1678 for (const Loop *PIL : PostIncLoops) {
1679 OS << ", PostIncLoop=";
1680 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1681 }
1682
1683 if (Offset.isNonZero())
1684 OS << ", Offset=" << Offset;
1685}
1686
1687LLVM_DUMP_METHOD void LSRFixup::dump() const {
1688 print(errs()); errs() << '\n';
1689}
1690#endif
1691
1692/// Test whether this use has a formula with the same registers as the given
1693/// formula.
1694bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1695 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1696 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1697 // Unstable sort by host order ok, because this is only used for uniquifying.
1698 llvm::sort(Key);
1699 return Uniquifier.count(Key);
1700}
1701
1702/// Return the probability of selecting a formula that does not reference Reg.
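/// For example, with made-up counts: if this use has 4 formulae and 3 of them
/// reference Reg, the probability of selecting a formula that avoids Reg is
/// (4 - 3) / 4 = 0.25.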
1703float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1704 unsigned FNum = 0;
1705 for (const Formula &F : Formulae)
1706 if (F.referencesReg(Reg))
1707 FNum++;
1708 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1709}
1710
1711/// If the given formula has not yet been inserted, add it to the list, and
1712/// return true. Return false otherwise. The formula must be in canonical form.
1713bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1714 assert(F.isCanonical(L) && "Invalid canonical representation");
1715
1716 if (!Formulae.empty() && RigidFormula)
1717 return false;
1718
1719 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1720 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1721 // Unstable sort by host order ok, because this is only used for uniquifying.
1722 llvm::sort(Key);
1723
1724 if (!Uniquifier.insert(Key).second)
1725 return false;
1726
1727 // Using a register to hold the value of 0 is not profitable.
1728 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1729 "Zero allocated in a scaled register!");
1730#ifndef NDEBUG
1731 for (const SCEV *BaseReg : F.BaseRegs)
1732 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1733#endif
1734
1735 // Add the formula to the list.
1736 Formulae.push_back(F);
1737
1738 // Record registers now being used by this use.
1739 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1740 if (F.ScaledReg)
1741 Regs.insert(F.ScaledReg);
1742
1743 return true;
1744}
1745
1746/// Remove the given formula from this use's list.
1747void LSRUse::DeleteFormula(Formula &F) {
1748 if (&F != &Formulae.back())
1749 std::swap(F, Formulae.back());
1750 Formulae.pop_back();
1751}
1752
1753/// Recompute the Regs field, and update RegUses.
1754void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1755 // Now that we've filtered out some formulae, recompute the Regs set.
1756 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1757 Regs.clear();
1758 for (const Formula &F : Formulae) {
1759 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1760 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1761 }
1762
1763 // Update the RegTracker.
1764 for (const SCEV *S : OldRegs)
1765 if (!Regs.count(S))
1766 RegUses.dropRegister(S, LUIdx);
1767}
1768
1769#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1770void LSRUse::print(raw_ostream &OS) const {
1771 OS << "LSR Use: Kind=";
1772 switch (Kind) {
1773 case Basic: OS << "Basic"; break;
1774 case Special: OS << "Special"; break;
1775 case ICmpZero: OS << "ICmpZero"; break;
1776 case Address:
1777 OS << "Address of ";
1778 if (AccessTy.MemTy->isPointerTy())
1779 OS << "pointer"; // the full pointer type could be really verbose
1780 else {
1781 OS << *AccessTy.MemTy;
1782 }
1783
1784 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1785 }
1786
1787 OS << ", Offsets={";
1788 bool NeedComma = false;
1789 for (const LSRFixup &Fixup : Fixups) {
1790 if (NeedComma) OS << ',';
1791 OS << Fixup.Offset;
1792 NeedComma = true;
1793 }
1794 OS << '}';
1795
1796 if (AllFixupsOutsideLoop)
1797 OS << ", all-fixups-outside-loop";
1798
1799 if (WidestFixupType)
1800 OS << ", widest fixup type: " << *WidestFixupType;
1801}
1802
1803LLVM_DUMP_METHOD void LSRUse::dump() const {
1804 print(errs()); errs() << '\n';
1805}
1806#endif
1807
1808static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1809 LSRUse::KindType Kind, MemAccessTy AccessTy,
1810 GlobalValue *BaseGV, Immediate BaseOffset,
1811 bool HasBaseReg, int64_t Scale,
1812 Instruction *Fixup /* = nullptr */) {
1813 switch (Kind) {
1814 case LSRUse::Address: {
1815 int64_t FixedOffset =
1816 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1817 int64_t ScalableOffset =
1818 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1819 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1820 HasBaseReg, Scale, AccessTy.AddrSpace,
1821 Fixup, ScalableOffset);
1822 }
1823 case LSRUse::ICmpZero:
1824 // There's not even a target hook for querying whether it would be legal to
1825 // fold a GV into an ICmp.
1826 if (BaseGV)
1827 return false;
1828
1829 // ICmp only has two operands; don't allow more than two non-trivial parts.
1830 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1831 return false;
1832
1833 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1834 // putting the scaled register in the other operand of the icmp.
1835 if (Scale != 0 && Scale != -1)
1836 return false;
1837
1838 // If we have low-level target information, ask the target if it can fold an
1839 // integer immediate on an icmp.
1840 if (BaseOffset.isNonZero()) {
1841 // We don't have an interface to query whether the target supports
1842 // icmpzero against scalable quantities yet.
1843 if (BaseOffset.isScalable())
1844 return false;
1845
1846 // We have one of:
1847 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1848 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1849 // Offs is the ICmp immediate.
1850 if (Scale == 0)
1851 // The cast does the right thing with
1852 // std::numeric_limits<int64_t>::min().
1853 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1854 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1855 }
1856
1857 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1858 return true;
1859
1860 case LSRUse::Basic:
1861 // Only handle single-register values.
1862 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1863
1864 case LSRUse::Special:
1865 // Special case Basic to handle -1 scales.
1866 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1867 }
1868
1869 llvm_unreachable("Invalid LSRUse Kind!");
1870}
1871
1872static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1873 Immediate MinOffset, Immediate MaxOffset,
1874 LSRUse::KindType Kind, MemAccessTy AccessTy,
1875 GlobalValue *BaseGV, Immediate BaseOffset,
1876 bool HasBaseReg, int64_t Scale) {
1877 if (BaseOffset.isNonZero() &&
1878 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1879 BaseOffset.isScalable() != MaxOffset.isScalable()))
1880 return false;
1881 // Check for overflow.
1882 int64_t Base = BaseOffset.getKnownMinValue();
1883 int64_t Min = MinOffset.getKnownMinValue();
1884 int64_t Max = MaxOffset.getKnownMinValue();
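// A quick illustration of the wrap check below, with made-up values: if
// Base == INT64_MAX and Min == 1, (uint64_t)Base + Min wraps around to
// INT64_MIN, so the sum is not greater than Base even though Min > 0, and we
// conservatively return false.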
1885 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1886 return false;
1887 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1888 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1889 return false;
1890 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1891
1892 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1893 HasBaseReg, Scale) &&
1894 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1895 HasBaseReg, Scale);
1896}
1897
1898static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1899 Immediate MinOffset, Immediate MaxOffset,
1900 LSRUse::KindType Kind, MemAccessTy AccessTy,
1901 const Formula &F, const Loop &L) {
1902 // For the purpose of isAMCompletelyFolded either having a canonical formula
1903 // or a scale not equal to zero is correct.
1904 // Problems may arise from non-canonical formulae having a scale == 0.
1905 // Strictly speaking, it would be best to just rely on canonical formulae.
1906 // However, when we generate the scaled formulae, we first check that the
1907 // scaling factor is profitable before computing the actual ScaledReg, for
1908 // the sake of compile time.
1909 assert((F.isCanonical(L) || F.Scale != 0));
1910 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1911 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1912}
1913
1914/// Test whether we know how to expand the current formula.
1915static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1916 Immediate MaxOffset, LSRUse::KindType Kind,
1917 MemAccessTy AccessTy, GlobalValue *BaseGV,
1918 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1919 // We know how to expand completely foldable formulae.
1920 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1921 BaseOffset, HasBaseReg, Scale) ||
1922 // Or formulae that use a base register produced by a sum of base
1923 // registers.
1924 (Scale == 1 &&
1925 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1926 BaseGV, BaseOffset, true, 0));
1927}
1928
1929static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1930 Immediate MaxOffset, LSRUse::KindType Kind,
1931 MemAccessTy AccessTy, const Formula &F) {
1932 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1933 F.BaseOffset, F.HasBaseReg, F.Scale);
1934}
1935
1936static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1937 Immediate Offset) {
1938 if (Offset.isScalable())
1939 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1940
1941 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1942}
1943
1944static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1945 const LSRUse &LU, const Formula &F) {
1946 // Target may want to look at the user instructions.
1947 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1948 for (const LSRFixup &Fixup : LU.Fixups)
1949 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1950 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1951 F.Scale, Fixup.UserInst))
1952 return false;
1953 return true;
1954 }
1955
1956 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1957 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1958 F.Scale);
1959}
1960
1961static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1962 const LSRUse &LU, const Formula &F,
1963 const Loop &L) {
1964 if (!F.Scale)
1965 return 0;
1966
1967 // If the use is not completely folded in that instruction, we will have to
1968 // pay an extra cost only for scale != 1.
1969 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1970 LU.AccessTy, F, L))
1971 return F.Scale != 1;
1972
1973 switch (LU.Kind) {
1974 case LSRUse::Address: {
1975 // Check the scaling factor cost with both the min and max offsets.
1976 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1977 if (F.BaseOffset.isScalable()) {
1978 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1979 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1980 } else {
1981 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1982 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1983 }
1984 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1985 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1986 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1987 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1988 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1989 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1990
1991 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1992 "Legal addressing mode has an illegal cost!");
1993 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1994 }
1995 case LSRUse::ICmpZero:
1996 case LSRUse::Basic:
1997 case LSRUse::Special:
1998 // The use is completely folded, i.e., everything is folded into the
1999 // instruction.
2000 return 0;
2001 }
2002
2003 llvm_unreachable("Invalid LSRUse Kind!");
2004}
2005
2006static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2007 LSRUse::KindType Kind, MemAccessTy AccessTy,
2008 GlobalValue *BaseGV, Immediate BaseOffset,
2009 bool HasBaseReg) {
2010 // Fast-path: zero is always foldable.
2011 if (BaseOffset.isZero() && !BaseGV)
2012 return true;
2013
2014 // Conservatively, create an address with an immediate and a
2015 // base and a scale.
2016 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2017
2018 // Canonicalize a scale of 1 to a base register if the formula doesn't
2019 // already have a base register.
2020 if (!HasBaseReg && Scale == 1) {
2021 Scale = 0;
2022 HasBaseReg = true;
2023 }
2024
2025 // FIXME: Try with + without a scale? Maybe based on TTI?
2026 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2027 // default for many architectures, not just AArch64 SVE. More investigation
2028 // needed later to determine if this should be used more widely than just
2029 // on scalable types.
2030 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2031 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2032 Scale = 0;
2033
2034 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2035 HasBaseReg, Scale);
2036}
2037
2038static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2039 ScalarEvolution &SE, Immediate MinOffset,
2040 Immediate MaxOffset, LSRUse::KindType Kind,
2041 MemAccessTy AccessTy, const SCEV *S,
2042 bool HasBaseReg) {
2043 // Fast-path: zero is always foldable.
2044 if (S->isZero()) return true;
2045
2046 // Conservatively, create an address with an immediate and a
2047 // base and a scale.
2048 Immediate BaseOffset = ExtractImmediate(S, SE);
2049 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2050
2051 // If there's anything else involved, it's not foldable.
2052 if (!S->isZero()) return false;
2053
2054 // Fast-path: zero is always foldable.
2055 if (BaseOffset.isZero() && !BaseGV)
2056 return true;
2057
2058 if (BaseOffset.isScalable())
2059 return false;
2060
2061 // Conservatively, create an address with an immediate and a
2062 // base and a scale.
2063 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2064
2065 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2066 BaseOffset, HasBaseReg, Scale);
2067}
2068
2069namespace {
2070
2071/// An individual increment in a Chain of IV increments. Relate an IV user to
2072/// an expression that computes the IV it uses from the IV used by the previous
2073/// link in the Chain.
2074///
2075/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2076/// original IVOperand. The head of the chain's IVOperand is only valid during
2077/// chain collection, before LSR replaces IV users. During chain generation,
2078/// IncExpr can be used to find the new IVOperand that computes the same
2079/// expression.
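/// A hypothetical example: if a chain links users whose IV operands compute
/// %iv, %iv + 4 and %iv + 8, the head's IncExpr is the full SCEV of %iv
/// itself, while each later link's IncExpr is just the step from the previous
/// link (here, the constant 4).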
2080struct IVInc {
2081 Instruction *UserInst;
2082 Value* IVOperand;
2083 const SCEV *IncExpr;
2084
2085 IVInc(Instruction *U, Value *O, const SCEV *E)
2086 : UserInst(U), IVOperand(O), IncExpr(E) {}
2087};
2088
2089// The list of IV increments in program order. We typically add the head of a
2090// chain without finding subsequent links.
2091struct IVChain {
2092 SmallVector<IVInc, 1> Incs;
2093 const SCEV *ExprBase = nullptr;
2094
2095 IVChain() = default;
2096 IVChain(const IVInc &Head, const SCEV *Base)
2097 : Incs(1, Head), ExprBase(Base) {}
2098
2098
2099 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2100
2101 // Return the first increment in the chain.
2102 const_iterator begin() const {
2103 assert(!Incs.empty());
2104 return std::next(Incs.begin());
2105 }
2106 const_iterator end() const {
2107 return Incs.end();
2108 }
2109
2110 // Returns true if this chain contains any increments.
2111 bool hasIncs() const { return Incs.size() >= 2; }
2112
2113 // Add an IVInc to the end of this chain.
2114 void add(const IVInc &X) { Incs.push_back(X); }
2115
2116 // Returns the last UserInst in the chain.
2117 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2118
2119 // Returns true if IncExpr can be profitably added to this chain.
2120 bool isProfitableIncrement(const SCEV *OperExpr,
2121 const SCEV *IncExpr,
2122 ScalarEvolution &SE);
2123};
2124
2125/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2126/// between FarUsers that definitely cross IV increments and NearUsers that may
2127/// be used between IV increments.
2128struct ChainUsers {
2129 SmallPtrSet<Instruction*, 4> FarUsers;
2130 SmallPtrSet<Instruction*, 4> NearUsers;
2131};
2132
2133/// This class holds state for the main loop strength reduction logic.
2134class LSRInstance {
2135 IVUsers &IU;
2136 ScalarEvolution &SE;
2137 DominatorTree &DT;
2138 LoopInfo &LI;
2139 AssumptionCache &AC;
2140 TargetLibraryInfo &TLI;
2141 const TargetTransformInfo &TTI;
2142 Loop *const L;
2143 MemorySSAUpdater *MSSAU;
2145 mutable SCEVExpander Rewriter;
2146 bool Changed = false;
2147
2148 /// This is the insert position that the current loop's induction variable
2149 /// increment should be placed. In simple loops, this is the latch block's
2150 /// terminator. But in more complicated cases, this is a position which will
2151 /// dominate all the in-loop post-increment users.
2152 Instruction *IVIncInsertPos = nullptr;
2153
2154 /// Interesting factors between use strides.
2155 ///
2156 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2157 /// default, a SmallDenseSet, because we need to use the full range of
2158 /// int64_ts, and there's currently no good way of doing that with
2159 /// SmallDenseSet.
2160 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2161
2162 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2163 /// the solution is not profitable.
2164 Cost BaselineCost;
2165
2166 /// Interesting use types, to facilitate truncation reuse.
2167 SmallSetVector<Type *, 4> Types;
2168
2169 /// The list of interesting uses.
2170 SmallVector<LSRUse, 16> Uses;
2171
2172 /// Track which uses use which register candidates.
2173 RegUseTracker RegUses;
2174
2175 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2176 // have more than a few IV increment chains in a loop. Missing a Chain falls
2177 // back to normal LSR behavior for those uses.
2178 static const unsigned MaxChains = 8;
2179
2180 /// IV users can form a chain of IV increments.
2181 SmallVector<IVChain, 8> IVChainVec;
2182
2183 /// IV users that belong to profitable IVChains.
2184 SmallPtrSet<Use*, MaxChains> IVIncSet;
2185
2186 /// Induction variables that were generated and inserted by the SCEV Expander.
2187 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2188
2189 void OptimizeShadowIV();
2190 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2191 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2192 void OptimizeLoopTermCond();
2193
2194 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2195 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2196 void FinalizeChain(IVChain &Chain);
2197 void CollectChains();
2198 void GenerateIVChain(const IVChain &Chain,
2199 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2200
2201 void CollectInterestingTypesAndFactors();
2202 void CollectFixupsAndInitialFormulae();
2203
2204 // Support for sharing of LSRUses between LSRFixups.
2205 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2206 UseMapTy UseMap;
2207
2208 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2209 LSRUse::KindType Kind, MemAccessTy AccessTy);
2210
2211 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2212 MemAccessTy AccessTy);
2213
2214 void DeleteUse(LSRUse &LU, size_t LUIdx);
2215
2216 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2217
2218 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2219 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2220 void CountRegisters(const Formula &F, size_t LUIdx);
2221 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2222
2223 void CollectLoopInvariantFixupsAndFormulae();
2224
2225 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2226 unsigned Depth = 0);
2227
2228 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2229 const Formula &Base, unsigned Depth,
2230 size_t Idx, bool IsScaledReg = false);
2231 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2232 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2233 const Formula &Base, size_t Idx,
2234 bool IsScaledReg = false);
2235 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2236 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2237 const Formula &Base,
2238 const SmallVectorImpl<Immediate> &Worklist,
2239 size_t Idx, bool IsScaledReg = false);
2240 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2241 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2242 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2243 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2244 void GenerateCrossUseConstantOffsets();
2245 void GenerateAllReuseFormulae();
2246
2247 void FilterOutUndesirableDedicatedRegisters();
2248
2249 size_t EstimateSearchSpaceComplexity() const;
2250 void NarrowSearchSpaceByDetectingSupersets();
2251 void NarrowSearchSpaceByCollapsingUnrolledCode();
2252 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2253 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2254 void NarrowSearchSpaceByFilterPostInc();
2255 void NarrowSearchSpaceByDeletingCostlyFormulas();
2256 void NarrowSearchSpaceByPickingWinnerRegs();
2257 void NarrowSearchSpaceUsingHeuristics();
2258
2259 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2260 Cost &SolutionCost,
2261 SmallVectorImpl<const Formula *> &Workspace,
2262 const Cost &CurCost,
2263 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2264 DenseSet<const SCEV *> &VisitedRegs) const;
2265 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2266
2267 BasicBlock::iterator
2268 HoistInsertPosition(BasicBlock::iterator IP,
2269 const SmallVectorImpl<Instruction *> &Inputs) const;
2270 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2271 const LSRFixup &LF,
2272 const LSRUse &LU) const;
2273
2274 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2275 BasicBlock::iterator IP,
2276 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2277 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2278 const Formula &F,
2279 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2280 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2281 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2282 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2283
2284public:
2285 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2286 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2287 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2288
2289 bool getChanged() const { return Changed; }
2290 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2291 return ScalarEvolutionIVs;
2292 }
2293
2294 void print_factors_and_types(raw_ostream &OS) const;
2295 void print_fixups(raw_ostream &OS) const;
2296 void print_uses(raw_ostream &OS) const;
2297 void print(raw_ostream &OS) const;
2298 void dump() const;
2299};
2300
2301} // end anonymous namespace
2302
2303/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2304/// the cast operation.
2305void LSRInstance::OptimizeShadowIV() {
2306 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2307 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2308 return;
2309
2310 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2311 UI != E; /* empty */) {
2312 IVUsers::const_iterator CandidateUI = UI;
2313 ++UI;
2314 Instruction *ShadowUse = CandidateUI->getUser();
2315 Type *DestTy = nullptr;
2316 bool IsSigned = false;
2317
2318 /* If the shadow use is an int->float cast then insert a second IV
2319 to eliminate this cast.
2320
2321 for (unsigned i = 0; i < n; ++i)
2322 foo((double)i);
2323
2324 is transformed into
2325
2326 double d = 0.0;
2327 for (unsigned i = 0; i < n; ++i, ++d)
2328 foo(d);
2329 */
2330 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2331 IsSigned = false;
2332 DestTy = UCast->getDestTy();
2333 }
2334 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2335 IsSigned = true;
2336 DestTy = SCast->getDestTy();
2337 }
2338 if (!DestTy) continue;
2339
2340 // If target does not support DestTy natively then do not apply
2341 // this transformation.
2342 if (!TTI.isTypeLegal(DestTy)) continue;
2343
2344 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2345 if (!PH) continue;
2346 if (PH->getNumIncomingValues() != 2) continue;
2347
2348 // If the calculation in integers overflows, the result in FP type will
2349 // differ. So we can only do this transformation if we are guaranteed not
2350 // to deal with overflowing values.
2351 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2352 if (!AR) continue;
2353 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2354 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2355
2356 Type *SrcTy = PH->getType();
2357 int Mantissa = DestTy->getFPMantissaWidth();
2358 if (Mantissa == -1) continue;
2359 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2360 continue;
2361
2362 unsigned Entry, Latch;
2363 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2364 Entry = 0;
2365 Latch = 1;
2366 } else {
2367 Entry = 1;
2368 Latch = 0;
2369 }
2370
2371 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2372 if (!Init) continue;
2373 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2374 (double)Init->getSExtValue() :
2375 (double)Init->getZExtValue());
2376
2377 BinaryOperator *Incr =
2378 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2379 if (!Incr) continue;
2380 if (Incr->getOpcode() != Instruction::Add
2381 && Incr->getOpcode() != Instruction::Sub)
2382 continue;
2383
2384 /* Initialize new IV, double d = 0.0 in above example. */
2385 ConstantInt *C = nullptr;
2386 if (Incr->getOperand(0) == PH)
2387 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2388 else if (Incr->getOperand(1) == PH)
2389 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2390 else
2391 continue;
2392
2393 if (!C) continue;
2394
2395 // Ignore negative constants, as the code below doesn't handle them
2396 // correctly. TODO: Remove this restriction.
2397 if (!C->getValue().isStrictlyPositive())
2398 continue;
2399
2400 /* Add new PHINode. */
2401 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2402 NewPH->setDebugLoc(PH->getDebugLoc());
2403
2404 /* create new increment. '++d' in above example. */
2405 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2406 BinaryOperator *NewIncr = BinaryOperator::Create(
2407 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2408 : Instruction::FSub,
2409 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2410 NewIncr->setDebugLoc(Incr->getDebugLoc());
2411
2412 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2413 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2414
2415 /* Remove cast operation */
2416 ShadowUse->replaceAllUsesWith(NewPH);
2417 ShadowUse->eraseFromParent();
2418 Changed = true;
2419 break;
2420 }
2421}
2422
2423/// If Cond has an operand that is an expression of an IV, set the IV user and
2424/// stride information and return true, otherwise return false.
2425bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2426 for (IVStrideUse &U : IU)
2427 if (U.getUser() == Cond) {
2428 // NOTE: we could handle setcc instructions with multiple uses here, but
2429 // InstCombine does it as well for simple uses, it's not clear that it
2430 // occurs enough in real life to handle.
2431 CondUse = &U;
2432 return true;
2433 }
2434 return false;
2435}
2436
2437/// Rewrite the loop's terminating condition if it uses a max computation.
2438///
2439/// This is a narrow solution to a specific, but acute, problem. For loops
2440/// like this:
2441///
2442/// i = 0;
2443/// do {
2444/// p[i] = 0.0;
2445/// } while (++i < n);
2446///
2447/// the trip count isn't just 'n', because 'n' might not be positive. And
2448/// unfortunately this can come up even for loops where the user didn't use
2449/// a C do-while loop. For example, seemingly well-behaved top-test loops
2450/// will commonly be lowered like this:
2451///
2452/// if (n > 0) {
2453/// i = 0;
2454/// do {
2455/// p[i] = 0.0;
2456/// } while (++i < n);
2457/// }
2458///
2459/// and then it's possible for subsequent optimization to obscure the if
2460/// test in such a way that indvars can't find it.
2461///
2462/// When indvars can't find the if test in loops like this, it creates a
2463/// max expression, which allows it to give the loop a canonical
2464/// induction variable:
2465///
2466/// i = 0;
2467/// max = n < 1 ? 1 : n;
2468/// do {
2469/// p[i] = 0.0;
2470/// } while (++i != max);
2471///
2472/// Canonical induction variables are necessary because the loop passes
2473/// are designed around them. The most obvious example of this is the
2474/// LoopInfo analysis, which doesn't remember trip count values. It
2475/// expects to be able to rediscover the trip count each time it is
2476/// needed, and it does this using a simple analysis that only succeeds if
2477/// the loop has a canonical induction variable.
2478///
2479/// However, when it comes time to generate code, the maximum operation
2480/// can be quite costly, especially if it's inside of an outer loop.
2481///
2482/// This function solves this problem by detecting this type of loop and
2483/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2484/// the instructions for the maximum computation.
2485ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2486 // Check that the loop matches the pattern we're looking for.
2487 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2488 Cond->getPredicate() != CmpInst::ICMP_NE)
2489 return Cond;
2490
2491 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2492 if (!Sel || !Sel->hasOneUse()) return Cond;
2493
2494 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2495 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2496 return Cond;
2497 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2498
2499 // Add one to the backedge-taken count to get the trip count.
2500 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2501 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2502
2503 // Check for a max calculation that matches the pattern. There's no check
2504 // for ICMP_ULE here because the comparison would be with zero, which
2505 // isn't interesting.
2506 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2507 const SCEVNAryExpr *Max = nullptr;
2508 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2509 Pred = ICmpInst::ICMP_SLE;
2510 Max = S;
2511 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2512 Pred = ICmpInst::ICMP_SLT;
2513 Max = S;
2514 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2515 Pred = ICmpInst::ICMP_ULT;
2516 Max = U;
2517 } else {
2518 // No match; bail.
2519 return Cond;
2520 }
2521
2522 // To handle a max with more than two operands, this optimization would
2523 // require additional checking and setup.
2524 if (Max->getNumOperands() != 2)
2525 return Cond;
2526
2527 const SCEV *MaxLHS = Max->getOperand(0);
2528 const SCEV *MaxRHS = Max->getOperand(1);
2529
2530 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2531 // for a comparison with 1. For <= and >=, a comparison with zero.
2532 if (!MaxLHS ||
2533 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2534 return Cond;
2535
2536 // Check the relevant induction variable for conformance to
2537 // the pattern.
2538 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2539 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2540 if (!AR || !AR->isAffine() ||
2541 AR->getStart() != One ||
2542 AR->getStepRecurrence(SE) != One)
2543 return Cond;
2544
2545 assert(AR->getLoop() == L &&
2546 "Loop condition operand is an addrec in a different loop!");
2547
2548 // Check the right operand of the select, and remember it, as it will
2549 // be used in the new comparison instruction.
2550 Value *NewRHS = nullptr;
2551 if (ICmpInst::isTrueWhenEqual(Pred)) {
2552 // Look for n+1, and grab n.
2553 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2554 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2555 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2556 NewRHS = BO->getOperand(0);
2557 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2558 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2559 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2560 NewRHS = BO->getOperand(0);
2561 if (!NewRHS)
2562 return Cond;
2563 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2564 NewRHS = Sel->getOperand(1);
2565 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2566 NewRHS = Sel->getOperand(2);
2567 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2568 NewRHS = SU->getValue();
2569 else
2570 // Max doesn't match expected pattern.
2571 return Cond;
2572
2573 // Determine the new comparison opcode. It may be signed or unsigned,
2574 // and the original comparison may be either equality or inequality.
2575 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2576 Pred = CmpInst::getInversePredicate(Pred);
2577
2578 // Ok, everything looks ok to change the condition into an SLT or SGE and
2579 // delete the max calculation.
2580 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2581 Cond->getOperand(0), NewRHS, "scmp");
2582
2583 // Delete the max calculation instructions.
2584 NewCond->setDebugLoc(Cond->getDebugLoc());
2585 Cond->replaceAllUsesWith(NewCond);
2586 CondUse->setUser(NewCond);
2587 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2588 Cond->eraseFromParent();
2589 Sel->eraseFromParent();
2590 if (Cmp->use_empty())
2591 Cmp->eraseFromParent();
2592 return NewCond;
2593}
2594
2595/// Change loop terminating condition to use the postinc iv when possible.
2596void
2597LSRInstance::OptimizeLoopTermCond() {
2598 SmallPtrSet<Instruction *, 4> PostIncs;
2599
2600 // We need a different set of heuristics for rotated and non-rotated loops.
2601 // If a loop is rotated then the latch is also an exiting block, so inserting
2602 // post-inc expressions just before the latch is ideal. To reduce live ranges
2603 // it also makes sense to rewrite terminating conditions to use post-inc
2604 // expressions.
2605 //
2606 // If the loop is not rotated then the latch is not an exiting block; the exit
2607 // check is done in the loop head. Adding post-inc expressions before the
2608 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2609 // in the loop body. In this case we do *not* want to use post-inc expressions
2610 // in the latch check, and we want to insert post-inc expressions before
2611 // the backedge.
2612 BasicBlock *LatchBlock = L->getLoopLatch();
2613 SmallVector<BasicBlock*, 8> ExitingBlocks;
2614 L->getExitingBlocks(ExitingBlocks);
2615 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2616 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2617 IVIncInsertPos = LatchBlock->getTerminator();
2618 return;
2619 }
2620
2621 // Otherwise treat this as a rotated loop.
2622 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2623 // Get the terminating condition for the loop if possible. If we
2624 // can, we want to change it to use a post-incremented version of its
2625 // induction variable, to allow coalescing the live ranges for the IV into
2626 // one register value.
2627
2628 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2629 if (!TermBr)
2630 continue;
2631 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2632 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2633 continue;
2634
2635 // Search IVUsesByStride to find Cond's IVUse if there is one.
2636 IVStrideUse *CondUse = nullptr;
2637 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2638 if (!FindIVUserForCond(Cond, CondUse))
2639 continue;
2640
2641 // If the trip count is computed in terms of a max (due to ScalarEvolution
2642 // being unable to find a sufficient guard, for example), change the loop
2643 // comparison to use SLT or ULT instead of NE.
2644 // One consequence of doing this now is that it disrupts the count-down
2645 // optimization. That's not always a bad thing though, because in such
2646 // cases it may still be worthwhile to avoid a max.
2647 Cond = OptimizeMax(Cond, CondUse);
2648
2649 // If this exiting block dominates the latch block, it may also use
2650 // the post-inc value if it won't be shared with other uses.
2651 // Check for dominance.
2652 if (!DT.dominates(ExitingBlock, LatchBlock))
2653 continue;
2654
2655 // Conservatively avoid trying to use the post-inc value in non-latch
2656 // exits if there may be pre-inc users in intervening blocks.
2657 if (LatchBlock != ExitingBlock)
2658 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2659 // Test if the use is reachable from the exiting block. This dominator
2660 // query is a conservative approximation of reachability.
2661 if (&*UI != CondUse &&
2662 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2663 // Conservatively assume there may be reuse if the quotient of their
2664 // strides could be a legal scale.
2665 const SCEV *A = IU.getStride(*CondUse, L);
2666 const SCEV *B = IU.getStride(*UI, L);
2667 if (!A || !B) continue;
2668 if (SE.getTypeSizeInBits(A->getType()) !=
2669 SE.getTypeSizeInBits(B->getType())) {
2670 if (SE.getTypeSizeInBits(A->getType()) >
2671 SE.getTypeSizeInBits(B->getType()))
2672 B = SE.getSignExtendExpr(B, A->getType());
2673 else
2674 A = SE.getSignExtendExpr(A, B->getType());
2675 }
2676 if (const SCEVConstant *D =
2677 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2678 const ConstantInt *C = D->getValue();
2679 // Stride of one or negative one can have reuse with non-addresses.
2680 if (C->isOne() || C->isMinusOne())
2681 goto decline_post_inc;
2682 // Avoid weird situations.
2683 if (C->getValue().getSignificantBits() >= 64 ||
2684 C->getValue().isMinSignedValue())
2685 goto decline_post_inc;
2686 // Check for possible scaled-address reuse.
2687 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2688 MemAccessTy AccessTy = getAccessType(
2689 TTI, UI->getUser(), UI->getOperandValToReplace());
2690 int64_t Scale = C->getSExtValue();
2691 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2692 /*BaseOffset=*/0,
2693 /*HasBaseReg=*/true, Scale,
2694 AccessTy.AddrSpace))
2695 goto decline_post_inc;
2696 Scale = -Scale;
2697 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2698 /*BaseOffset=*/0,
2699 /*HasBaseReg=*/true, Scale,
2700 AccessTy.AddrSpace))
2701 goto decline_post_inc;
2702 }
2703 }
2704 }
2705
2706 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2707 << *Cond << '\n');
2708
2709 // It's possible for the setcc instruction to be anywhere in the loop, and
2710 // possible for it to have multiple users. If it is not immediately before
2711 // the exiting block branch, move it.
2712 if (Cond->getNextNonDebugInstruction() != TermBr) {
2713 if (Cond->hasOneUse()) {
2714 Cond->moveBefore(TermBr);
2715 } else {
2716 // Clone the terminating condition and insert into the loopend.
2717 ICmpInst *OldCond = Cond;
2718 Cond = cast<ICmpInst>(Cond->clone());
2719 Cond->setName(L->getHeader()->getName() + ".termcond");
2720 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2721
2722 // Clone the IVUse, as the old use still exists!
2723 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2724 TermBr->replaceUsesOfWith(OldCond, Cond);
2725 }
2726 }
2727
2728 // If we get to here, we know that we can transform the setcc instruction to
2729 // use the post-incremented version of the IV, allowing us to coalesce the
2730 // live ranges for the IV correctly.
2731 CondUse->transformToPostInc(L);
2732 Changed = true;
2733
2734 PostIncs.insert(Cond);
2735 decline_post_inc:;
2736 }
2737
2738 // Determine an insertion point for the loop induction variable increment. It
2739 // must dominate all the post-inc comparisons we just set up, and it must
2740 // dominate the loop latch edge.
2741 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2742 for (Instruction *Inst : PostIncs)
2743 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2744}
2745
2746/// Determine if the given use can accommodate a fixup at the given offset and
2747/// other details. If so, update the use and return true.
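/// Illustrative numbers (not tied to any particular target): if LU currently
/// spans offsets [0, 64] and a new fixup needs offset 128, the use is widened
/// to [0, 128] only if an immediate of 128 - 0 is still considered
/// always-foldable below.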
2748bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2749 bool HasBaseReg, LSRUse::KindType Kind,
2750 MemAccessTy AccessTy) {
2751 Immediate NewMinOffset = LU.MinOffset;
2752 Immediate NewMaxOffset = LU.MaxOffset;
2753 MemAccessTy NewAccessTy = AccessTy;
2754
2755 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2756 // something conservative, however this can pessimize in the case that one of
2757 // the uses will have all its uses outside the loop, for example.
2758 if (LU.Kind != Kind)
2759 return false;
2760
2761 // Check for a mismatched access type, and fall back conservatively as needed.
2762 // TODO: Be less conservative when the type is similar and can use the same
2763 // addressing modes.
2764 if (Kind == LSRUse::Address) {
2765 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2766 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2767 AccessTy.AddrSpace);
2768 }
2769 }
2770
2771 // Conservatively assume HasBaseReg is true for now.
2772 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2773 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2774 LU.MaxOffset - NewOffset, HasBaseReg))
2775 return false;
2776 NewMinOffset = NewOffset;
2777 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2778 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2779 NewOffset - LU.MinOffset, HasBaseReg))
2780 return false;
2781 NewMaxOffset = NewOffset;
2782 }
2783
2784 // FIXME: We should be able to handle some level of scalable offset support
2785 // for 'void', but in order to get basic support up and running this is
2786 // being left out.
2787 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2788 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2789 return false;
2790
2791 // Update the use.
2792 LU.MinOffset = NewMinOffset;
2793 LU.MaxOffset = NewMaxOffset;
2794 LU.AccessTy = NewAccessTy;
2795 return true;
2796}
2797
2798/// Return an LSRUse index and an offset value for a fixup which needs the given
2799/// expression, with the given kind and optional access type. Either reuse an
2800/// existing use or create a new one, as needed.
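/// For example, with hypothetical expressions: fixups on {%base,+,4} and
/// {(32 + %base),+,4} can share one LSRUse keyed on {%base,+,4}, with the
/// second fixup carrying Offset = 32, provided an immediate of 32 is
/// always-foldable for this kind and access type.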
2801std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2802 LSRUse::KindType Kind,
2803 MemAccessTy AccessTy) {
2804 const SCEV *Copy = Expr;
2805 Immediate Offset = ExtractImmediate(Expr, SE);
2806
2807 // Basic uses can't accept any offset, for example.
2808 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2809 Offset, /*HasBaseReg=*/ true)) {
2810 Expr = Copy;
2811 Offset = Immediate::getFixed(0);
2812 }
2813
2814 std::pair<UseMapTy::iterator, bool> P =
2815 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2816 if (!P.second) {
2817 // A use already existed with this base.
2818 size_t LUIdx = P.first->second;
2819 LSRUse &LU = Uses[LUIdx];
2820 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2821 // Reuse this use.
2822 return std::make_pair(LUIdx, Offset);
2823 }
2824
2825 // Create a new use.
2826 size_t LUIdx = Uses.size();
2827 P.first->second = LUIdx;
2828 Uses.push_back(LSRUse(Kind, AccessTy));
2829 LSRUse &LU = Uses[LUIdx];
2830
2831 LU.MinOffset = Offset;
2832 LU.MaxOffset = Offset;
2833 return std::make_pair(LUIdx, Offset);
2834}
2835
2836/// Delete the given use from the Uses list.
2837void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2838 if (&LU != &Uses.back())
2839 std::swap(LU, Uses.back());
2840 Uses.pop_back();
2841
2842 // Update RegUses.
2843 RegUses.swapAndDropUse(LUIdx, Uses.size());
2844}
2845
2846/// Look for a use distinct from OrigLU which has a formula with the same
2847/// registers as the given formula.
2848LSRUse *
2849LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2850 const LSRUse &OrigLU) {
2851 // Search all uses for the formula. This could be more clever.
2852 for (LSRUse &LU : Uses) {
2853 // Check whether this use is close enough to OrigLU, to see whether it's
2854 // worthwhile looking through its formulae.
2855 // Ignore ICmpZero uses because they may contain formulae generated by
2856 // GenerateICmpZeroScales, in which case adding fixup offsets may
2857 // be invalid.
2858 if (&LU != &OrigLU &&
2859 LU.Kind != LSRUse::ICmpZero &&
2860 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2861 LU.WidestFixupType == OrigLU.WidestFixupType &&
2862 LU.HasFormulaWithSameRegs(OrigF)) {
2863 // Scan through this use's formulae.
2864 for (const Formula &F : LU.Formulae) {
2865 // Check to see if this formula has the same registers and symbols
2866 // as OrigF.
2867 if (F.BaseRegs == OrigF.BaseRegs &&
2868 F.ScaledReg == OrigF.ScaledReg &&
2869 F.BaseGV == OrigF.BaseGV &&
2870 F.Scale == OrigF.Scale &&
2871 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2872 if (F.BaseOffset.isZero())
2873 return &LU;
2874 // This is the formula where all the registers and symbols matched;
2875 // there aren't going to be any others. Since we declined it, we
2876 // can skip the rest of the formulae and proceed to the next LSRUse.
2877 break;
2878 }
2879 }
2880 }
2881 }
2882
2883 // Nothing looked good.
2884 return nullptr;
2885}
2886
2887void LSRInstance::CollectInterestingTypesAndFactors() {
2888 SmallSetVector<const SCEV *, 4> Strides;
2889
2890 // Collect interesting types and strides.
2891 SmallVector<const SCEV *, 4> Worklist;
2892 for (const IVStrideUse &U : IU) {
2893 const SCEV *Expr = IU.getExpr(U);
2894 if (!Expr)
2895 continue;
2896
2897 // Collect interesting types.
2898 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2899
2900 // Add strides for mentioned loops.
2901 Worklist.push_back(Expr);
2902 do {
2903 const SCEV *S = Worklist.pop_back_val();
2904 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2905 if (AR->getLoop() == L)
2906 Strides.insert(AR->getStepRecurrence(SE));
2907 Worklist.push_back(AR->getStart());
2908 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2909 append_range(Worklist, Add->operands());
2910 }
2911 } while (!Worklist.empty());
2912 }
2913
2914 // Compute interesting factors from the set of interesting strides.
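// For instance, with made-up strides of 4 and 8: 8 sdiv 4 is exactly 2, so 2
// is recorded as a factor; 4 sdiv 8 is not exact, so nothing is recorded in
// that direction.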
2915 for (SmallSetVector<const SCEV *, 4>::const_iterator
2916 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2917 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2918 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2919 const SCEV *OldStride = *I;
2920 const SCEV *NewStride = *NewStrideIter;
2921
2922 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2923 SE.getTypeSizeInBits(NewStride->getType())) {
2924 if (SE.getTypeSizeInBits(OldStride->getType()) >
2925 SE.getTypeSizeInBits(NewStride->getType()))
2926 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2927 else
2928 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2929 }
2930 if (const SCEVConstant *Factor =
2931 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2932 SE, true))) {
2933 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2934 Factors.insert(Factor->getAPInt().getSExtValue());
2935 } else if (const SCEVConstant *Factor =
2936 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2937 NewStride,
2938 SE, true))) {
2939 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2940 Factors.insert(Factor->getAPInt().getSExtValue());
2941 }
2942 }
2943
2944 // If all uses use the same type, don't bother looking for truncation-based
2945 // reuse.
2946 if (Types.size() == 1)
2947 Types.clear();
2948
2949 LLVM_DEBUG(print_factors_and_types(dbgs()));
2950}
2951
2952/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2953/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2954/// IVStrideUses, we could partially skip this.
2955static User::op_iterator
2956findIVOperand(User::op_iterator OI, User::op_iterator OE,
2957 Loop *L, ScalarEvolution &SE) {
2958 for(; OI != OE; ++OI) {
2959 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2960 if (!SE.isSCEVable(Oper->getType()))
2961 continue;
2962
2963 if (const SCEVAddRecExpr *AR =
2964 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2965 if (AR->getLoop() == L)
2966 break;
2967 }
2968 }
2969 }
2970 return OI;
2971}
2972
2973/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2974/// a convenient helper.
2975static Value *getWideOperand(Value *Oper) {
2976 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2977 return Trunc->getOperand(0);
2978 return Oper;
2979}
2980
2981/// Return an approximation of this SCEV expression's "base", or NULL for any
2982/// constant. Returning the expression itself is conservative. Returning a
2983/// deeper subexpression is more precise and valid as long as it isn't less
2984/// complex than another subexpression. For expressions involving multiple
2985/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2986/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2987/// IVInc==b-a.
2988///
2989/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2990/// SCEVUnknown, we simply return the rightmost SCEV operand.
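/// For example, for the hypothetical expression {(8 + %ptr),+,4}<%loop>, we
/// follow the addrec's start to (8 + %ptr), then walk its operands from the
/// right and return %ptr, the SCEVUnknown base.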
2991static const SCEV *getExprBase(const SCEV *S) {
2992 switch (S->getSCEVType()) {
2993 default: // including scUnknown.
2994 return S;
2995 case scConstant:
2996 case scVScale:
2997 return nullptr;
2998 case scTruncate:
2999 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3000 case scZeroExtend:
3001 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3002 case scSignExtend:
3003 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3004 case scAddExpr: {
3005 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3006 // there's nothing more complex.
3007 // FIXME: not sure if we want to recognize negation.
3008 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3009 for (const SCEV *SubExpr : reverse(Add->operands())) {
3010 if (SubExpr->getSCEVType() == scAddExpr)
3011 return getExprBase(SubExpr);
3012
3013 if (SubExpr->getSCEVType() != scMulExpr)
3014 return SubExpr;
3015 }
3016 return S; // all operands are scaled, be conservative.
3017 }
3018 case scAddRecExpr:
3019 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3020 }
3021 llvm_unreachable("Unknown SCEV kind!");
3022}
3023
3024/// Return true if the chain increment is profitable to expand into a loop
3025/// invariant value, which may require its own register. A profitable chain
3026/// increment will be an offset relative to the same base. We allow such offsets
3028/// to potentially be used as chain increments as long as they are not obviously
3028/// expensive to expand using real instructions.
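/// For example, with hypothetical operands: stepping from a user of %iv + 16
/// to a user of %iv + 24 is a constant increment of 8 and is acceptable,
/// whereas a non-constant increment is rejected when the candidate operand is
/// just a constant offset from the chain head.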
3029bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3030 const SCEV *IncExpr,
3031 ScalarEvolution &SE) {
3032 // Aggressively form chains when -stress-ivchain.
3033 if (StressIVChain)
3034 return true;
3035
3036 // Do not replace a constant offset from IV head with a nonconstant IV
3037 // increment.
3038 if (!isa<SCEVConstant>(IncExpr)) {
3039 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3040 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3041 return false;
3042 }
3043
3044 SmallPtrSet<const SCEV*, 8> Processed;
3045 return !isHighCostExpansion(IncExpr, Processed, SE);
3046}
3047
3048/// Return true if the number of registers needed for the chain is estimated to
3049/// be less than the number required for the individual IV users. First prohibit
3050/// any IV users that keep the IV live across increments (the Users set should
3051/// be empty). Next count the number and type of increments in the chain.
3052///
3053/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3054/// effectively use postinc addressing modes. Only consider it profitable if the
3055/// increments can be computed in fewer registers when chained.
3056///
3057/// TODO: Consider IVInc free if it's already used in other chains.
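/// A concrete, made-up example of the accounting below: a chain whose tail is
/// the header phi starts at cost 1, drops to 0 for completing the phi, and
/// drops to -1 if it also has more than one constant increment; -1 < 0, so
/// the chain is kept.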
3058static bool isProfitableChain(IVChain &Chain,
3059 SmallPtrSetImpl<Instruction *> &Users,
3060 ScalarEvolution &SE,
3061 const TargetTransformInfo &TTI) {
3062 if (StressIVChain)
3063 return true;
3064
3065 if (!Chain.hasIncs())
3066 return false;
3067
3068 if (!Users.empty()) {
3069 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3070 for (Instruction *Inst
3071 : Users) { dbgs() << " " << *Inst << "\n"; });
3072 return false;
3073 }
3074 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3075
3076 // The chain itself may require a register, so initialize cost to 1.
3077 int cost = 1;
3078
3079 // A complete chain likely eliminates the need for keeping the original IV in
3080 // a register. LSR does not currently know how to form a complete chain unless
3081 // the header phi already exists.
3082 if (isa<PHINode>(Chain.tailUserInst())
3083 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3084 --cost;
3085 }
3086 const SCEV *LastIncExpr = nullptr;
3087 unsigned NumConstIncrements = 0;
3088 unsigned NumVarIncrements = 0;
3089 unsigned NumReusedIncrements = 0;
3090
3091 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3092 return true;
3093
3094 for (const IVInc &Inc : Chain) {
3095 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3096 return true;
3097 if (Inc.IncExpr->isZero())
3098 continue;
3099
3100 // Incrementing by zero or some constant is neutral. We assume constants can
3101 // be folded into an addressing mode or an add's immediate operand.
3102 if (isa<SCEVConstant>(Inc.IncExpr)) {
3103 ++NumConstIncrements;
3104 continue;
3105 }
3106
3107 if (Inc.IncExpr == LastIncExpr)
3108 ++NumReusedIncrements;
3109 else
3110 ++NumVarIncrements;
3111
3112 LastIncExpr = Inc.IncExpr;
3113 }
3114 // An IV chain with a single increment is handled by LSR's postinc
3115 // uses. However, a chain with multiple increments requires keeping the IV's
3116 // value live longer than it needs to be if chained.
3117 if (NumConstIncrements > 1)
3118 --cost;
3119
3120 // Materializing increment expressions in the preheader that didn't exist in
3121 // the original code may cost a register. For example, sign-extended array
3122 // indices can produce ridiculous increments like this:
3123 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3124 cost += NumVarIncrements;
3125
3126 // Reusing variable increments likely saves a register to hold the multiple of
3127 // the stride.
3128 cost -= NumReusedIncrements;
3129
3130 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3131 << "\n");
3132
3133 return cost < 0;
3134}
3135
3136/// Add this IV user to an existing chain or make it the head of a new chain.
3137void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3138 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3139 // When IVs are used as types of varying widths, they are generally converted
3140 // to a wider type with some uses remaining narrow under a (free) trunc.
3141 Value *const NextIV = getWideOperand(IVOper);
3142 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3143 const SCEV *const OperExprBase = getExprBase(OperExpr);
3144
3145 // Visit all existing chains. Check if its IVOper can be computed as a
3146 // profitable loop invariant increment from the last link in the Chain.
3147 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3148 const SCEV *LastIncExpr = nullptr;
3149 for (; ChainIdx < NChains; ++ChainIdx) {
3150 IVChain &Chain = IVChainVec[ChainIdx];
3151
3152 // Prune the solution space aggressively by checking that both IV operands
3153 // are expressions that operate on the same unscaled SCEVUnknown. This
3154 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3155 // first avoids creating extra SCEV expressions.
3156 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3157 continue;
3158
3159 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3160 if (PrevIV->getType() != NextIV->getType())
3161 continue;
3162
3163 // A phi node terminates a chain.
3164 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3165 continue;
3166
3167 // The increment must be loop-invariant so it can be kept in a register.
3168 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3169 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3170 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3171 continue;
3172
3173 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3174 LastIncExpr = IncExpr;
3175 break;
3176 }
3177 }
3178 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3179 // bother for phi nodes, because they must be last in the chain.
3180 if (ChainIdx == NChains) {
3181 if (isa<PHINode>(UserInst))
3182 return;
3183 if (NChains >= MaxChains && !StressIVChain) {
3184 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3185 return;
3186 }
3187 LastIncExpr = OperExpr;
3188 // IVUsers may have skipped over sign/zero extensions. We don't currently
3189 // attempt to form chains involving extensions unless they can be hoisted
3190 // into this loop's AddRec.
3191 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3192 return;
3193 ++NChains;
3194 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3195 OperExprBase));
3196 ChainUsersVec.resize(NChains);
3197 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3198 << ") IV=" << *LastIncExpr << "\n");
3199 } else {
3200 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3201 << ") IV+" << *LastIncExpr << "\n");
3202 // Add this IV user to the end of the chain.
3203 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3204 }
3205 IVChain &Chain = IVChainVec[ChainIdx];
3206
3207 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3208 // This chain's NearUsers become FarUsers.
3209 if (!LastIncExpr->isZero()) {
3210 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3211 NearUsers.end());
3212 NearUsers.clear();
3213 }
3214
3215 // All other uses of IVOperand become near uses of the chain.
3216 // We currently ignore intermediate values within SCEV expressions, assuming
3217 // they will eventually be used by the current chain, or can be computed
3218 // from one of the chain increments. To be more precise we could
3219 // transitively follow its users and only add leaf IV users to the set.
3220 for (User *U : IVOper->users()) {
3221 Instruction *OtherUse = dyn_cast<Instruction>(U);
3222 if (!OtherUse)
3223 continue;
3224 // Uses in the chain will no longer be uses if the chain is formed.
3225 // Include the head of the chain in this iteration (not Chain.begin()).
3226 IVChain::const_iterator IncIter = Chain.Incs.begin();
3227 IVChain::const_iterator IncEnd = Chain.Incs.end();
3228 for (; IncIter != IncEnd; ++IncIter) {
3229 if (IncIter->UserInst == OtherUse)
3230 break;
3231 }
3232 if (IncIter != IncEnd)
3233 continue;
3234
3235 if (SE.isSCEVable(OtherUse->getType())
3236 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3237 && IU.isIVUserOrOperand(OtherUse)) {
3238 continue;
3239 }
3240 NearUsers.insert(OtherUse);
3241 }
3242
3243 // Since this user is part of the chain, it's no longer considered a use
3244 // of the chain.
3245 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3246}
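// For illustration (hypothetical names): a link is added when one IV operand
// differs from the previous link's operand by a loop-invariant SCEV:
//   %i     = phi i64 ...          ; SCEV {0,+,1}<%L>
//   %i.off = add i64 %i, %n       ; %n loop-invariant
// If one user's IV operand is %i and a later user's is %i.off, then
// getMinusSCEV(SCEV(%i.off), SCEV(%i)) == %n is loop-invariant, so the later
// user can be appended to the chain with IncExpr = %n.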
3247
3248/// Populate the vector of Chains.
3249///
3250/// This decreases ILP at the architecture level. Targets with ample registers,
3251/// multiple memory ports, and no register renaming probably don't want
3252/// this. However, such targets should probably disable LSR altogether.
3253///
3254/// The job of LSR is to make a reasonable choice of induction variables across
3255/// the loop. Subsequent passes can easily "unchain" computation exposing more
3256/// ILP *within the loop* if the target wants it.
3257///
3258/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3259/// will not reorder memory operations, it will recognize this as a chain, but
3260/// will generate redundant IV increments. Ideally this would be corrected later
3261/// by a smart scheduler:
3262/// = A[i]
3263/// = A[i+x]
3264/// A[i] =
3265/// A[i+x] =
3266///
3267/// TODO: Walk the entire domtree within this loop, not just the path to the
3268/// loop latch. This will discover chains on side paths, but requires
3269/// maintaining multiple copies of the Chains state.
3270void LSRInstance::CollectChains() {
3271 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3272 SmallVector<ChainUsers, 8> ChainUsersVec;
3273
3274 SmallVector<BasicBlock *, 8> LatchPath;
3275 BasicBlock *LoopHeader = L->getHeader();
3276 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3277 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3278 LatchPath.push_back(Rung->getBlock());
3279 }
3280 LatchPath.push_back(LoopHeader);
3281
3282 // Walk the instruction stream from the loop header to the loop latch.
3283 for (BasicBlock *BB : reverse(LatchPath)) {
3284 for (Instruction &I : *BB) {
3285 // Skip instructions that weren't seen by IVUsers analysis.
3286 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3287 continue;
3288
3289 // Ignore users that are part of a SCEV expression. This way we only
3290 // consider leaf IV Users. This effectively rediscovers a portion of
3291 // IVUsers analysis but in program order this time.
3292 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3293 continue;
3294
3295 // Remove this instruction from any NearUsers set it may be in.
3296 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3297 ChainIdx < NChains; ++ChainIdx) {
3298 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3299 }
3300 // Search for operands that can be chained.
3301 SmallPtrSet<Instruction*, 4> UniqueOperands;
3302 User::op_iterator IVOpEnd = I.op_end();
3303 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3304 while (IVOpIter != IVOpEnd) {
3305 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3306 if (UniqueOperands.insert(IVOpInst).second)
3307 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3308 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3309 }
3310 } // Continue walking down the instructions.
3311 } // Continue walking down the domtree.
3312 // Visit phi backedges to determine if the chain can generate the IV postinc.
3313 for (PHINode &PN : L->getHeader()->phis()) {
3314 if (!SE.isSCEVable(PN.getType()))
3315 continue;
3316
3317 Instruction *IncV =
3318 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3319 if (IncV)
3320 ChainInstruction(&PN, IncV, ChainUsersVec);
3321 }
3322 // Remove any unprofitable chains.
3323 unsigned ChainIdx = 0;
3324 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3325 UsersIdx < NChains; ++UsersIdx) {
3326 if (!isProfitableChain(IVChainVec[UsersIdx],
3327 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3328 continue;
3329 // Preserve the chain at UsersIdx.
3330 if (ChainIdx != UsersIdx)
3331 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3332 FinalizeChain(IVChainVec[ChainIdx]);
3333 ++ChainIdx;
3334 }
3335 IVChainVec.resize(ChainIdx);
3336}
3337
3338void LSRInstance::FinalizeChain(IVChain &Chain) {
3339 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3340 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3341
3342 for (const IVInc &Inc : Chain) {
3343 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3344 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3345 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3346 IVIncSet.insert(UseI);
3347 }
3348}
3349
3350/// Return true if the IVInc can be folded into an addressing mode.
3351static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3352 Value *Operand, const TargetTransformInfo &TTI) {
3353 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3354 Immediate IncOffset = Immediate::getZero();
3355 if (IncConst) {
3356 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3357 return false;
3358 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3359 } else {
3360 // Look for mul(vscale, constant), to detect a scalable offset.
3361 auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3362 if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3363 !isa<SCEVVScale>(IncVScale->getOperand(1)))
3364 return false;
3365 auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3366 if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3367 return false;
3368 IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
3369 }
3370
3371 if (!isAddressUse(TTI, UserInst, Operand))
3372 return false;
3373
3374 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3375 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3376 IncOffset, /*HasBaseReg=*/false))
3377 return false;
3378
3379 return true;
3380}
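// For illustration, two increments this predicate can accept, assuming the
// target reports the resulting offset as legal for the access:
//   IncExpr = 16            --> Immediate::getFixed(16)
//   IncExpr = (8 * vscale)  --> Immediate::getScalable(8)
// A loop-invariant but non-constant increment is rejected before the
// addressing-mode query is made.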
3381
3382/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3383/// user's operand from the previous IV user's operand.
3384void LSRInstance::GenerateIVChain(const IVChain &Chain,
3385 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3386 // Find the new IVOperand for the head of the chain. It may have been replaced
3387 // by LSR.
3388 const IVInc &Head = Chain.Incs[0];
3389 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3390 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3391 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3392 IVOpEnd, L, SE);
3393 Value *IVSrc = nullptr;
3394 while (IVOpIter != IVOpEnd) {
3395 IVSrc = getWideOperand(*IVOpIter);
3396
3397 // If this operand computes the expression that the chain needs, we may use
3398 // it. (Check this after setting IVSrc which is used below.)
3399 //
3400 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3401 // narrow for the chain, so we can no longer use it. We do allow using a
3402 // wider phi, assuming the LSR checked for free truncation. In that case we
3403 // should already have a truncate on this operand such that
3404 // getSCEV(IVSrc) == IncExpr.
3405 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3406 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3407 break;
3408 }
3409 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3410 }
3411 if (IVOpIter == IVOpEnd) {
3412 // Gracefully give up on this chain.
3413 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3414 return;
3415 }
3416 assert(IVSrc && "Failed to find IV chain source");
3417
3418 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3419 Type *IVTy = IVSrc->getType();
3420 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3421 const SCEV *LeftOverExpr = nullptr;
3422 const SCEV *Accum = SE.getZero(IntTy);
3423 SmallVector<std::pair<const SCEV *, Value *>> Bases;
3424 Bases.emplace_back(Accum, IVSrc);
3425
3426 for (const IVInc &Inc : Chain) {
3427 Instruction *InsertPt = Inc.UserInst;
3428 if (isa<PHINode>(InsertPt))
3429 InsertPt = L->getLoopLatch()->getTerminator();
3430
3431 // IVOper will replace the current IV User's operand. IVSrc is the IV
3432 // value currently held in a register.
3433 Value *IVOper = IVSrc;
3434 if (!Inc.IncExpr->isZero()) {
3435 // IncExpr was the result of subtraction of two narrow values, so must
3436 // be signed.
3437 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3438 Accum = SE.getAddExpr(Accum, IncExpr);
3439 LeftOverExpr = LeftOverExpr ?
3440 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3441 }
3442
3443 // Look through each base to see if any can produce a nice addressing mode.
3444 bool FoundBase = false;
3445 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3446 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3447 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3448 if (!Remainder->isZero()) {
3449 Rewriter.clearPostInc();
3450 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3451 const SCEV *IVOperExpr =
3452 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3453 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3454 } else {
3455 IVOper = MapIVOper;
3456 }
3457
3458 FoundBase = true;
3459 break;
3460 }
3461 }
3462 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3463 // Expand the IV increment.
3464 Rewriter.clearPostInc();
3465 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3466 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3467 SE.getUnknown(IncV));
3468 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3469
3470 // If an IV increment can't be folded, use it as the next IV value.
3471 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3472 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3473 Bases.emplace_back(Accum, IVOper);
3474 IVSrc = IVOper;
3475 LeftOverExpr = nullptr;
3476 }
3477 }
3478 Type *OperTy = Inc.IVOperand->getType();
3479 if (IVTy != OperTy) {
3480 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3481 "cannot extend a chained IV");
3482 IRBuilder<> Builder(InsertPt);
3483 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3484 }
3485 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3486 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3487 DeadInsts.emplace_back(OperandIsInstr);
3488 }
3489 // If LSR created a new, wider phi, we may also replace its postinc. We only
3490 // do this if we also found a wide value for the head of the chain.
3491 if (isa<PHINode>(Chain.tailUserInst())) {
3492 for (PHINode &Phi : L->getHeader()->phis()) {
3493 if (Phi.getType() != IVSrc->getType())
3494 continue;
3495 Instruction *PostIncV = dyn_cast<Instruction>(
3496 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3497 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3498 continue;
3499 Value *IVOper = IVSrc;
3500 Type *PostIncTy = PostIncV->getType();
3501 if (IVTy != PostIncTy) {
3502 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3503 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3504 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3505 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3506 }
3507 Phi.replaceUsesOfWith(PostIncV, IVOper);
3508 DeadInsts.emplace_back(PostIncV);
3509 }
3510 }
3511}
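// For illustration (hypothetical names): if a chain link's operand %idx has
// SCEV equal to SCEV(%iv) + %n, the code above expands the increment as
//   %chain = add i64 %iv, %n
// just before the link's user, replaces the user's %idx operand with %chain,
// and queues the old %idx computation for dead-code cleanup.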
3512
3513void LSRInstance::CollectFixupsAndInitialFormulae() {
3514 BranchInst *ExitBranch = nullptr;
3515 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3516
3517 // For calculating baseline cost
3518 SmallPtrSet<const SCEV *, 16> Regs;
3519 DenseSet<const SCEV *> VisitedRegs;
3520 DenseSet<size_t> VisitedLSRUse;
3521
3522 for (const IVStrideUse &U : IU) {
3523 Instruction *UserInst = U.getUser();
3524 // Skip IV users that are part of profitable IV Chains.
3525 User::op_iterator UseI =
3526 find(UserInst->operands(), U.getOperandValToReplace());
3527 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3528 if (IVIncSet.count(UseI)) {
3529 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3530 continue;
3531 }
3532
3533 LSRUse::KindType Kind = LSRUse::Basic;
3534 MemAccessTy AccessTy;
3535 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3536 Kind = LSRUse::Address;
3537 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3538 }
3539
3540 const SCEV *S = IU.getExpr(U);
3541 if (!S)
3542 continue;
3543 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3544
3545 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3546 // (N - i == 0), and this allows (N - i) to be the expression that we work
3547 // with rather than just N or i, so we can consider the register
3548 // requirements for both N and i at the same time. Limiting this code to
3549 // equality icmps is not a problem because all interesting loops use
3550 // equality icmps, thanks to IndVarSimplify.
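// For illustration: for a latch comparison `icmp eq i64 %i.next, %n` where the
// use expression S is the IV's recurrence {0,+,1}<%L> (after post-inc
// normalization) and %n is loop-invariant, the code below sets Kind = ICmpZero
// and S = getMinusSCEV(%n, S) = {%n,+,-1}<%L>, letting LSR consider a single
// down-counting register that reaches zero at the exit.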
3551 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3552 // If CI can be handled by the target (for example, replaced by a hardware
3553 // loop on PowerPC), there is no need to generate initial formulae for it.
3554 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3555 continue;
3556 if (CI->isEquality()) {
3557 // Swap the operands if needed to put the OperandValToReplace on the
3558 // left, for consistency.
3559 Value *NV = CI->getOperand(1);
3560 if (NV == U.getOperandValToReplace()) {
3561 CI->setOperand(1, CI->getOperand(0));
3562 CI->setOperand(0, NV);
3563 NV = CI->getOperand(1);
3564 Changed = true;
3565 }
3566
3567 // x == y --> x - y == 0
3568 const SCEV *N = SE.getSCEV(NV);
3569 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3570 (!NV->getType()->isPointerTy() ||
3571 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3572 // S is normalized, so normalize N before folding it into S
3573 // to keep the result normalized.
3574 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3575 if (!N)
3576 continue;
3577 Kind = LSRUse::ICmpZero;
3578 S = SE.getMinusSCEV(N, S);
3579 } else if (L->isLoopInvariant(NV) &&
3580 (!isa<Instruction>(NV) ||
3581 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3582 !NV->getType()->isPointerTy()) {
3583 // If we can't generally expand the expression (e.g. it contains
3584 // a divide), but it is already at a loop invariant point before the
3585 // loop, wrap it in an unknown (to prevent the expander from trying
3586 // to re-expand in a potentially unsafe way.) The restriction to
3587 // integer types is required because the unknown hides the base, and
3588 // SCEV can't compute the difference of two unknown pointers.
3589 N = SE.getUnknown(NV);
3590 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3591 if (!N)
3592 continue;
3593 Kind = LSRUse::ICmpZero;
3594 S = SE.getMinusSCEV(N, S);
3595 assert(!isa<SCEVCouldNotCompute>(S));
3596 }
3597
3598 // -1 and the negations of all interesting strides (except the negation
3599 // of -1) are now also interesting.
3600 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3601 if (Factors[i] != -1)
3602 Factors.insert(-(uint64_t)Factors[i]);
3603 Factors.insert(-1);
3604 }
3605 }
3606
3607 // Get or create an LSRUse.
3608 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3609 size_t LUIdx = P.first;
3610 Immediate Offset = P.second;
3611 LSRUse &LU = Uses[LUIdx];
3612
3613 // Record the fixup.
3614 LSRFixup &LF = LU.getNewFixup();
3615 LF.UserInst = UserInst;
3616 LF.OperandValToReplace = U.getOperandValToReplace();
3617 LF.PostIncLoops = TmpPostIncLoops;
3618 LF.Offset = Offset;
3619 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3620
3621 // Create SCEV as Formula for calculating baseline cost
3622 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3623 Formula F;
3624 F.initialMatch(S, L, SE);
3625 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3626 VisitedLSRUse.insert(LUIdx);
3627 }
3628
3629 if (!LU.WidestFixupType ||
3630 SE.getTypeSizeInBits(LU.WidestFixupType) <
3631 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3632 LU.WidestFixupType = LF.OperandValToReplace->getType();
3633
3634 // If this is the first use of this LSRUse, give it a formula.
3635 if (LU.Formulae.empty()) {
3636 InsertInitialFormula(S, LU, LUIdx);
3637 CountRegisters(LU.Formulae.back(), LUIdx);
3638 }
3639 }
3640
3641 LLVM_DEBUG(print_fixups(dbgs()));
3642}
3643
3644/// Insert a formula for the given expression into the given use, separating out
3645/// loop-variant portions from loop-invariant and loop-computable portions.
3646void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3647 size_t LUIdx) {
3648 // Mark uses whose expressions cannot be expanded.
3649 if (!Rewriter.isSafeToExpand(S))
3650 LU.RigidFormula = true;
3651
3652 Formula F;
3653 F.initialMatch(S, L, SE);
3654 bool Inserted = InsertFormula(LU, LUIdx, F);
3655 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3656}
3657
3658/// Insert a simple single-register formula for the given expression into the
3659/// given use.
3660void
3661LSRInstance::InsertSupplementalFormula(const SCEV *S,
3662 LSRUse &LU, size_t LUIdx) {
3663 Formula F;
3664 F.BaseRegs.push_back(S);
3665 F.HasBaseReg = true;
3666 bool Inserted = InsertFormula(LU, LUIdx, F);
3667 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3668}
3669
3670/// Note which registers are used by the given formula, updating RegUses.
3671void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3672 if (F.ScaledReg)
3673 RegUses.countRegister(F.ScaledReg, LUIdx);
3674 for (const SCEV *BaseReg : F.BaseRegs)
3675 RegUses.countRegister(BaseReg, LUIdx);
3676}
3677
3678/// If the given formula has not yet been inserted, add it to the list, and
3679/// return true. Return false otherwise.
3680bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3681 // Do not insert formula that we will not be able to expand.
3682 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3683 "Formula is illegal");
3684
3685 if (!LU.InsertFormula(F, *L))
3686 return false;
3687
3688 CountRegisters(F, LUIdx);
3689 return true;
3690}
3691
3692/// Check for other uses of loop-invariant values which we're tracking. These
3693/// other uses will pin these values in registers, making them less profitable
3694/// for elimination.
3695/// TODO: This currently misses non-constant addrec step registers.
3696/// TODO: Should this give more weight to users inside the loop?
3697void
3698LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3699 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3700 SmallPtrSet<const SCEV *, 32> Visited;
3701
3702 // Don't collect outside uses if we are favoring postinc - the instructions in
3703 // the loop are more important than the ones outside of it.
3704 if (AMK == TTI::AMK_PostIndexed)
3705 return;
3706
3707 while (!Worklist.empty()) {
3708 const SCEV *S = Worklist.pop_back_val();
3709
3710 // Don't process the same SCEV twice
3711 if (!Visited.insert(S).second)
3712 continue;
3713
3714 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3715 append_range(Worklist, N->operands());
3716 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3717 Worklist.push_back(C->getOperand());
3718 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3719 Worklist.push_back(D->getLHS());
3720 Worklist.push_back(D->getRHS());
3721 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3722 const Value *V = US->getValue();
3723 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3724 // Look for instructions defined outside the loop.
3725 if (L->contains(Inst)) continue;
3726 } else if (isa<Constant>(V))
3727 // Constants can be re-materialized.
3728 continue;
3729 for (const Use &U : V->uses()) {
3730 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3731 // Ignore non-instructions.
3732 if (!UserInst)
3733 continue;
3734 // Don't bother if the instruction is an EHPad.
3735 if (UserInst->isEHPad())
3736 continue;
3737 // Ignore instructions in other functions (as can happen with
3738 // Constants).
3739 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3740 continue;
3741 // Ignore instructions not dominated by the loop.
3742 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3743 UserInst->getParent() :
3744 cast<PHINode>(UserInst)->getIncomingBlock(
3745 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3746 if (!DT.dominates(L->getHeader(), UseBB))
3747 continue;
3748 // Don't bother if the instruction is in a BB which ends in an EHPad.
3749 if (UseBB->getTerminator()->isEHPad())
3750 continue;
3751
3752 // Ignore cases in which the currently-examined value could come from
3753 // a basic block terminated with an EHPad. This checks all incoming
3754 // blocks of the phi node since it is possible that the same incoming
3755 // value comes from multiple basic blocks, only some of which may end
3756 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3757 // pass would try to insert instructions into an EHPad, hitting an
3758 // assertion.
3759 if (isa<PHINode>(UserInst)) {
3760 const auto *PhiNode = cast<PHINode>(UserInst);
3761 bool HasIncompatibleEHPTerminatedBlock = false;
3762 llvm::Value *ExpectedValue = U;
3763 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3764 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3765 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3766 HasIncompatibleEHPTerminatedBlock = true;
3767 break;
3768 }
3769 }
3770 }
3771 if (HasIncompatibleEHPTerminatedBlock) {
3772 continue;
3773 }
3774 }
3775
3776 // Don't bother rewriting PHIs in catchswitch blocks.
3777 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3778 continue;
3779 // Ignore uses which are part of other SCEV expressions, to avoid
3780 // analyzing them multiple times.
3781 if (SE.isSCEVable(UserInst->getType())) {
3782 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3783 // If the user is a no-op, look through to its uses.
3784 if (!isa<SCEVUnknown>(UserS))
3785 continue;
3786 if (UserS == US) {
3787 Worklist.push_back(
3788 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3789 continue;
3790 }
3791 }
3792 // Ignore icmp instructions which are already being analyzed.
3793 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3794 unsigned OtherIdx = !U.getOperandNo();
3795 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3796 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3797 continue;
3798 }
3799
3800 std::pair<size_t, Immediate> P =
3801 getUse(S, LSRUse::Basic, MemAccessTy());
3802 size_t LUIdx = P.first;
3803 Immediate Offset = P.second;
3804 LSRUse &LU = Uses[LUIdx];
3805 LSRFixup &LF = LU.getNewFixup();
3806 LF.UserInst = const_cast<Instruction *>(UserInst);
3807 LF.OperandValToReplace = U;
3808 LF.Offset = Offset;
3809 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3810 if (!LU.WidestFixupType ||
3811 SE.getTypeSizeInBits(LU.WidestFixupType) <
3812 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3813 LU.WidestFixupType = LF.OperandValToReplace->getType();
3814 InsertSupplementalFormula(US, LU, LUIdx);
3815 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3816 break;
3817 }
3818 }
3819 }
3820}
3821
3822/// Split S into subexpressions which can be pulled out into separate
3823/// registers. If C is non-null, multiply each subexpression by C.
3824///
3825/// Return remainder expression after factoring the subexpressions captured by
3826/// Ops. If Ops is complete, return NULL.
3827static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3828 SmallVectorImpl<const SCEV *> &Ops,
3829 const Loop *L,
3830 ScalarEvolution &SE,
3831 unsigned Depth = 0) {
3832 // Arbitrarily cap recursion to protect compile time.
3833 if (Depth >= 3)
3834 return S;
3835
3836 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3837 // Break out add operands.
3838 for (const SCEV *S : Add->operands()) {
3839 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3840 if (Remainder)
3841 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3842 }
3843 return nullptr;
3844 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3845 // Split a non-zero base out of an addrec.
3846 if (AR->getStart()->isZero() || !AR->isAffine())
3847 return S;
3848
3849 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3850 C, Ops, L, SE, Depth+1);
3851 // Split the non-zero AddRec unless it is part of a nested recurrence that
3852 // does not pertain to this loop.
3853 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3854 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3855 Remainder = nullptr;
3856 }
3857 if (Remainder != AR->getStart()) {
3858 if (!Remainder)
3859 Remainder = SE.getConstant(AR->getType(), 0);
3860 return SE.getAddRecExpr(Remainder,
3861 AR->getStepRecurrence(SE),
3862 AR->getLoop(),
3863 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3864 SCEV::FlagAnyWrap);
3865 }
3866 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3867 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3868 if (Mul->getNumOperands() != 2)
3869 return S;
3870 if (const SCEVConstant *Op0 =
3871 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3872 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3873 const SCEV *Remainder =
3874 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3875 if (Remainder)
3876 Ops.push_back(SE.getMulExpr(C, Remainder));
3877 return nullptr;
3878 }
3879 }
3880 return S;
3881}
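// For illustration: with S = {(%a + 4),+,8}<%L> and C == nullptr, the addrec
// case above recurses into the start value, pushes %a and 4 onto Ops, and
// returns the stripped recurrence {0,+,8}<%L> as the remainder for the caller
// to keep.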
3882
3883/// Return true if the SCEV represents a value that may end up as a
3884/// post-increment operation.
3885 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3886 LSRUse &LU, const SCEV *S, const Loop *L,
3887 ScalarEvolution &SE) {
3888 if (LU.Kind != LSRUse::Address ||
3889 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3890 return false;
3891 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3892 if (!AR)
3893 return false;
3894 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3895 if (!isa<SCEVConstant>(LoopStep))
3896 return false;
3897 // Check if a post-indexed load/store can be used.
3898 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3899 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3900 const SCEV *LoopStart = AR->getStart();
3901 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3902 return true;
3903 }
3904 return false;
3905}
3906
3907/// Helper function for LSRInstance::GenerateReassociations.
3908void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3909 const Formula &Base,
3910 unsigned Depth, size_t Idx,
3911 bool IsScaledReg) {
3912 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3913 // Don't generate reassociations for the base register of a value that
3914 // may generate a post-increment operator. The reason is that the
3915 // reassociations cause extra base+register formulae to be created,
3916 // and possibly chosen, but the post-increment is more efficient.
3917 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3918 return;
3919 SmallVector<const SCEV *, 8> AddOps;
3920 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3921 if (Remainder)
3922 AddOps.push_back(Remainder);
3923
3924 if (AddOps.size() == 1)
3925 return;
3926
3927 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3928 JE = AddOps.end();
3929 J != JE; ++J) {
3930 // Loop-variant "unknown" values are uninteresting; we won't be able to
3931 // do anything meaningful with them.
3932 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3933 continue;
3934
3935 // Don't pull a constant into a register if the constant could be folded
3936 // into an immediate field.
3937 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3938 LU.AccessTy, *J, Base.getNumRegs() > 1))
3939 continue;
3940
3941 // Collect all operands except *J.
3942 SmallVector<const SCEV *, 8> InnerAddOps(
3943 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3944 InnerAddOps.append(std::next(J),
3945 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3946
3947 // Don't leave just a constant behind in a register if the constant could
3948 // be folded into an immediate field.
3949 if (InnerAddOps.size() == 1 &&
3950 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3951 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3952 continue;
3953
3954 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3955 if (InnerSum->isZero())
3956 continue;
3957 Formula F = Base;
3958
3959 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3960 continue;
3961
3962 // Add the remaining pieces of the add back into the new formula.
3963 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3964 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3965 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3966 InnerSumSC->getValue()->getZExtValue())) {
3967 F.UnfoldedOffset =
3968 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3969 InnerSumSC->getValue()->getZExtValue());
3970 if (IsScaledReg)
3971 F.ScaledReg = nullptr;
3972 else
3973 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3974 } else if (IsScaledReg)
3975 F.ScaledReg = InnerSum;
3976 else
3977 F.BaseRegs[Idx] = InnerSum;
3978
3979 // Add J as its own register, or an unfolded immediate.
3980 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3981 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3982 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3983 SC->getValue()->getZExtValue()))
3984 F.UnfoldedOffset =
3985 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3986 SC->getValue()->getZExtValue());
3987 else
3988 F.BaseRegs.push_back(*J);
3989 // We may have changed the number of registers in the base regs; adjust the
3990 // formula accordingly.
3991 F.canonicalize(*L);
3992
3993 if (InsertFormula(LU, LUIdx, F))
3994 // If that formula hadn't been seen before, recurse to find more like
3995 // it.
3996 // Add Log16(AddOps.size()) to Depth (the same as Log2_32(AddOps.size()) >> 2),
3997 // because Depth alone is not enough to bound compile time.
3998 // This means that every time AddOps.size() exceeds 16^x we will add
3999 // x to Depth.
4000 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4001 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4002 }
4003}
4004
4005/// Split out subexpressions from adds and the bases of addrecs.
4006void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4007 Formula Base, unsigned Depth) {
4008 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4009 // Arbitrarily cap recursion to protect compile time.
4010 if (Depth >= 3)
4011 return;
4012
4013 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4014 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4015
4016 if (Base.Scale == 1)
4017 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4018 /* Idx */ -1, /* IsScaledReg */ true);
4019}
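// For illustration: for a base register {(%a + 4),+,8}<%L>, CollectSubexprs
// yields the operands {%a, 4, {0,+,8}<%L>}. Taking J = %a, for example,
// produces a formula whose registers are {4,+,8}<%L> (the sum of the remaining
// operands) plus %a as its own base register; a constant operand may instead
// end up in the formula's unfolded offset when the target accepts it as an add
// immediate.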
4020
4021/// Generate a formula consisting of all of the loop-dominating registers added
4022/// into a single register.
4023void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4024 Formula Base) {
4025 // This method is only interesting when more than one register is involved.
4026 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4027 (Base.UnfoldedOffset.isNonZero()) <=
4028 1)
4029 return;
4030
4031 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4032 // processing the formula.
4033 Base.unscale();
4034 SmallVector<const SCEV *, 4> Ops;
4035 Formula NewBase = Base;
4036 NewBase.BaseRegs.clear();
4037 Type *CombinedIntegerType = nullptr;
4038 for (const SCEV *BaseReg : Base.BaseRegs) {
4039 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4040 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4041 if (!CombinedIntegerType)
4042 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4043 Ops.push_back(BaseReg);
4044 }
4045 else
4046 NewBase.BaseRegs.push_back(BaseReg);
4047 }
4048
4049 // If no register is relevant, we're done.
4050 if (Ops.size() == 0)
4051 return;
4052
4053 // Utility function for generating the required variants of the combined
4054 // registers.
4055 auto GenerateFormula = [&](const SCEV *Sum) {
4056 Formula F = NewBase;
4057
4058 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4059 // opportunity to fold something. For now, just ignore such cases
4060 // rather than proceed with zero in a register.
4061 if (Sum->isZero())
4062 return;
4063
4064 F.BaseRegs.push_back(Sum);
4065 F.canonicalize(*L);
4066 (void)InsertFormula(LU, LUIdx, F);
4067 };
4068
4069 // If we collected at least two registers, generate a formula combining them.
4070 if (Ops.size() > 1) {
4071 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4072 GenerateFormula(SE.getAddExpr(OpsCopy));
4073 }
4074
4075 // If we have an unfolded offset, generate a formula combining it with the
4076 // registers collected.
4077 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4078 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4079 Ops.push_back(SE.getConstant(CombinedIntegerType,
4080 NewBase.UnfoldedOffset.getFixedValue(), true));
4081 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4082 GenerateFormula(SE.getAddExpr(Ops));
4083 }
4084}
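// For illustration: if a formula has base registers %inv1, %inv2 and
// {0,+,8}<%L>, where %inv1 and %inv2 are loop-invariant values that properly
// dominate the header, the code above adds a formula with base registers
// (%inv1 + %inv2) and {0,+,8}<%L>, trading an add outside the loop for one
// fewer register inside it.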
4085
4086/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4087void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4088 const Formula &Base, size_t Idx,
4089 bool IsScaledReg) {
4090 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4091 GlobalValue *GV = ExtractSymbol(G, SE);
4092 if (G->isZero() || !GV)
4093 return;
4094 Formula F = Base;
4095 F.BaseGV = GV;
4096 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4097 return;
4098 if (IsScaledReg)
4099 F.ScaledReg = G;
4100 else
4101 F.BaseRegs[Idx] = G;
4102 (void)InsertFormula(LU, LUIdx, F);
4103}
4104
4105/// Generate reuse formulae using symbolic offsets.
4106void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4107 Formula Base) {
4108 // We can't add a symbolic offset if the address already contains one.
4109 if (Base.BaseGV) return;
4110
4111 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4112 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4113 if (Base.Scale == 1)
4114 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4115 /* IsScaledReg */ true);
4116}
4117
4118/// Helper function for LSRInstance::GenerateConstantOffsets.
4119void LSRInstance::GenerateConstantOffsetsImpl(
4120 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4121 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4122
4123 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4124 Formula F = Base;
4125 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4126 return;
4127 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4128
4129 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4130 // Add the offset to the base register.
4131 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4132 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4133 // If it cancelled out, drop the base register, otherwise update it.
4134 if (NewG->isZero()) {
4135 if (IsScaledReg) {
4136 F.Scale = 0;
4137 F.ScaledReg = nullptr;
4138 } else
4139 F.deleteBaseReg(F.BaseRegs[Idx]);
4140 F.canonicalize(*L);
4141 } else if (IsScaledReg)
4142 F.ScaledReg = NewG;
4143 else
4144 F.BaseRegs[Idx] = NewG;
4145
4146 (void)InsertFormula(LU, LUIdx, F);
4147 }
4148 };
4149
4150 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4151
4152 // With constant offsets and constant steps, we can generate pre-inc
4153 // accesses by having the offset equal the step. So, for access #0 with a
4154 // step of 8, we generate a G - 8 base which would require the first access
4155 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4156 // for itself and hopefully becomes the base for other accesses. This means
4157 // that a single pre-indexed access can be generated to become the new
4158 // base pointer for each iteration of the loop, resulting in no extra add/sub
4159 // instructions for pointer updating.
4160 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4161 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
4162 if (auto *StepRec =
4163 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
4164 const APInt &StepInt = StepRec->getAPInt();
4165 int64_t Step = StepInt.isNegative() ?
4166 StepInt.getSExtValue() : StepInt.getZExtValue();
4167
4168 for (Immediate Offset : Worklist) {
4169 if (Offset.isFixed()) {
4170 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4171 GenerateOffset(G, Offset);
4172 }
4173 }
4174 }
4175 }
4176 }
4177 for (Immediate Offset : Worklist)
4178 GenerateOffset(G, Offset);
4179
4180 Immediate Imm = ExtractImmediate(G, SE);
4181 if (G->isZero() || Imm.isZero() ||
4182 !Base.BaseOffset.isCompatibleImmediate(Imm))
4183 return;
4184 Formula F = Base;
4185 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4186 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4187 return;
4188 if (IsScaledReg) {
4189 F.ScaledReg = G;
4190 } else {
4191 F.BaseRegs[Idx] = G;
4192 // We may generate a non-canonical Formula if G is a recurrence register
4193 // related to the current loop while F.ScaledReg is not.
4194 F.canonicalize(*L);
4195 }
4196 (void)InsertFormula(LU, LUIdx, F);
4197}
4198
4199 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4200void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4201 Formula Base) {
4202 // TODO: For now, just add the min and max offset, because it usually isn't
4203 // worthwhile looking at everything in between.
4204 SmallVector<Immediate, 2> Worklist;
4205 Worklist.push_back(LU.MinOffset);
4206 if (LU.MaxOffset != LU.MinOffset)
4207 Worklist.push_back(LU.MaxOffset);
4208
4209 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4210 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4211 if (Base.Scale == 1)
4212 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4213 /* IsScaledReg */ true);
4214}
4215
4216/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4217/// == y -> x*c == y*c.
4218void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4219 Formula Base) {
4220 if (LU.Kind != LSRUse::ICmpZero) return;
4221
4222 // Determine the integer type for the base formula.
4223 Type *IntTy = Base.getType();
4224 if (!IntTy) return;
4225 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4226
4227 // Don't do this if there is more than one offset.
4228 if (LU.MinOffset != LU.MaxOffset) return;
4229
4230 // Check that the transformation is valid. It is illegal to multiply a pointer.
4231 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4232 return;
4233 for (const SCEV *BaseReg : Base.BaseRegs)
4234 if (BaseReg->getType()->isPointerTy())
4235 return;
4236 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4237
4238 // Check each interesting stride.
4239 for (int64_t Factor : Factors) {
4240 // Check that Factor can be represented by IntTy
4241 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4242 continue;
4243 // Check that the multiplication doesn't overflow.
4244 if (Base.BaseOffset.isMin() && Factor == -1)
4245 continue;
4246 // Not supporting scalable immediates.
4247 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4248 continue;
4249 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4250 assert(Factor != 0 && "Zero factor not expected!");
4251 if (NewBaseOffset.getFixedValue() / Factor !=
4252 Base.BaseOffset.getFixedValue())
4253 continue;
4254 // If the offset will be truncated at this use, check that it is in bounds.
4255 if (!IntTy->isPointerTy() &&
4256 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4257 continue;
4258
4259 // Check that multiplying with the use offset doesn't overflow.
4260 Immediate Offset = LU.MinOffset;
4261 if (Offset.isMin() && Factor == -1)
4262 continue;
4263 Offset = Offset.mulUnsigned(Factor);
4264 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4265 continue;
4266 // If the offset will be truncated at this use, check that it is in bounds.
4267 if (!IntTy->isPointerTy() &&
4268 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4269 continue;
4270
4271 Formula F = Base;
4272 F.BaseOffset = NewBaseOffset;
4273
4274 // Check that this scale is legal.
4275 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4276 continue;
4277
4278 // Compensate for the use having MinOffset built into it.
4279 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4280
4281 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4282
4283 // Check that multiplying with each base register doesn't overflow.
4284 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4285 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4286 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4287 goto next;
4288 }
4289
4290 // Check that multiplying with the scaled register doesn't overflow.
4291 if (F.ScaledReg) {
4292 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4293 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4294 continue;
4295 }
4296
4297 // Check that multiplying with the unfolded offset doesn't overflow.
4298 if (F.UnfoldedOffset.isNonZero()) {
4299 if (F.UnfoldedOffset.isMin() && Factor == -1)
4300 continue;
4301 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4302 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4303 Base.UnfoldedOffset.getFixedValue())
4304 continue;
4305 // If the offset will be truncated, check that it is in bounds.
4306 if (!ConstantInt::isValueValidForType(
4307 IntTy, F.UnfoldedOffset.getFixedValue()))
4308 continue;
4309 }
4310
4311 // If we make it here and it's legal, add it.
4312 (void)InsertFormula(LU, LUIdx, F);
4313 next:;
4314 }
4315}
4316
4317 /// Generate stride factor reuse formulae, for example by making use of
4318 /// scaled-offset addressing modes.
4319void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4320 // Determine the integer type for the base formula.
4321 Type *IntTy = Base.getType();
4322 if (!IntTy) return;
4323
4324 // If this Formula already has a scaled register, we can't add another one.
4325 // Try to unscale the formula to generate a better scale.
4326 if (Base.Scale != 0 && !Base.unscale())
4327 return;
4328
4329 assert(Base.Scale == 0 && "unscale did not do its job!");
4330
4331 // Check each interesting stride.
4332 for (int64_t Factor : Factors) {
4333 Base.Scale = Factor;
4334 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4335 // Check whether this scale is going to be legal.
4336 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4337 Base)) {
4338 // As a special case, handle out-of-loop Basic users separately.
4339 // TODO: Reconsider this special case.
4340 if (LU.Kind == LSRUse::Basic &&
4341 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4342 LU.AccessTy, Base) &&
4343 LU.AllFixupsOutsideLoop)
4344 LU.Kind = LSRUse::Special;
4345 else
4346 continue;
4347 }
4348 // For an ICmpZero, negating a solitary base register won't lead to
4349 // new solutions.
4350 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4351 Base.BaseOffset.isZero() && !Base.BaseGV)
4352 continue;
4353 // For each addrec base reg, if its loop is current loop, apply the scale.
4354 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4355 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4356 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4357 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4358 if (FactorS->isZero())
4359 continue;
4360 // Divide out the factor, ignoring high bits, since we'll be
4361 // scaling the value back up in the end.
4362 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4363 if (!Quotient->isZero()) {
4364 // TODO: This could be optimized to avoid all the copying.
4365 Formula F = Base;
4366 F.ScaledReg = Quotient;
4367 F.deleteBaseReg(F.BaseRegs[i]);
4368 // The canonical representation of 1*reg is reg, which is already in
4369 // Base. In that case, do not try to insert the formula, it will be
4370 // rejected anyway.
4371 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4372 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4373 continue;
4374 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
4375 // non-canonical Formula whose ScaledReg's loop is not L.
4376 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4377 F.canonicalize(*L);
4378 (void)InsertFormula(LU, LUIdx, F);
4379 }
4380 }
4381 }
4382 }
4383}
4384
4385/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4386/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4387/// perform the extension/truncate and normalize again, as the normalized form
4388/// can result in folds that are not valid in the post-inc use contexts. The
4389/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4390static const SCEV *
4391 getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4392 const SCEV *Expr, Type *ToTy,
4393 ScalarEvolution &SE) {
4394 const SCEV *Result = nullptr;
4395 for (auto &L : Loops) {
4396 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4397 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4398 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4399 if (!New || (Result && New != Result))
4400 return nullptr;
4401 Result = New;
4402 }
4403
4404 assert(Result && "failed to create expression");
4405 return Result;
4406}
4407
4408/// Generate reuse formulae from different IV types.
4409void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4410 // Don't bother truncating symbolic values.
4411 if (Base.BaseGV) return;
4412
4413 // Determine the integer type for the base formula.
4414 Type *DstTy = Base.getType();
4415 if (!DstTy) return;
4416 if (DstTy->isPointerTy())
4417 return;
4418
4419 // It is invalid to extend a pointer type so exit early if ScaledReg or
4420 // any of the BaseRegs are pointers.
4421 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4422 return;
4423 if (any_of(Base.BaseRegs,
4424 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4425 return;
4426
4427 SmallVector<PostIncLoopSet> Loops;
4428 for (auto &LF : LU.Fixups)
4429 Loops.push_back(LF.PostIncLoops);
4430
4431 for (Type *SrcTy : Types) {
4432 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4433 Formula F = Base;
4434
4435 // Sometimes SCEV is able to prove zero during ext transform. It may
4436 // happen if SCEV did not do all possible transforms while creating the
4437 // initial node (maybe due to depth limitations), but it can do them while
4438 // taking ext.
4439 if (F.ScaledReg) {
4440 const SCEV *NewScaledReg =
4441 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4442 if (!NewScaledReg || NewScaledReg->isZero())
4443 continue;
4444 F.ScaledReg = NewScaledReg;
4445 }
4446 bool HasZeroBaseReg = false;
4447 for (const SCEV *&BaseReg : F.BaseRegs) {
4448 const SCEV *NewBaseReg =
4449 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4450 if (!NewBaseReg || NewBaseReg->isZero()) {
4451 HasZeroBaseReg = true;
4452 break;
4453 }
4454 BaseReg = NewBaseReg;
4455 }
4456 if (HasZeroBaseReg)
4457 continue;
4458
4459 // TODO: This assumes we've done basic processing on all uses and
4460 // have an idea what the register usage is.
4461 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4462 continue;
4463
4464 F.canonicalize(*L);
4465 (void)InsertFormula(LU, LUIdx, F);
4466 }
4467 }
4468}
4469
4470namespace {
4471
4472/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4473/// modifications so that the search phase doesn't have to worry about the data
4474/// structures moving underneath it.
4475struct WorkItem {
4476 size_t LUIdx;
4477 Immediate Imm;
4478 const SCEV *OrigReg;
4479
4480 WorkItem(size_t LI, Immediate I, const SCEV *R)
4481 : LUIdx(LI), Imm(I), OrigReg(R) {}
4482
4483 void print(raw_ostream &OS) const;
4484 void dump() const;
4485};
4486
4487} // end anonymous namespace
4488
4489#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4490void WorkItem::print(raw_ostream &OS) const {
4491 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4492 << " , add offset " << Imm;
4493}
4494
4495LLVM_DUMP_METHOD void WorkItem::dump() const {
4496 print(errs()); errs() << '\n';
4497}
4498#endif
4499
4500/// Look for registers which are a constant distance apart and try to form reuse
4501/// opportunities between them.
4502void LSRInstance::GenerateCrossUseConstantOffsets() {
4503 // Group the registers by their value without any added constant offset.
4504 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4505
4506 DenseMap<const SCEV *, ImmMapTy> Map;
4507 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4508 SmallVector<const SCEV *, 8> Sequence;
4509 for (const SCEV *Use : RegUses) {
4510 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4511 Immediate Imm = ExtractImmediate(Reg, SE);
4512 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4513 if (Pair.second)
4514 Sequence.push_back(Reg);
4515 Pair.first->second.insert(std::make_pair(Imm, Use));
4516 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4517 }
4518
4519 // Now examine each set of registers with the same base value. Build up
4520 // a list of work to do and do the work in a separate step so that we're
4521 // not adding formulae and register counts while we're searching.
4522 SmallVector<WorkItem, 32> WorkItems;
4523 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4524 UniqueItems;
4525 for (const SCEV *Reg : Sequence) {
4526 const ImmMapTy &Imms = Map.find(Reg)->second;
4527
4528 // It's not worthwhile looking for reuse if there's only one offset.
4529 if (Imms.size() == 1)
4530 continue;
4531
4532 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4533 for (const auto &Entry
4534 : Imms) dbgs()
4535 << ' ' << Entry.first;
4536 dbgs() << '\n');
4537
4538 // Examine each offset.
4539 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4540 J != JE; ++J) {
4541 const SCEV *OrigReg = J->second;
4542
4543 Immediate JImm = J->first;
4544 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4545
4546 if (!isa<SCEVConstant>(OrigReg) &&
4547 UsedByIndicesMap[Reg].count() == 1) {
4548 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4549 << '\n');
4550 continue;
4551 }
4552
4553 // Conservatively examine offsets between this orig reg and a few selected
4554 // other orig regs.
4555 Immediate First = Imms.begin()->first;
4556 Immediate Last = std::prev(Imms.end())->first;
4557 if (!First.isCompatibleImmediate(Last)) {
4558 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4559 << "\n");
4560 continue;
4561 }
4562 // Only scalable if both terms are scalable, or if one is scalable and
4563 // the other is 0.
4564 bool Scalable = First.isScalable() || Last.isScalable();
4565 int64_t FI = First.getKnownMinValue();
4566 int64_t LI = Last.getKnownMinValue();
4567 // Compute (First + Last) / 2 without overflow using the fact that
4568 // First + Last = 2 * (First & Last) + (First ^ Last).
4569 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4570 // If the result is negative and FI is odd and LI even (or vice versa),
4571 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4572 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
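// For illustration: with FI = -3 and LI = -6 the first line gives
// (-3 & -6) + ((-3 ^ -6) >> 1) = -8 + 3 = -5 (rounded towards -inf); the
// correction then adds ((-3 ^ -6) & 1) = 1, yielding -4, which matches
// -9 / 2 rounded towards zero.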
4573 ImmMapTy::const_iterator OtherImms[] = {
4574 Imms.begin(), std::prev(Imms.end()),
4575 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4576 for (const auto &M : OtherImms) {
4577 if (M == J || M == JE) continue;
4578 if (!JImm.isCompatibleImmediate(M->first))
4579 continue;
4580
4581 // Compute the difference between the two.
4582 Immediate Imm = JImm.subUnsigned(M->first);
4583 for (unsigned LUIdx : UsedByIndices.set_bits())
4584 // Make a memo of this use, offset, and register tuple.
4585 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4586 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4587 }
4588 }
4589 }
4590
4591 Map.clear();
4592 Sequence.clear();
4593 UsedByIndicesMap.clear();
4594 UniqueItems.clear();
4595
4596 // Now iterate through the worklist and add new formulae.
4597 for (const WorkItem &WI : WorkItems) {
4598 size_t LUIdx = WI.LUIdx;
4599 LSRUse &LU = Uses[LUIdx];
4600 Immediate Imm = WI.Imm;
4601 const SCEV *OrigReg = WI.OrigReg;
4602
4603 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4604 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4605 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4606
4607 // TODO: Use a more targeted data structure.
4608 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4609 Formula F = LU.Formulae[L];
4610 // FIXME: The code for the scaled and unscaled registers looks
4611 // very similar but slightly different. Investigate if they
4612 // could be merged. That way, we would not have to unscale the
4613 // Formula.
4614 F.unscale();
4615 // Use the immediate in the scaled register.
4616 if (F.ScaledReg == OrigReg) {
4617 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4618 continue;
4619 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4620 // Don't create 50 + reg(-50).
4621 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4622 if (F.referencesReg(S))
4623 continue;
4624 Formula NewF = F;
4625 NewF.BaseOffset = Offset;
4626 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4627 NewF))
4628 continue;
4629 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4630
4631 // If the new scale is a constant in a register, and adding the constant
4632 // value to the immediate would produce a value closer to zero than the
4633 // immediate itself, then the formula isn't worthwhile.
4634 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4635 // FIXME: Do we need to do something for scalable immediates here?
4636 // A scalable SCEV won't be constant, but we might still have
4637 // something in the offset? Bail out for now to be safe.
4638 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4639 continue;
4640 if (C->getValue()->isNegative() !=
4641 (NewF.BaseOffset.isLessThanZero()) &&
4642 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4643 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4644 continue;
4645 }
4646
4647 // OK, looks good.
4648 NewF.canonicalize(*this->L);
4649 (void)InsertFormula(LU, LUIdx, NewF);
4650 } else {
4651 // Use the immediate in a base register.
4652 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4653 const SCEV *BaseReg = F.BaseRegs[N];
4654 if (BaseReg != OrigReg)
4655 continue;
4656 Formula NewF = F;
4657 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4658 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4659 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4660 continue;
4661 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4662 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4663 LU.Kind, LU.AccessTy, NewF)) {
4664 if (AMK == TTI::AMK_PostIndexed &&
4665 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4666 continue;
4667 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4668 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4669 continue;
4670 NewF = F;
4671 NewF.UnfoldedOffset = NewUnfoldedOffset;
4672 }
4673 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4674
4675 // If the new formula has a constant in a register, and adding the
4676 // constant value to the immediate would produce a value closer to
4677 // zero than the immediate itself, then the formula isn't worthwhile.
4678 for (const SCEV *NewReg : NewF.BaseRegs)
4679 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4680 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4681 goto skip_formula;
4682 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4683 .abs()
4684 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4685 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4686 .countr_zero() >=
4687 (unsigned)llvm::countr_zero<uint64_t>(
4688 NewF.BaseOffset.getFixedValue()))
4689 goto skip_formula;
4690 }
4691
4692 // Ok, looks good.
4693 NewF.canonicalize(*this->L);
4694 (void)InsertFormula(LU, LUIdx, NewF);
4695 break;
4696 skip_formula:;
4697 }
4698 }
4699 }
4700 }
4701}
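// For illustration, if one use references reg({A,+,8}) and another references
// reg({A+4,+,8}), the recorded work items differ by the constant 4, so the
// loop above adds formulae that express the second use as reg({A,+,8}) plus a
// base offset of 4. Both uses can then share a single induction register,
// with the constant folded into the addressing mode where that is legal.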
4702
4703/// Generate formulae for each use.
4704void
4705LSRInstance::GenerateAllReuseFormulae() {
4706 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4707 // queries are more precise.
4708 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4709 LSRUse &LU = Uses[LUIdx];
4710 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4711 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4712 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4713 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4714 }
4715 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4716 LSRUse &LU = Uses[LUIdx];
4717 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4718 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4719 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4720 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4721 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4722 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4723 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4724 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4725 }
4726 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4727 LSRUse &LU = Uses[LUIdx];
4728 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4730 }
4731
4732 GenerateCrossUseConstantOffsets();
4733
4734 LLVM_DEBUG(dbgs() << "\n"
4735 "After generating reuse formulae:\n";
4736 print_uses(dbgs()));
4737}
4738
4739/// If there are multiple formulae with the same set of registers used
4740/// by other uses, pick the best one and delete the others.
4741void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4742 DenseSet<const SCEV *> VisitedRegs;
4743 SmallPtrSet<const SCEV *, 16> Regs;
4744 SmallPtrSet<const SCEV *, 16> LoserRegs;
4745#ifndef NDEBUG
4746 bool ChangedFormulae = false;
4747#endif
4748
4749 // Collect the best formula for each unique set of shared registers. This
4750 // is reset for each use.
4751 using BestFormulaeTy =
4752 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4753
4754 BestFormulaeTy BestFormulae;
4755
4756 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4757 LSRUse &LU = Uses[LUIdx];
4758 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4759 dbgs() << '\n');
4760
4761 bool Any = false;
4762 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4763 FIdx != NumForms; ++FIdx) {
4764 Formula &F = LU.Formulae[FIdx];
4765
4766 // Some formulas are instant losers. For example, they may depend on
4767 // nonexistent AddRecs from other loops. These need to be filtered
4768 // immediately, otherwise heuristics could choose them over others leading
4769 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4770 // avoids the need to recompute this information across formulae using the
4771 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4772 // the corresponding bad register from the Regs set.
4773 Cost CostF(L, SE, TTI, AMK);
4774 Regs.clear();
4775 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4776 if (CostF.isLoser()) {
4777 // During initial formula generation, undesirable formulae are generated
4778 // by uses within other loops that have some non-trivial address mode or
4779 // use the postinc form of the IV. LSR needs to provide these formulae
4780 // as the basis of rediscovering the desired formula that uses an AddRec
4781 // corresponding to the existing phi. Once all formulae have been
4782 // generated, these initial losers may be pruned.
4783 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4784 dbgs() << "\n");
4785 }
4786 else {
4787 SmallVector<const SCEV *, 4> Key;
4788 for (const SCEV *Reg : F.BaseRegs) {
4789 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4790 Key.push_back(Reg);
4791 }
4792 if (F.ScaledReg &&
4793 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4794 Key.push_back(F.ScaledReg);
4795 // Unstable sort by host order ok, because this is only used for
4796 // uniquifying.
4797 llvm::sort(Key);
4798
4799 std::pair<BestFormulaeTy::const_iterator, bool> P =
4800 BestFormulae.insert(std::make_pair(Key, FIdx));
4801 if (P.second)
4802 continue;
4803
4804 Formula &Best = LU.Formulae[P.first->second];
4805
4806 Cost CostBest(L, SE, TTI, AMK);
4807 Regs.clear();
4808 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4809 if (CostF.isLess(CostBest))
4810 std::swap(F, Best);
4811 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4812 dbgs() << "\n"
4813 " in favor of formula ";
4814 Best.print(dbgs()); dbgs() << '\n');
4815 }
4816#ifndef NDEBUG
4817 ChangedFormulae = true;
4818#endif
4819 LU.DeleteFormula(F);
4820 --FIdx;
4821 --NumForms;
4822 Any = true;
4823 }
4824
4825 // Now that we've filtered out some formulae, recompute the Regs set.
4826 if (Any)
4827 LU.RecomputeRegs(LUIdx, RegUses);
4828
4829 // Reset this to prepare for the next use.
4830 BestFormulae.clear();
4831 }
4832
4833 LLVM_DEBUG(if (ChangedFormulae) {
4834 dbgs() << "\n"
4835 "After filtering out undesirable candidates:\n";
4836 print_uses(dbgs());
4837 });
4838}
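// Note that the uniquifying Key built above contains only registers that are
// also referenced by other uses; formulae that differ solely in registers
// private to this use therefore land on the same Key and compete, and only
// the cheaper of the two survives.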
4839
4840/// Estimate the worst-case number of solutions the solver might have to
4841/// consider. It almost never considers this many solutions because it prunes the
4842/// search space, but the pruning isn't always sufficient.
4843size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4844 size_t Power = 1;
4845 for (const LSRUse &LU : Uses) {
4846 size_t FSize = LU.Formulae.size();
4847 if (FSize >= ComplexityLimit) {
4848 Power = ComplexityLimit;
4849 break;
4850 }
4851 Power *= FSize;
4852 if (Power >= ComplexityLimit)
4853 break;
4854 }
4855 return Power;
4856}
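// Roughly: with three uses carrying 4, 6 and 8 formulae the estimate is
// 4 * 6 * 8 = 192 candidate assignments; the walk stops early as soon as the
// running product (or any single use's formula count) reaches ComplexityLimit,
// since everything at or beyond the limit is treated alike.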
4857
4858/// When one formula uses a superset of the registers of another formula, it
4859/// won't help reduce register pressure (though it may not necessarily hurt
4860/// register pressure); remove it to simplify the system.
4861void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4862 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4863 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4864
4865 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4866 "which use a superset of registers used by other "
4867 "formulae.\n");
4868
4869 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4870 LSRUse &LU = Uses[LUIdx];
4871 bool Any = false;
4872 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4873 Formula &F = LU.Formulae[i];
4874 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4875 continue;
4876 // Look for a formula with a constant or GV in a register. If the use
4877 // also has a formula with that same value in an immediate field,
4878 // delete the one that uses a register.
4879 for (SmallVectorImpl<const SCEV *>::const_iterator
4880 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4881 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4882 Formula NewF = F;
4883 //FIXME: Formulas should store bitwidth to do wrapping properly.
4884 // See PR41034.
4885 NewF.BaseOffset =
4886 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4887 (uint64_t)C->getValue()->getSExtValue());
4888 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4889 (I - F.BaseRegs.begin()));
4890 if (LU.HasFormulaWithSameRegs(NewF)) {
4891 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4892 dbgs() << '\n');
4893 LU.DeleteFormula(F);
4894 --i;
4895 --e;
4896 Any = true;
4897 break;
4898 }
4899 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4900 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4901 if (!F.BaseGV) {
4902 Formula NewF = F;
4903 NewF.BaseGV = GV;
4904 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4905 (I - F.BaseRegs.begin()));
4906 if (LU.HasFormulaWithSameRegs(NewF)) {
4907 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4908 dbgs() << '\n');
4909 LU.DeleteFormula(F);
4910 --i;
4911 --e;
4912 Any = true;
4913 break;
4914 }
4915 }
4916 }
4917 }
4918 }
4919 if (Any)
4920 LU.RecomputeRegs(LUIdx, RegUses);
4921 }
4922
4923 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4924 }
4925}
4926
4927/// When there are many registers for expressions like A, A+1, A+2, etc.,
4928/// allocate a single register for them.
4929void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4930 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4931 return;
4932
4933 LLVM_DEBUG(
4934 dbgs() << "The search space is too complex.\n"
4935 "Narrowing the search space by assuming that uses separated "
4936 "by a constant offset will use the same registers.\n");
4937
4938 // This is especially useful for unrolled loops.
4939
4940 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4941 LSRUse &LU = Uses[LUIdx];
4942 for (const Formula &F : LU.Formulae) {
4943 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4944 continue;
4945
4946 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4947 if (!LUThatHas)
4948 continue;
4949
4950 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4951 LU.Kind, LU.AccessTy))
4952 continue;
4953
4954 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4955
4956 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4957
4958 // Transfer the fixups of LU to LUThatHas.
4959 for (LSRFixup &Fixup : LU.Fixups) {
4960 Fixup.Offset += F.BaseOffset;
4961 LUThatHas->pushFixup(Fixup);
4962 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4963 }
4964
4965 // Delete formulae from the new use which are no longer legal.
4966 bool Any = false;
4967 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4968 Formula &F = LUThatHas->Formulae[i];
4969 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4970 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4971 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4972 LUThatHas->DeleteFormula(F);
4973 --i;
4974 --e;
4975 Any = true;
4976 }
4977 }
4978
4979 if (Any)
4980 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4981
4982 // Delete the old use.
4983 DeleteUse(LU, LUIdx);
4984 --LUIdx;
4985 --NumUses;
4986 break;
4987 }
4988 }
4989
4990 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4991}
4992
4993/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4994/// we've done more filtering, as it may be able to find more formulae to
4995/// eliminate.
4996void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4997 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4998 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4999
5000 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5001 "undesirable dedicated registers.\n");
5002
5003 FilterOutUndesirableDedicatedRegisters();
5004
5005 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5006 }
5007}
5008
5009/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5010/// pick the best one and delete the others.
5011/// This narrowing heuristic keeps as many formulae with different
5012/// Scale and ScaledReg pairs as possible while narrowing the search space.
5013/// The benefit is that it is more likely to find a better solution
5014/// in a formulae set with more Scale and ScaledReg variations than
5015/// in a formulae set with the same Scale and ScaledReg. The winner-picking
5016/// reg heuristic will often keep the formulae with the same Scale and
5017/// ScaledReg and filter out the others, and we want to avoid that if possible.
5018void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5019 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5020 return;
5021
5022 LLVM_DEBUG(
5023 dbgs() << "The search space is too complex.\n"
5024 "Narrowing the search space by choosing the best Formula "
5025 "from the Formulae with the same Scale and ScaledReg.\n");
5026
5027 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5028 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5029
5030 BestFormulaeTy BestFormulae;
5031#ifndef NDEBUG
5032 bool ChangedFormulae = false;
5033#endif
5034 DenseSet<const SCEV *> VisitedRegs;
5035 SmallPtrSet<const SCEV *, 16> Regs;
5036
5037 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5038 LSRUse &LU = Uses[LUIdx];
5039 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5040 dbgs() << '\n');
5041
5042 // Return true if Formula FA is better than Formula FB.
5043 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5044 // First we will try to choose the Formula with fewer new registers.
5045 // For a register used by current Formula, the more the register is
5046 // shared among LSRUses, the less we increase the register number
5047 // counter of the formula.
5048 size_t FARegNum = 0;
5049 for (const SCEV *Reg : FA.BaseRegs) {
5050 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5051 FARegNum += (NumUses - UsedByIndices.count() + 1);
5052 }
5053 size_t FBRegNum = 0;
5054 for (const SCEV *Reg : FB.BaseRegs) {
5055 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5056 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5057 }
5058 if (FARegNum != FBRegNum)
5059 return FARegNum < FBRegNum;
5060
5061 // If the new register numbers are the same, choose the Formula with
5062 // less Cost.
5063 Cost CostFA(L, SE, TTI, AMK);
5064 Cost CostFB(L, SE, TTI, AMK);
5065 Regs.clear();
5066 CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
5067 Regs.clear();
5068 CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
5069 return CostFA.isLess(CostFB);
5070 };
5071
5072 bool Any = false;
5073 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5074 ++FIdx) {
5075 Formula &F = LU.Formulae[FIdx];
5076 if (!F.ScaledReg)
5077 continue;
5078 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5079 if (P.second)
5080 continue;
5081
5082 Formula &Best = LU.Formulae[P.first->second];
5083 if (IsBetterThan(F, Best))
5084 std::swap(F, Best);
5085 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5086 dbgs() << "\n"
5087 " in favor of formula ";
5088 Best.print(dbgs()); dbgs() << '\n');
5089#ifndef NDEBUG
5090 ChangedFormulae = true;
5091#endif
5092 LU.DeleteFormula(F);
5093 --FIdx;
5094 --NumForms;
5095 Any = true;
5096 }
5097 if (Any)
5098 LU.RecomputeRegs(LUIdx, RegUses);
5099
5100 // Reset this to prepare for the next use.
5101 BestFormulae.clear();
5102 }
5103
5104 LLVM_DEBUG(if (ChangedFormulae) {
5105 dbgs() << "\n"
5106 "After filtering out undesirable candidates:\n";
5107 print_uses(dbgs());
5108 });
5109}
5110
5111/// If we are over the complexity limit, filter any post-inc-preferring
5112/// variables down to only post-inc values.
5113void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5114 if (AMK != TTI::AMK_PostIndexed)
5115 return;
5116 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5117 return;
5118
5119 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5120 "Narrowing the search space by choosing the lowest "
5121 "register Formula for PostInc Uses.\n");
5122
5123 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5124 LSRUse &LU = Uses[LUIdx];
5125
5126 if (LU.Kind != LSRUse::Address)
5127 continue;
5128 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5129 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5130 continue;
5131
5132 size_t MinRegs = std::numeric_limits<size_t>::max();
5133 for (const Formula &F : LU.Formulae)
5134 MinRegs = std::min(F.getNumRegs(), MinRegs);
5135
5136 bool Any = false;
5137 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5138 ++FIdx) {
5139 Formula &F = LU.Formulae[FIdx];
5140 if (F.getNumRegs() > MinRegs) {
5141 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5142 dbgs() << "\n");
5143 LU.DeleteFormula(F);
5144 --FIdx;
5145 --NumForms;
5146 Any = true;
5147 }
5148 }
5149 if (Any)
5150 LU.RecomputeRegs(LUIdx, RegUses);
5151
5152 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5153 break;
5154 }
5155
5156 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5157}
5158
5159/// This function deletes formulas with a high expected register count.
5160/// Assuming we don't know the value of each formula (we have already deleted
5161/// all inefficient ones), compute the probability of not being selected for
5162/// each register.
5163/// For example,
5164/// Use1:
5165/// reg(a) + reg({0,+,1})
5166/// reg(a) + reg({-1,+,1}) + 1
5167/// reg({a,+,1})
5168/// Use2:
5169/// reg(b) + reg({0,+,1})
5170/// reg(b) + reg({-1,+,1}) + 1
5171/// reg({b,+,1})
5172/// Use3:
5173/// reg(c) + reg(b) + reg({0,+,1})
5174/// reg(c) + reg({b,+,1})
5175///
5176/// Probability of not selecting
5177/// Use1 Use2 Use3
5178/// reg(a) (1/3) * 1 * 1
5179/// reg(b) 1 * (1/3) * (1/2)
5180/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5181/// reg({-1,+,1}) (2/3) * (2/3) * 1
5182/// reg({a,+,1}) (2/3) * 1 * 1
5183/// reg({b,+,1}) 1 * (2/3) * (2/3)
5184/// reg(c) 1 * 1 * 0
5185///
5186/// Now compute the expected number of registers for each formula:
5187/// Note that for each use we exclude the probability of not selecting for that use.
5188/// For example, for Use1 the probability for reg(a) would be just 1 * 1 (excluding
5189/// the probability 1/3 of not selecting for Use1).
5190/// Use1:
5191/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5192/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5193/// reg({a,+,1}) 1
5194/// Use2:
5195/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5196/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5197/// reg({b,+,1}) 2/3
5198/// Use3:
5199/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5200/// reg(c) + reg({b,+,1}) 1 + 2/3
5201void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5202 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5203 return;
5204 // Ok, we have too many formulae on our hands to conveniently handle.
5205 // Use a rough heuristic to thin out the list.
5206
5207 // Set of Regs which will be 100% used in the final solution.
5208 // Used in each formula of a solution (in the example above this is reg(c)).
5209 // We can skip them in calculations.
5210 SmallPtrSet<const SCEV *, 4> UniqRegs;
5211 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5212
5213 // Map each register to probability of not selecting
5214 DenseMap <const SCEV *, float> RegNumMap;
5215 for (const SCEV *Reg : RegUses) {
5216 if (UniqRegs.count(Reg))
5217 continue;
5218 float PNotSel = 1;
5219 for (const LSRUse &LU : Uses) {
5220 if (!LU.Regs.count(Reg))
5221 continue;
5222 float P = LU.getNotSelectedProbability(Reg);
5223 if (P != 0.0)
5224 PNotSel *= P;
5225 else
5226 UniqRegs.insert(Reg);
5227 }
5228 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5229 }
5230
5231 LLVM_DEBUG(
5232 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5233
5234 // Delete formulas where registers number expectation is high.
5235 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5236 LSRUse &LU = Uses[LUIdx];
5237 // If nothing to delete - continue.
5238 if (LU.Formulae.size() < 2)
5239 continue;
5240 // This is a temporary solution to test performance. Float should be
5241 // replaced with a rounding-independent type (based on integers) to avoid
5242 // different results for different target builds.
5243 float FMinRegNum = LU.Formulae[0].getNumRegs();
5244 float FMinARegNum = LU.Formulae[0].getNumRegs();
5245 size_t MinIdx = 0;
5246 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5247 Formula &F = LU.Formulae[i];
5248 float FRegNum = 0;
5249 float FARegNum = 0;
5250 for (const SCEV *BaseReg : F.BaseRegs) {
5251 if (UniqRegs.count(BaseReg))
5252 continue;
5253 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5254 if (isa<SCEVAddRecExpr>(BaseReg))
5255 FARegNum +=
5256 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5257 }
5258 if (const SCEV *ScaledReg = F.ScaledReg) {
5259 if (!UniqRegs.count(ScaledReg)) {
5260 FRegNum +=
5261 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5262 if (isa<SCEVAddRecExpr>(ScaledReg))
5263 FARegNum +=
5264 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5265 }
5266 }
5267 if (FMinRegNum > FRegNum ||
5268 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5269 FMinRegNum = FRegNum;
5270 FMinARegNum = FARegNum;
5271 MinIdx = i;
5272 }
5273 }
5274 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5275 dbgs() << " with min reg num " << FMinRegNum << '\n');
5276 if (MinIdx != 0)
5277 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5278 while (LU.Formulae.size() != 1) {
5279 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5280 dbgs() << '\n');
5281 LU.Formulae.pop_back();
5282 }
5283 LU.RecomputeRegs(LUIdx, RegUses);
5284 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5285 Formula &F = LU.Formulae[0];
5286 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5287 // When we choose the formula, the regs become unique.
5288 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5289 if (F.ScaledReg)
5290 UniqRegs.insert(F.ScaledReg);
5291 }
5292 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5293}
5294
5295// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5296// whether the addressing offset +C would be legal where the negative offset -C
5297// is not.
5298static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5299 ScalarEvolution &SE, const SCEV *Best,
5300 const SCEV *Reg,
5301 MemAccessTy AccessType) {
5302 if (Best->getType() != Reg->getType() ||
5303 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5304 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5305 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5306 return false;
5307 const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Best, Reg));
5308 if (!Diff)
5309 return false;
5310
5311 return TTI.isLegalAddressingMode(
5312 AccessType.MemTy, /*BaseGV=*/nullptr,
5313 /*BaseOffset=*/Diff->getAPInt().getSExtValue(),
5314 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5315 !TTI.isLegalAddressingMode(
5316 AccessType.MemTy, /*BaseGV=*/nullptr,
5317 /*BaseOffset=*/-Diff->getAPInt().getSExtValue(),
5318 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5319}
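// For example, on a target whose addressing modes fold [reg + imm] but not
// [reg - imm]: with Best = {X+16,+,4} and Reg = {X,+,4}, Diff is 16, the +16
// offset is legal while -16 is not, so Reg is reported as the simpler base to
// address from.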
5320
5321/// Pick a register which seems likely to be profitable, and then in any use
5322/// which has any reference to that register, delete all formulae which do not
5323/// reference that register.
5324void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5325 // With all other options exhausted, loop until the system is simple
5326 // enough to handle.
5327 SmallPtrSet<const SCEV *, 4> Taken;
5328 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5329 // Ok, we have too many formulae on our hands to conveniently handle.
5330 // Use a rough heuristic to thin out the list.
5331 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5332
5333 // Pick the register which is used by the most LSRUses, which is likely
5334 // to be a good reuse register candidate.
5335 const SCEV *Best = nullptr;
5336 unsigned BestNum = 0;
5337 for (const SCEV *Reg : RegUses) {
5338 if (Taken.count(Reg))
5339 continue;
5340 if (!Best) {
5341 Best = Reg;
5342 BestNum = RegUses.getUsedByIndices(Reg).count();
5343 } else {
5344 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5345 if (Count > BestNum) {
5346 Best = Reg;
5347 BestNum = Count;
5348 }
5349
5350 // If the scores are the same, but the Reg is simpler for the target
5351 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5352 // handle +C but not -C), opt for the simpler formula.
5353 if (Count == BestNum) {
5354 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5355 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5356 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5357 Uses[LUIdx].AccessTy)) {
5358 Best = Reg;
5359 BestNum = Count;
5360 }
5361 }
5362 }
5363 }
5364 assert(Best && "Failed to find best LSRUse candidate");
5365
5366 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5367 << " will yield profitable reuse.\n");
5368 Taken.insert(Best);
5369
5370 // In any use with formulae which references this register, delete formulae
5371 // which don't reference it.
5372 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5373 LSRUse &LU = Uses[LUIdx];
5374 if (!LU.Regs.count(Best)) continue;
5375
5376 bool Any = false;
5377 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5378 Formula &F = LU.Formulae[i];
5379 if (!F.referencesReg(Best)) {
5380 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5381 LU.DeleteFormula(F);
5382 --e;
5383 --i;
5384 Any = true;
5385 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5386 continue;
5387 }
5388 }
5389
5390 if (Any)
5391 LU.RecomputeRegs(LUIdx, RegUses);
5392 }
5393
5394 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5395 }
5396}
5397
5398/// If there are an extraordinary number of formulae to choose from, use some
5399/// rough heuristics to prune down the number of formulae. This keeps the main
5400/// solver from taking an extraordinary amount of time in some worst-case
5401/// scenarios.
5402void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5403 NarrowSearchSpaceByDetectingSupersets();
5404 NarrowSearchSpaceByCollapsingUnrolledCode();
5405 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5406 if (FilterSameScaledReg)
5407 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5408 NarrowSearchSpaceByFilterPostInc();
5409 if (LSRExpNarrow)
5410 NarrowSearchSpaceByDeletingCostlyFormulas();
5411 else
5412 NarrowSearchSpaceByPickingWinnerRegs();
5413}
5414
5415/// This is the recursive solver.
5416void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5417 Cost &SolutionCost,
5418 SmallVectorImpl<const Formula *> &Workspace,
5419 const Cost &CurCost,
5420 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5421 DenseSet<const SCEV *> &VisitedRegs) const {
5422 // Some ideas:
5423 // - prune more:
5424 // - use more aggressive filtering
5425 // - sort the formula so that the most profitable solutions are found first
5426 // - sort the uses too
5427 // - search faster:
5428 // - don't compute a cost, and then compare. compare while computing a cost
5429 // and bail early.
5430 // - track register sets with SmallBitVector
5431
5432 const LSRUse &LU = Uses[Workspace.size()];
5433
5434 // If this use references any register that's already a part of the
5435 // in-progress solution, consider it a requirement that a formula must
5436 // reference that register in order to be considered. This prunes out
5437 // unprofitable searching.
5438 SmallSetVector<const SCEV *, 4> ReqRegs;
5439 for (const SCEV *S : CurRegs)
5440 if (LU.Regs.count(S))
5441 ReqRegs.insert(S);
5442
5443 SmallPtrSet<const SCEV *, 16> NewRegs;
5444 Cost NewCost(L, SE, TTI, AMK);
5445 for (const Formula &F : LU.Formulae) {
5446 // Ignore formulae which may not be ideal in terms of register reuse of
5447 // ReqRegs. The formula should use all required registers before
5448 // introducing new ones.
5449 // This can sometimes (notably when trying to favour postinc) lead to
5450 // sub-optimal decisions. In those cases it is best left to the cost
5451 // modelling to get right.
5452 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5453 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5454 for (const SCEV *Reg : ReqRegs) {
5455 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5456 is_contained(F.BaseRegs, Reg)) {
5457 --NumReqRegsToFind;
5458 if (NumReqRegsToFind == 0)
5459 break;
5460 }
5461 }
5462 if (NumReqRegsToFind != 0) {
5463 // If none of the formulae satisfied the required registers, then we could
5464 // clear ReqRegs and try again. Currently, we simply give up in this case.
5465 continue;
5466 }
5467 }
5468
5469 // Evaluate the cost of the current formula. If it's already worse than
5470 // the current best, prune the search at that point.
5471 NewCost = CurCost;
5472 NewRegs = CurRegs;
5473 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5474 if (NewCost.isLess(SolutionCost)) {
5475 Workspace.push_back(&F);
5476 if (Workspace.size() != Uses.size()) {
5477 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5478 NewRegs, VisitedRegs);
5479 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5480 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5481 } else {
5482 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5483 dbgs() << ".\nRegs:\n";
5484 for (const SCEV *S : NewRegs) dbgs()
5485 << "- " << *S << "\n";
5486 dbgs() << '\n');
5487
5488 SolutionCost = NewCost;
5489 Solution = Workspace;
5490 }
5491 Workspace.pop_back();
5492 }
5493 }
5494}
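// In effect this is a depth-first branch-and-bound search: Workspace holds
// the formula chosen for each use so far, CurCost/CurRegs carry the cost of
// that partial assignment, and any branch whose partial cost already fails
// NewCost.isLess(SolutionCost) is abandoned without recursing further.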
5495
5496/// Choose one formula from each use. Return the results in the given Solution
5497/// vector.
5498void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5499 SmallVector<const Formula *, 8> Workspace;
5500 Cost SolutionCost(L, SE, TTI, AMK);
5501 SolutionCost.Lose();
5502 Cost CurCost(L, SE, TTI, AMK);
5503 SmallPtrSet<const SCEV *, 16> CurRegs;
5504 DenseSet<const SCEV *> VisitedRegs;
5505 Workspace.reserve(Uses.size());
5506
5507 // SolveRecurse does all the work.
5508 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5509 CurRegs, VisitedRegs);
5510 if (Solution.empty()) {
5511 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5512 return;
5513 }
5514
5515 // Ok, we've now made all our decisions.
5516 LLVM_DEBUG(dbgs() << "\n"
5517 "The chosen solution requires ";
5518 SolutionCost.print(dbgs()); dbgs() << ":\n";
5519 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5520 dbgs() << " ";
5521 Uses[i].print(dbgs());
5522 dbgs() << "\n"
5523 " ";
5524 Solution[i]->print(dbgs());
5525 dbgs() << '\n';
5526 });
5527
5528 assert(Solution.size() == Uses.size() && "Malformed solution!");
5529
5530 const bool EnableDropUnprofitableSolution = [&] {
5531 switch (AllowDropSolutionIfLessProfitable) {
5532 case cl::BOU_TRUE:
5533 return true;
5534 case cl::BOU_FALSE:
5535 return false;
5536 case cl::BOU_UNSET:
5537 return TTI.shouldDropLSRSolutionIfLessProfitable();
5538 }
5539 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5540 }();
5541
5542 if (BaselineCost.isLess(SolutionCost)) {
5543 if (!EnableDropUnprofitableSolution)
5544 LLVM_DEBUG(
5545 dbgs() << "Baseline is more profitable than chosen solution, "
5546 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5547 else {
5548 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5549 "solution, dropping LSR solution.\n";);
5550 Solution.clear();
5551 }
5552 }
5553}
5554
5555/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5556/// as we can go while still being dominated by the input positions. This helps
5557/// canonicalize the insert position, which encourages sharing.
5558BasicBlock::iterator
5559LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5560 const SmallVectorImpl<Instruction *> &Inputs)
5561 const {
5562 Instruction *Tentative = &*IP;
5563 while (true) {
5564 bool AllDominate = true;
5565 Instruction *BetterPos = nullptr;
5566 // Don't bother attempting to insert before a catchswitch; its basic block
5567 // cannot have other non-PHI instructions.
5568 if (isa<CatchSwitchInst>(Tentative))
5569 return IP;
5570
5571 for (Instruction *Inst : Inputs) {
5572 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5573 AllDominate = false;
5574 break;
5575 }
5576 // Attempt to find an insert position in the middle of the block,
5577 // instead of at the end, so that it can be used for other expansions.
5578 if (Tentative->getParent() == Inst->getParent() &&
5579 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5580 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5581 }
5582 if (!AllDominate)
5583 break;
5584 if (BetterPos)
5585 IP = BetterPos->getIterator();
5586 else
5587 IP = Tentative->getIterator();
5588
5589 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5590 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5591
5592 BasicBlock *IDom;
5593 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5594 if (!Rung) return IP;
5595 Rung = Rung->getIDom();
5596 if (!Rung) return IP;
5597 IDom = Rung->getBlock();
5598
5599 // Don't climb into a loop though.
5600 const Loop *IDomLoop = LI.getLoopFor(IDom);
5601 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5602 if (IDomDepth <= IPLoopDepth &&
5603 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5604 break;
5605 }
5606
5607 Tentative = IDom->getTerminator();
5608 }
5609
5610 return IP;
5611}
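// Conceptually: starting from the lowest legal position, keep stepping to the
// terminator of the immediate dominator as long as every input instruction
// still dominates the candidate and the step does not climb into another loop;
// repeated expansions of the same expression then tend to converge on one
// shared, loop-invariant insertion point.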
5612
5613/// Determine an input position which will be dominated by the operands and
5614/// which will dominate the result.
5615BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5616 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5617 // Collect some instructions which must be dominated by the
5618 // expanding replacement. These must be dominated by any operands that
5619 // will be required in the expansion.
5620 SmallVector<Instruction *, 4> Inputs;
5621 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5622 Inputs.push_back(I);
5623 if (LU.Kind == LSRUse::ICmpZero)
5624 if (Instruction *I =
5625 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5626 Inputs.push_back(I);
5627 if (LF.PostIncLoops.count(L)) {
5628 if (LF.isUseFullyOutsideLoop(L))
5629 Inputs.push_back(L->getLoopLatch()->getTerminator());
5630 else
5631 Inputs.push_back(IVIncInsertPos);
5632 }
5633 // The expansion must also be dominated by the increment positions of any
5634 // loops for which it is using post-inc mode.
5635 for (const Loop *PIL : LF.PostIncLoops) {
5636 if (PIL == L) continue;
5637
5638 // Be dominated by the loop exit.
5639 SmallVector<BasicBlock *, 4> ExitingBlocks;
5640 PIL->getExitingBlocks(ExitingBlocks);
5641 if (!ExitingBlocks.empty()) {
5642 BasicBlock *BB = ExitingBlocks[0];
5643 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5644 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5645 Inputs.push_back(BB->getTerminator());
5646 }
5647 }
5648
5649 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5650 && !isa<DbgInfoIntrinsic>(LowestIP) &&
5651 "Insertion point must be a normal instruction");
5652
5653 // Then, climb up the immediate dominator tree as far as we can go while
5654 // still being dominated by the input positions.
5655 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5656
5657 // Don't insert instructions before PHI nodes.
5658 while (isa<PHINode>(IP)) ++IP;
5659
5660 // Ignore landingpad instructions.
5661 while (IP->isEHPad()) ++IP;
5662
5663 // Ignore debug intrinsics.
5664 while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5665
5666 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5667 // IP consistent across expansions and allows the previously inserted
5668 // instructions to be reused by subsequent expansion.
5669 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5670 ++IP;
5671
5672 return IP;
5673}
5674
5675/// Emit instructions for the leading candidate expression for this LSRUse (this
5676/// is called "expanding").
5677Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5678 const Formula &F, BasicBlock::iterator IP,
5679 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5680 if (LU.RigidFormula)
5681 return LF.OperandValToReplace;
5682
5683 // Determine an input position which will be dominated by the operands and
5684 // which will dominate the result.
5685 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5686 Rewriter.setInsertPoint(&*IP);
5687
5688 // Inform the Rewriter if we have a post-increment use, so that it can
5689 // perform an advantageous expansion.
5690 Rewriter.setPostInc(LF.PostIncLoops);
5691
5692 // This is the type that the user actually needs.
5693 Type *OpTy = LF.OperandValToReplace->getType();
5694 // This will be the type that we'll initially expand to.
5695 Type *Ty = F.getType();
5696 if (!Ty)
5697 // No type known; just expand directly to the ultimate type.
5698 Ty = OpTy;
5699 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5700 // Expand directly to the ultimate type if it's the right size.
5701 Ty = OpTy;
5702 // This is the type to do integer arithmetic in.
5703 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5704
5705 // Build up a list of operands to add together to form the full base.
5706 SmallVector<const SCEV *, 8> Ops;
5707
5708 // Expand the BaseRegs portion.
5709 for (const SCEV *Reg : F.BaseRegs) {
5710 assert(!Reg->isZero() && "Zero allocated in a base register!");
5711
5712 // If we're expanding for a post-inc user, make the post-inc adjustment.
5713 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5714 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5715 }
5716
5717 // Expand the ScaledReg portion.
5718 Value *ICmpScaledV = nullptr;
5719 if (F.Scale != 0) {
5720 const SCEV *ScaledS = F.ScaledReg;
5721
5722 // If we're expanding for a post-inc user, make the post-inc adjustment.
5723 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5724 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5725
5726 if (LU.Kind == LSRUse::ICmpZero) {
5727 // Expand ScaleReg as if it was part of the base regs.
5728 if (F.Scale == 1)
5729 Ops.push_back(
5730 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5731 else {
5732 // An interesting way of "folding" with an icmp is to use a negated
5733 // scale, which we'll implement by inserting it into the other operand
5734 // of the icmp.
5735 assert(F.Scale == -1 &&
5736 "The only scale supported by ICmpZero uses is -1!");
5737 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5738 }
5739 } else {
5740 // Otherwise just expand the scaled register and an explicit scale,
5741 // which is expected to be matched as part of the address.
5742
5743 // Flush the operand list to suppress SCEVExpander hoisting address modes,
5744 // unless the addressing mode will not be folded.
5745 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5746 isAMCompletelyFolded(TTI, LU, F)) {
5747 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5748 Ops.clear();
5749 Ops.push_back(SE.getUnknown(FullV));
5750 }
5751 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5752 if (F.Scale != 1)
5753 ScaledS =
5754 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5755 Ops.push_back(ScaledS);
5756 }
5757 }
5758
5759 // Expand the GV portion.
5760 if (F.BaseGV) {
5761 // Flush the operand list to suppress SCEVExpander hoisting.
5762 if (!Ops.empty()) {
5763 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5764 Ops.clear();
5765 Ops.push_back(SE.getUnknown(FullV));
5766 }
5767 Ops.push_back(SE.getUnknown(F.BaseGV));
5768 }
5769
5770 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5771 // unfolded offsets. LSR assumes they both live next to their uses.
5772 if (!Ops.empty()) {
5773 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5774 Ops.clear();
5775 Ops.push_back(SE.getUnknown(FullV));
5776 }
5777
5778 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5779 // out at this point, or should we generate a SCEV adding together mixed
5780 // offsets?
5781 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5782 "Expanding mismatched offsets\n");
5783 // Expand the immediate portion.
5784 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5785 if (Offset.isNonZero()) {
5786 if (LU.Kind == LSRUse::ICmpZero) {
5787 // The other interesting way of "folding" with an ICmpZero is to use a
5788 // negated immediate.
5789 if (!ICmpScaledV)
5790 ICmpScaledV =
5791 ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5792 else {
5793 Ops.push_back(SE.getUnknown(ICmpScaledV));
5794 ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5795 }
5796 } else {
5797 // Just add the immediate values. These again are expected to be matched
5798 // as part of the address.
5799 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5800 }
5801 }
5802
5803 // Expand the unfolded offset portion.
5804 Immediate UnfoldedOffset = F.UnfoldedOffset;
5805 if (UnfoldedOffset.isNonZero()) {
5806 // Just add the immediate values.
5807 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5808 }
5809
5810 // Emit instructions summing all the operands.
5811 const SCEV *FullS = Ops.empty() ?
5812 SE.getConstant(IntTy, 0) :
5813 SE.getAddExpr(Ops);
5814 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5815
5816 // We're done expanding now, so reset the rewriter.
5817 Rewriter.clearPostInc();
5818
5819 // An ICmpZero Formula represents an ICmp which we're handling as a
5820 // comparison against zero. Now that we've expanded an expression for that
5821 // form, update the ICmp's other operand.
5822 if (LU.Kind == LSRUse::ICmpZero) {
5823 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5824 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5825 DeadInsts.emplace_back(OperandIsInstr);
5826 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5827 "a scale at the same time!");
5828 if (F.Scale == -1) {
5829 if (ICmpScaledV->getType() != OpTy) {
5830 Instruction *Cast = CastInst::Create(
5831 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5832 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5833 ICmpScaledV = Cast;
5834 }
5835 CI->setOperand(1, ICmpScaledV);
5836 } else {
5837 // A scale of 1 means that the scale has been expanded as part of the
5838 // base regs.
5839 assert((F.Scale == 0 || F.Scale == 1) &&
5840 "ICmp does not support folding a global value and "
5841 "a scale at the same time!");
5842 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5843 -(uint64_t)Offset.getFixedValue());
5844 if (C->getType() != OpTy) {
5845 C = ConstantFoldCastOperand(
5846 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5847 CI->getDataLayout());
5848 assert(C && "Cast of ConstantInt should have folded");
5849 }
5850
5851 CI->setOperand(1, C);
5852 }
5853 }
5854
5855 return FullV;
5856}
5857
5858/// Helper for Rewrite. PHI nodes are special because the use of their operands
5859/// effectively happens in their predecessor blocks, so the expression may need
5860/// to be expanded in multiple places.
5861void LSRInstance::RewriteForPHI(
5862 PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5863 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5864 DenseMap<BasicBlock *, Value *> Inserted;
5865
5866 // Inserting instructions in the loop and using them as PHI's input could
5867 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
5868 // corresponding incoming block is not loop exiting). So collect all such
5869 // instructions to form LCSSA for them later.
5870 SmallVector<Instruction *, 4> InsertedNonLCSSAInsts;
5871
5872 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5873 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5874 bool needUpdateFixups = false;
5875 BasicBlock *BB = PN->getIncomingBlock(i);
5876
5877 // If this is a critical edge, split the edge so that we do not insert
5878 // the code on all predecessor/successor paths. We do this unless this
5879 // is the canonical backedge for this loop, which complicates post-inc
5880 // users.
5881 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5882 !isa<IndirectBrInst>(BB->getTerminator()) &&
5883 !isa<CatchSwitchInst>(BB->getTerminator())) {
5884 BasicBlock *Parent = PN->getParent();
5885 Loop *PNLoop = LI.getLoopFor(Parent);
5886 if (!PNLoop || Parent != PNLoop->getHeader()) {
5887 // Split the critical edge.
5888 BasicBlock *NewBB = nullptr;
5889 if (!Parent->isLandingPad()) {
5890 NewBB =
5891 SplitCriticalEdge(BB, Parent,
5892 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5893 .setMergeIdenticalEdges()
5894 .setKeepOneInputPHIs());
5895 } else {
5896 SmallVector<BasicBlock*, 2> NewBBs;
5897 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5898 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5899 NewBB = NewBBs[0];
5900 }
5901 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5902 // phi predecessors are identical. The simple thing to do is skip
5903 // splitting in this case rather than complicate the API.
5904 if (NewBB) {
5905 // If PN is outside of the loop and BB is in the loop, we want to
5906 // move the block to be immediately before the PHI block, not
5907 // immediately after BB.
5908 if (L->contains(BB) && !L->contains(PN))
5909 NewBB->moveBefore(PN->getParent());
5910
5911 // Splitting the edge can reduce the number of PHI entries we have.
5912 e = PN->getNumIncomingValues();
5913 BB = NewBB;
5914 i = PN->getBasicBlockIndex(BB);
5915
5916 needUpdateFixups = true;
5917 }
5918 }
5919 }
5920
5921 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5922 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5923 if (!Pair.second)
5924 PN->setIncomingValue(i, Pair.first->second);
5925 else {
5926 Value *FullV =
5927 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5928
5929 // If this is reuse-by-noop-cast, insert the noop cast.
5930 Type *OpTy = LF.OperandValToReplace->getType();
5931 if (FullV->getType() != OpTy)
5932 FullV = CastInst::Create(
5933 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5934 LF.OperandValToReplace->getType(), "tmp",
5935 BB->getTerminator()->getIterator());
5936
5937 // If the incoming block for this value is not in the loop, it means the
5938 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5939 // the inserted value.
5940 if (auto *I = dyn_cast<Instruction>(FullV))
5941 if (L->contains(I) && !L->contains(BB))
5942 InsertedNonLCSSAInsts.push_back(I);
5943
5944 PN->setIncomingValue(i, FullV);
5945 Pair.first->second = FullV;
5946 }
5947
5948 // If LSR splits critical edge and phi node has other pending
5949 // fixup operands, we need to update those pending fixups. Otherwise
5950 // formulae will not be implemented completely and some instructions
5951 // will not be eliminated.
5952 if (needUpdateFixups) {
5953 for (LSRUse &LU : Uses)
5954 for (LSRFixup &Fixup : LU.Fixups)
5955 // If fixup is supposed to rewrite some operand in the phi
5956 // that was just updated, it may be already moved to
5957 // another phi node. Such fixup requires update.
5958 if (Fixup.UserInst == PN) {
5959 // Check if the operand we try to replace still exists in the
5960 // original phi.
5961 bool foundInOriginalPHI = false;
5962 for (const auto &val : PN->incoming_values())
5963 if (val == Fixup.OperandValToReplace) {
5964 foundInOriginalPHI = true;
5965 break;
5966 }
5967
5968 // If fixup operand found in original PHI - nothing to do.
5969 if (foundInOriginalPHI)
5970 continue;
5971
5972 // Otherwise it might be moved to another PHI and requires update.
5973 // If fixup operand not found in any of the incoming blocks that
5974 // means we have already rewritten it - nothing to do.
5975 for (const auto &Block : PN->blocks())
5976 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5977 ++I) {
5978 PHINode *NewPN = cast<PHINode>(I);
5979 for (const auto &val : NewPN->incoming_values())
5980 if (val == Fixup.OperandValToReplace)
5981 Fixup.UserInst = NewPN;
5982 }
5983 }
5984 }
5985 }
5986
5987 formLCSSAForInstructions(InsertedNonLCSSAInsts, DT, LI, &SE);
5988}
5989
5990/// Emit instructions for the leading candidate expression for this LSRUse (this
5991/// is called "expanding"), and update the UserInst to reference the newly
5992/// expanded value.
5993void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5994 const Formula &F,
5995 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5996 // First, find an insertion point that dominates UserInst. For PHI nodes,
5997 // find the nearest block which dominates all the relevant uses.
5998 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5999 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6000 } else {
6001 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6002
6003 // If this is reuse-by-noop-cast, insert the noop cast.
6004 Type *OpTy = LF.OperandValToReplace->getType();
6005 if (FullV->getType() != OpTy) {
6006 Instruction *Cast =
6007 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6008 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6009 FullV = Cast;
6010 }
6011
6012 // Update the user. ICmpZero is handled specially here (for now) because
6013 // Expand may have updated one of the operands of the icmp already, and
6014 // its new value may happen to be equal to LF.OperandValToReplace, in
6015 // which case doing replaceUsesOfWith leads to replacing both operands
6016 // with the same value. TODO: Reorganize this.
6017 if (LU.Kind == LSRUse::ICmpZero)
6018 LF.UserInst->setOperand(0, FullV);
6019 else
6020 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6021 }
6022
6023 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6024 DeadInsts.emplace_back(OperandIsInstr);
6025}
6026
6027// Try to hoist the IVInc to the loop header if all IVInc users are in
6028// the loop header. This helps the backend generate post-index load/store
6029// instructions when the latch block is different from the loop header block.
6030static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6031 const LSRUse &LU, Instruction *IVIncInsertPos,
6032 Loop *L) {
6033 if (LU.Kind != LSRUse::Address)
6034 return false;
6035
6036 // For now this code does the conservative optimization and only works for
6037 // the header block. Later we can hoist the IVInc to a block that
6038 // post-dominates all users.
6039 BasicBlock *LHeader = L->getHeader();
6040 if (IVIncInsertPos->getParent() == LHeader)
6041 return false;
6042
6043 if (!Fixup.OperandValToReplace ||
6044 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6045 Instruction *UI = cast<Instruction>(U);
6046 return UI->getParent() != LHeader;
6047 }))
6048 return false;
6049
6050 Instruction *I = Fixup.UserInst;
6051 Type *Ty = I->getType();
6052 return Ty->isIntegerTy() &&
6053 ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6054 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
6055}
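// For instance, when the only memory access using the IV lives in the loop
// header while the increment would otherwise be emitted in a separate latch
// block, hoisting the increment next to that load or store lets targets with
// post-indexed addressing fold the increment into the memory instruction.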
6056
6057/// Rewrite all the fixup locations with new values, following the chosen
6058/// solution.
6059void LSRInstance::ImplementSolution(
6060 const SmallVectorImpl<const Formula *> &Solution) {
6061 // Keep track of instructions we may have made dead, so that
6062 // we can remove them after we are done working.
6063 SmallVector<WeakTrackingVH, 16> DeadInsts;
6064
6065 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6066 for (const IVChain &Chain : IVChainVec) {
6067 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6068 Rewriter.setChainedPhi(PN);
6069 }
6070
6071 // Expand the new value definitions and update the users.
6072 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6073 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6074 Instruction *InsertPos =
6075 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6076 ? L->getHeader()->getTerminator()
6077 : IVIncInsertPos;
6078 Rewriter.setIVIncInsertPos(L, InsertPos);
6079 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6080 Changed = true;
6081 }
6082
6083 for (const IVChain &Chain : IVChainVec) {
6084 GenerateIVChain(Chain, DeadInsts);
6085 Changed = true;
6086 }
6087
6088 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6089 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6090 ScalarEvolutionIVs.push_back(IV);
6091
6092 // Clean up after ourselves. This must be done before deleting any
6093 // instructions.
6094 Rewriter.clear();
6095
6096 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6097 &TLI, MSSAU);
6098
6099 // In our cost analysis above, we assume that each addrec consumes exactly
6100 // one register, and arrange to have increments inserted just before the
6101 // latch to maximize the chance this is true. However, if we reused
6102 // existing IVs, we now need to move the increments to match our
6103 // expectations. Otherwise, our cost modeling results in us having
6104 // chosen a non-optimal result for the actual schedule. (And yes, this
6105 // scheduling decision does impact later codegen.)
6106 for (PHINode &PN : L->getHeader()->phis()) {
6107 BinaryOperator *BO = nullptr;
6108 Value *Start = nullptr, *Step = nullptr;
6109 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6110 continue;
6111
6112 switch (BO->getOpcode()) {
6113 case Instruction::Sub:
6114 if (BO->getOperand(0) != &PN)
6115 // sub is non-commutative - match handling elsewhere in LSR
6116 continue;
6117 break;
6118 case Instruction::Add:
6119 break;
6120 default:
6121 continue;
6122 };
6123
6124 if (!isa<Constant>(Step))
6125 // If not a constant step, might increase register pressure
6126 // (We assume constants have been canonicalized to RHS)
6127 continue;
6128
6129 if (BO->getParent() == IVIncInsertPos->getParent())
6130 // Only bother moving across blocks. Isel can handle block local case.
6131 continue;
6132
6133 // Can we legally schedule inc at the desired point?
6134 if (!llvm::all_of(BO->uses(),
6135 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6136 continue;
6137 BO->moveBefore(IVIncInsertPos);
6138 Changed = true;
6139 }
6140
6141
6142}
6143
6144LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6145 DominatorTree &DT, LoopInfo &LI,
6146 const TargetTransformInfo &TTI, AssumptionCache &AC,
6147 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6148 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6149 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6150 ? PreferredAddresingMode
6151 : TTI.getPreferredAddressingMode(L, &SE)),
6152 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6153 BaselineCost(L, SE, TTI, AMK) {
6154 // If LoopSimplify form is not available, stay out of trouble.
6155 if (!L->isLoopSimplifyForm())
6156 return;
6157
6158 // If there's no interesting work to be done, bail early.
6159 if (IU.empty()) return;
6160
6161 // If there's too much analysis to be done, bail early. We won't be able to
6162 // model the problem anyway.
6163 unsigned NumUsers = 0;
6164 for (const IVStrideUse &U : IU) {
6165 if (++NumUsers > MaxIVUsers) {
6166 (void)U;
6167 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6168 << "\n");
6169 return;
6170 }
6171 // Bail out if we have a PHI on an EHPad that gets a value from a
6172 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6173 // no good place to stick any instructions.
6174 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6175 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
6176 if (isa<FuncletPadInst>(FirstNonPHI) ||
6177 isa<CatchSwitchInst>(FirstNonPHI))
6178 for (BasicBlock *PredBB : PN->blocks())
6179 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
6180 return;
6181 }
6182 }
6183
6184 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6185 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6186 dbgs() << ":\n");
6187
6188 // Configure SCEVExpander already now, so the correct mode is used for
6189 // isSafeToExpand() checks.
6190#ifndef NDEBUG
6191 Rewriter.setDebugType(DEBUG_TYPE);
6192#endif
6193 Rewriter.disableCanonicalMode();
6194 Rewriter.enableLSRMode();
6195
6196 // First, perform some low-level loop optimizations.
6197 OptimizeShadowIV();
6198 OptimizeLoopTermCond();
6199
6200 // If loop preparation eliminates all interesting IV users, bail.
6201 if (IU.empty()) return;
6202
6203 // Skip nested loops until we can model them better with formulae.
6204 if (!L->isInnermost()) {
6205 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6206 return;
6207 }
6208
6209 // Start collecting data and preparing for the solver.
6210 // If number of registers is not the major cost, we cannot benefit from the
6211 // current profitable chain optimization which is based on number of
6212 // registers.
6213   // FIXME: add profitable chain optimization for other kinds of major cost, for
6214 // example number of instructions.
6215   if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6216     CollectChains();
6217 CollectInterestingTypesAndFactors();
6218 CollectFixupsAndInitialFormulae();
6219 CollectLoopInvariantFixupsAndFormulae();
6220
6221 if (Uses.empty())
6222 return;
6223
6224 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6225 print_uses(dbgs()));
6226 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6227 BaselineCost.print(dbgs()); dbgs() << "\n");
6228
6229 // Now use the reuse data to generate a bunch of interesting ways
6230 // to formulate the values needed for the uses.
6231 GenerateAllReuseFormulae();
6232
6233 FilterOutUndesirableDedicatedRegisters();
6234 NarrowSearchSpaceUsingHeuristics();
6235
6236   SmallVector<const Formula *, 8> Solution;
6237   Solve(Solution);
6238
6239 // Release memory that is no longer needed.
6240 Factors.clear();
6241 Types.clear();
6242 RegUses.clear();
6243
6244 if (Solution.empty())
6245 return;
6246
6247#ifndef NDEBUG
6248 // Formulae should be legal.
6249 for (const LSRUse &LU : Uses) {
6250 for (const Formula &F : LU.Formulae)
6251 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6252 F) && "Illegal formula generated!");
6253 };
6254#endif
6255
6256 // Now that we've decided what we want, make it so.
6257 ImplementSolution(Solution);
6258}
6259
6260#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6261void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6262 if (Factors.empty() && Types.empty()) return;
6263
6264 OS << "LSR has identified the following interesting factors and types: ";
6265 bool First = true;
6266
6267 for (int64_t Factor : Factors) {
6268 if (!First) OS << ", ";
6269 First = false;
6270 OS << '*' << Factor;
6271 }
6272
6273 for (Type *Ty : Types) {
6274 if (!First) OS << ", ";
6275 First = false;
6276 OS << '(' << *Ty << ')';
6277 }
6278 OS << '\n';
6279}
6280
6281void LSRInstance::print_fixups(raw_ostream &OS) const {
6282 OS << "LSR is examining the following fixup sites:\n";
6283 for (const LSRUse &LU : Uses)
6284 for (const LSRFixup &LF : LU.Fixups) {
6285       OS << "  ";
6286 LF.print(OS);
6287 OS << '\n';
6288 }
6289}
6290
6291void LSRInstance::print_uses(raw_ostream &OS) const {
6292 OS << "LSR is examining the following uses:\n";
6293 for (const LSRUse &LU : Uses) {
6294     OS << "  ";
6295 LU.print(OS);
6296 OS << '\n';
6297 for (const Formula &F : LU.Formulae) {
6298 OS << " ";
6299 F.print(OS);
6300 OS << '\n';
6301 }
6302 }
6303}
6304
6305void LSRInstance::print(raw_ostream &OS) const {
6306 print_factors_and_types(OS);
6307 print_fixups(OS);
6308 print_uses(OS);
6309}
6310
6311LLVM_DUMP_METHOD void LSRInstance::dump() const {
6312 print(errs()); errs() << '\n';
6313}
6314#endif
6315
6316namespace {
6317
6318class LoopStrengthReduce : public LoopPass {
6319public:
6320 static char ID; // Pass ID, replacement for typeid
6321
6322 LoopStrengthReduce();
6323
6324private:
6325 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6326 void getAnalysisUsage(AnalysisUsage &AU) const override;
6327};
6328
6329} // end anonymous namespace
6330
6331 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6332   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6333 }
6334
6335 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6336   // We split critical edges, so we change the CFG. However, we do update
6337   // many analyses if they are around.
6338   AU.addPreservedID(LoopSimplifyID);
6339
6340   AU.addRequired<LoopInfoWrapperPass>();
6341   AU.addPreserved<LoopInfoWrapperPass>();
6342   AU.addRequiredID(LoopSimplifyID);
6343   AU.addRequired<DominatorTreeWrapperPass>();
6344   AU.addPreserved<DominatorTreeWrapperPass>();
6345   AU.addRequired<ScalarEvolutionWrapperPass>();
6346   AU.addPreserved<ScalarEvolutionWrapperPass>();
6347   AU.addRequired<AssumptionCacheTracker>();
6348   AU.addRequired<TargetLibraryInfoWrapperPass>();
6349   // Requiring LoopSimplify a second time here prevents IVUsers from running
6350   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6351   AU.addRequiredID(LoopSimplifyID);
6352   AU.addRequired<IVUsersWrapperPass>();
6353   AU.addPreserved<IVUsersWrapperPass>();
6354   AU.addRequired<TargetTransformInfoWrapperPass>();
6355   AU.addPreserved<MemorySSAWrapperPass>();
6356 }
6357
6358namespace {
6359
6360 /// Enables more convenient iteration over a DWARF expression vector.
6361 static llvm::iterator_range<llvm::DIExpression::expr_op_iterator>
6362 ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6363   llvm::DIExpression::expr_op_iterator Begin =
6364       llvm::DIExpression::expr_op_iterator(Expr.begin());
6365   llvm::DIExpression::expr_op_iterator End =
6366       llvm::DIExpression::expr_op_iterator(Expr.end());
6367   return {Begin, End};
6368 }
6369
6370struct SCEVDbgValueBuilder {
6371 SCEVDbgValueBuilder() = default;
6372 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6373
6374 void clone(const SCEVDbgValueBuilder &Base) {
6375 LocationOps = Base.LocationOps;
6376 Expr = Base.Expr;
6377 }
6378
6379 void clear() {
6380 LocationOps.clear();
6381 Expr.clear();
6382 }
6383
6384   /// The DIExpression as we translate the SCEV.
6385   SmallVector<uint64_t, 6> Expr;
6386   /// The location ops of the DIExpression.
6387   SmallVector<Value *, 2> LocationOps;
6388
6389 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6390 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6391
6392 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6393 /// in the set of values referenced by the expression.
6394   void pushLocation(llvm::Value *V) {
6395     Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6396     auto *It = llvm::find(LocationOps, V);
6397 unsigned ArgIndex = 0;
6398 if (It != LocationOps.end()) {
6399 ArgIndex = std::distance(LocationOps.begin(), It);
6400 } else {
6401 ArgIndex = LocationOps.size();
6402 LocationOps.push_back(V);
6403 }
6404 Expr.push_back(ArgIndex);
6405 }
6406
6407 void pushValue(const SCEVUnknown *U) {
6408 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6409 pushLocation(V);
6410 }
6411
6412 bool pushConst(const SCEVConstant *C) {
6413 if (C->getAPInt().getSignificantBits() > 64)
6414 return false;
6415 Expr.push_back(llvm::dwarf::DW_OP_consts);
6416 Expr.push_back(C->getAPInt().getSExtValue());
6417 return true;
6418 }
6419
6420 // Iterating the expression as DWARF ops is convenient when updating
6421   // DW_OP_LLVM_args.
6422   llvm::iterator_range<llvm::DIExpression::expr_op_iterator> expr_ops() {
6423     return ToDwarfOpIter(Expr);
6424 }
6425
6426 /// Several SCEV types are sequences of the same arithmetic operator applied
6427 /// to constants and values that may be extended or truncated.
6428 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6429 uint64_t DwarfOp) {
6430 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6431 "Expected arithmetic SCEV type");
6432 bool Success = true;
6433 unsigned EmitOperator = 0;
6434 for (const auto &Op : CommExpr->operands()) {
6435 Success &= pushSCEV(Op);
6436
6437 if (EmitOperator >= 1)
6438 pushOperator(DwarfOp);
6439 ++EmitOperator;
6440 }
6441 return Success;
6442 }
6443
6444 // TODO: Identify and omit noop casts.
6445 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6446 const llvm::SCEV *Inner = C->getOperand(0);
6447 const llvm::Type *Type = C->getType();
6448 uint64_t ToWidth = Type->getIntegerBitWidth();
6449 bool Success = pushSCEV(Inner);
6450 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6451 IsSigned ? llvm::dwarf::DW_ATE_signed
6452 : llvm::dwarf::DW_ATE_unsigned};
6453 for (const auto &Op : CastOps)
6454 pushOperator(Op);
6455 return Success;
6456 }
6457
6458 // TODO: MinMax - although these haven't been encountered in the test suite.
6459 bool pushSCEV(const llvm::SCEV *S) {
6460 bool Success = true;
6461 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6462 Success &= pushConst(StartInt);
6463
6464 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6465 if (!U->getValue())
6466 return false;
6467 pushLocation(U->getValue());
6468
6469 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6470 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6471
6472 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6473 Success &= pushSCEV(UDiv->getLHS());
6474 Success &= pushSCEV(UDiv->getRHS());
6475 pushOperator(llvm::dwarf::DW_OP_div);
6476
6477 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6478       // Assert if a new and unknown SCEVCastExpr type is encountered.
6479 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6480 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6481 "Unexpected cast type in SCEV.");
6482 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6483
6484 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6485 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6486
6487 } else if (isa<SCEVAddRecExpr>(S)) {
6488       // Nested SCEVAddRecExprs are generated by nested loops and are currently
6489 // unsupported.
6490 return false;
6491
6492 } else {
6493 return false;
6494 }
6495 return Success;
6496 }
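  // For illustration (hypothetical location and SCEV): a location op %n whose
  // SCEV is (16 + (4 * %n)) would be translated by pushSCEV into a postfix
  // DWARF expression along the lines of
  //   DW_OP_consts 16, DW_OP_consts 4, DW_OP_LLVM_arg 0, DW_OP_mul, DW_OP_plus
  // where DW_OP_LLVM_arg 0 refers to %n in LocationOps; the exact operand
  // order follows the SCEV's canonical operand order.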
6497
6498 /// Return true if the combination of arithmetic operator and underlying
6499 /// SCEV constant value is an identity function.
6500 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6501 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6502 if (C->getAPInt().getSignificantBits() > 64)
6503 return false;
6504 int64_t I = C->getAPInt().getSExtValue();
6505 switch (Op) {
6506 case llvm::dwarf::DW_OP_plus:
6507 case llvm::dwarf::DW_OP_minus:
6508 return I == 0;
6509 case llvm::dwarf::DW_OP_mul:
6510 case llvm::dwarf::DW_OP_div:
6511 return I == 1;
6512 }
6513 }
6514 return false;
6515 }
6516
6517 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6518 /// builder's expression stack. The stack should already contain an
6519 /// expression for the iteration count, so that it can be multiplied by
6520 /// the stride and added to the start.
6521 /// Components of the expression are omitted if they are an identity function.
6522 /// Chain (non-affine) SCEVs are not supported.
6523 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6524 assert(SAR.isAffine() && "Expected affine SCEV");
6525 // TODO: Is this check needed?
6526 if (isa<SCEVAddRecExpr>(SAR.getStart()))
6527 return false;
6528
6529 const SCEV *Start = SAR.getStart();
6530 const SCEV *Stride = SAR.getStepRecurrence(SE);
6531
6532 // Skip pushing arithmetic noops.
6533 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6534 if (!pushSCEV(Stride))
6535 return false;
6536 pushOperator(llvm::dwarf::DW_OP_mul);
6537 }
6538 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6539 if (!pushSCEV(Start))
6540 return false;
6541 pushOperator(llvm::dwarf::DW_OP_plus);
6542 }
6543 return true;
6544 }
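  // Worked sketch (hypothetical SCEV): for a value with SCEV {%start,+,4}<%L>
  // and the iteration count already on the expression stack, this appends
  // roughly
  //   DW_OP_consts 4, DW_OP_mul, DW_OP_LLVM_arg <index of %start>, DW_OP_plus
  // i.e. value = iteration count * stride + start, with the multiply skipped
  // when the stride is 1 and the add skipped when the start is 0.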
6545
6546 /// Create an expression that is an offset from a value (usually the IV).
6547 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6548     pushLocation(OffsetValue);
6549     DIExpression::appendOffset(Expr, Offset);
6550 LLVM_DEBUG(
6551 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6552 << std::to_string(Offset) << "\n");
6553 }
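  // For illustration (hypothetical values): createOffsetExpr(16, %iv)
  // references %iv via DW_OP_LLVM_arg and then relies on
  // DIExpression::appendOffset to add the constant 16, so the salvaged
  // location is simply "%iv + 16" without needing a full iteration-count
  // expression.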
6554
6555 /// Combine a translation of the SCEV and the IV to create an expression that
6556 /// recovers a location's value.
6557   /// Returns true if an expression was created.
6558 bool createIterCountExpr(const SCEV *S,
6559 const SCEVDbgValueBuilder &IterationCount,
6560 ScalarEvolution &SE) {
6561     // SCEVs for SSA values are most frequently of the form
6562 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6563 // This is because %a is a PHI node that is not the IV. However, these
6564 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6565     // so it's not expected this point will be reached.
6566 if (!isa<SCEVAddRecExpr>(S))
6567 return false;
6568
6569 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6570 << '\n');
6571
6572 const auto *Rec = cast<SCEVAddRecExpr>(S);
6573 if (!Rec->isAffine())
6574 return false;
6575
6576     if (S->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6577       return false;
6578
6579 // Initialise a new builder with the iteration count expression. In
6580 // combination with the value's SCEV this enables recovery.
6581 clone(IterationCount);
6582 if (!SCEVToValueExpr(*Rec, SE))
6583 return false;
6584
6585 return true;
6586 }
6587
6588 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6589 /// builder's expression stack. The stack should already contain an
6590 /// expression for the iteration count, so that it can be multiplied by
6591 /// the stride and added to the start.
6592 /// Components of the expression are omitted if they are an identity function.
6593 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6594 ScalarEvolution &SE) {
6595 assert(SAR.isAffine() && "Expected affine SCEV");
6596 if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6597 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6598 << SAR << '\n');
6599 return false;
6600 }
6601 const SCEV *Start = SAR.getStart();
6602 const SCEV *Stride = SAR.getStepRecurrence(SE);
6603
6604 // Skip pushing arithmetic noops.
6605 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6606 if (!pushSCEV(Start))
6607 return false;
6608 pushOperator(llvm::dwarf::DW_OP_minus);
6609 }
6610 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6611 if (!pushSCEV(Stride))
6612 return false;
6613 pushOperator(llvm::dwarf::DW_OP_div);
6614 }
6615 return true;
6616 }
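  // Worked sketch (hypothetical SCEV): for an IV with SCEV {2,+,8}<%L>, the
  // caller first pushes the IV's location (DW_OP_LLVM_arg 0) and this routine
  // then appends DW_OP_consts 2, DW_OP_minus, DW_OP_consts 8, DW_OP_div, i.e.
  // iteration count = (IV - start) / stride; the subtraction is skipped when
  // the start is 0 and the division when the stride is 1.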
6617
6618 // Append the current expression and locations to a location list and an
6619 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6620 // the locations already present in the destination list.
6621 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6622 SmallVectorImpl<Value *> &DestLocations) {
6623 assert(!DestLocations.empty() &&
6624 "Expected the locations vector to contain the IV");
6625     // The DW_OP_LLVM_arg arguments of the expression being appended must be
6626 // modified to account for the locations already in the destination vector.
6627 // All builders contain the IV as the first location op.
6628 assert(!LocationOps.empty() &&
6629 "Expected the location ops to contain the IV.");
6630 // DestIndexMap[n] contains the index in DestLocations for the nth
6631 // location in this SCEVDbgValueBuilder.
6632 SmallVector<uint64_t, 2> DestIndexMap;
6633 for (const auto &Op : LocationOps) {
6634 auto It = find(DestLocations, Op);
6635 if (It != DestLocations.end()) {
6636 // Location already exists in DestLocations, reuse existing ArgIndex.
6637 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6638 continue;
6639 }
6640 // Location is not in DestLocations, add it.
6641 DestIndexMap.push_back(DestLocations.size());
6642 DestLocations.push_back(Op);
6643 }
6644
6645 for (const auto &Op : expr_ops()) {
6646 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6647 Op.appendToVector(DestExpr);
6648 continue;
6649 }
6650
6651       DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6652       // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6653 // DestIndexMap[n] contains its new index in DestLocations.
6654 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6655 DestExpr.push_back(NewIndex);
6656 }
6657 }
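  // For illustration (hypothetical values): if DestLocations currently holds
  // [%iv, %b] and this builder's LocationOps are [%iv, %a], then %a is
  // appended so DestLocations becomes [%iv, %b, %a] and DestIndexMap is
  // [0, 2]; a "DW_OP_LLVM_arg 1" in this builder is therefore rewritten to
  // "DW_OP_LLVM_arg 2" as it is copied into DestExpr.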
6658};
6659
6660/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6661/// and DIExpression.
6662struct DVIRecoveryRec {
6663 DVIRecoveryRec(DbgValueInst *DbgValue)
6664 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6665 HadLocationArgList(false) {}
6666 DVIRecoveryRec(DbgVariableRecord *DVR)
6667 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6668
6669   PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6670   DIExpression *Expr;
6671 bool HadLocationArgList;
6672   SmallVector<WeakVH, 2> LocationOps;
6673   SmallVector<const llvm::SCEV *, 2> SCEVs;
6674   SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6675
6676 void clear() {
6677 for (auto &RE : RecoveryExprs)
6678 RE.reset();
6679 RecoveryExprs.clear();
6680 }
6681
6682 ~DVIRecoveryRec() { clear(); }
6683};
6684} // namespace
6685
6686/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6687/// This helps in determining if a DIArglist is necessary or can be omitted from
6688/// the dbg.value.
6689 static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6690   auto expr_ops = ToDwarfOpIter(Expr);
6691 unsigned Count = 0;
6692 for (auto Op : expr_ops)
6693 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6694 Count++;
6695 return Count;
6696}
6697
6698/// Overwrites DVI with the location and Ops as the DIExpression. This will
6699/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6700/// because a DIArglist is not created for the first argument of the dbg.value.
6701template <typename T>
6702static void updateDVIWithLocation(T &DbgVal, Value *Location,
6703                                   SmallVectorImpl<uint64_t> &Ops) {
6704   assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6705 "contain any DW_OP_llvm_arg operands.");
6706 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6707 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6708 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6709}
6710
6711/// Overwrite DVI with locations placed into a DIArglist.
6712template <typename T>
6713static void updateDVIWithLocations(T &DbgVal,
6714 SmallVectorImpl<Value *> &Locations,
6715                                    SmallVectorImpl<uint64_t> &Ops) {
6716   assert(numLLVMArgOps(Ops) != 0 &&
6717 "Expected expression that references DIArglist locations using "
6718 "DW_OP_llvm_arg operands.");
6719   SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6720   for (Value *V : Locations)
6721 MetadataLocs.push_back(ValueAsMetadata::get(V));
6722 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6723 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6724 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6725}
6726
6727 /// Write the new expression and new location ops for the dbg.value. If
6728 /// possible, reduce the size of the dbg.value intrinsic by omitting the
6729 /// DIArglist. The DIArglist can be omitted if:
6730 /// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6731/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6732static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6733 SmallVectorImpl<Value *> &NewLocationOps,
6734                                SmallVectorImpl<uint64_t> &NewExpr) {
6735   auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6736 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6737 if (NumLLVMArgs == 0) {
6738 // Location assumed to be on the stack.
6739 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6740 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6741 // There is only a single DW_OP_llvm_arg at the start of the expression,
6742 // so it can be omitted along with DIArglist.
6743 assert(NewExpr[1] == 0 &&
6744 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6745       llvm::SmallVector<uint64_t, 6> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6746       updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6747 } else {
6748 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6749 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6750 }
6751
6752 // If the DIExpression was previously empty then add the stack terminator.
6753 // Non-empty expressions have only had elements inserted into them and so
6754 // the terminator should already be present e.g. stack_value or fragment.
6755 DIExpression *SalvageExpr = DbgVal->getExpression();
6756 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6757 SalvageExpr =
6758 DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6759 DbgVal->setExpression(SalvageExpr);
6760 }
6761 };
6762 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6763 UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6764 else
6765 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6766}
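// For illustration (hypothetical expression): a salvaged expression such as
//   [DW_OP_LLVM_arg, 0, DW_OP_consts, 8, DW_OP_plus, DW_OP_stack_value]
// with a single location op is shrunk to a direct location plus
//   [DW_OP_consts, 8, DW_OP_plus, DW_OP_stack_value]
// so no DIArgList is needed; with two or more DW_OP_llvm_arg operands the
// DIArgList form is kept.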
6767
6768/// Cached location ops may be erased during LSR, in which case a poison is
6769/// required when restoring from the cache. The type of that location is no
6770/// longer available, so just use int8. The poison will be replaced by one or
6771/// more locations later when a SCEVDbgValueBuilder selects alternative
6772/// locations to use for the salvage.
6773 static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6774   return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6775}
6776
6777/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6778static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6779 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6780 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6781 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6782 assert(DVIRec.Expr && "Expected an expression");
6783 DbgVal->setExpression(DVIRec.Expr);
6784
6785 // Even a single location-op may be inside a DIArgList and referenced with
6786 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6787 if (!DVIRec.HadLocationArgList) {
6788 assert(DVIRec.LocationOps.size() == 1 &&
6789 "Unexpected number of location ops.");
6790 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6791 // this case was not present before, so force the location back to a
6792 // single uncontained Value.
6793 Value *CachedValue =
6794 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6795 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6796 } else {
6797       SmallVector<ValueAsMetadata *, 3> MetadataLocs;
6798       for (WeakVH VH : DVIRec.LocationOps) {
6799 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6800 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6801 }
6802 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6803 DbgVal->setRawLocation(
6804 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6805 }
6806 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6807 };
6808 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6809 RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6810 else
6811 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6812}
6813
6814 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6815                        llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6816 const SCEV *SCEVInductionVar,
6817 SCEVDbgValueBuilder IterCountExpr) {
6818
6819 if (isa<DbgValueInst *>(DVIRec.DbgRef)
6820 ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6821 : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6822 return false;
6823
6824 // LSR may have caused several changes to the dbg.value in the failed salvage
6825 // attempt. So restore the DIExpression, the location ops and also the
6826 // location ops format, which is always DIArglist for multiple ops, but only
6827 // sometimes for a single op.
6828   restorePreTransformState(DVIRec);
6829
6830 // LocationOpIndexMap[i] will store the post-LSR location index of
6831 // the non-optimised out location at pre-LSR index i.
6832 SmallVector<int64_t, 2> LocationOpIndexMap;
6833 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6834 SmallVector<Value *, 2> NewLocationOps;
6835 NewLocationOps.push_back(LSRInductionVar);
6836
6837 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6838 WeakVH VH = DVIRec.LocationOps[i];
6839 // Place the locations not optimised out in the list first, avoiding
6840 // inserts later. The map is used to update the DIExpression's
6841 // DW_OP_LLVM_arg arguments as the expression is updated.
6842 if (VH && !isa<UndefValue>(VH)) {
6843 NewLocationOps.push_back(VH);
6844 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6845 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6846 << " now at index " << LocationOpIndexMap[i] << "\n");
6847 continue;
6848 }
6849
6850 // It's possible that a value referred to in the SCEV may have been
6851 // optimised out by LSR.
6852 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6853 SE.containsUndefs(DVIRec.SCEVs[i])) {
6854 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6855 << " refers to a location that is now undef or erased. "
6856 "Salvage abandoned.\n");
6857 return false;
6858 }
6859
6860 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6861 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6862
6863 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6864 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6865
6866 // Create an offset-based salvage expression if possible, as it requires
6867 // less DWARF ops than an iteration count-based expression.
6868 if (std::optional<APInt> Offset =
6869 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6870 if (Offset->getSignificantBits() <= 64)
6871 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6872 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6873 SE))
6874 return false;
6875 }
6876
6877 // Merge the DbgValueBuilder generated expressions and the original
6878   // DIExpression, place the result into a new vector.
6879   SmallVector<uint64_t, 6> NewExpr;
6880 if (DVIRec.Expr->getNumElements() == 0) {
6881 assert(DVIRec.RecoveryExprs.size() == 1 &&
6882 "Expected only a single recovery expression for an empty "
6883 "DIExpression.");
6884 assert(DVIRec.RecoveryExprs[0] &&
6885 "Expected a SCEVDbgSalvageBuilder for location 0");
6886 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6887 B->appendToVectors(NewExpr, NewLocationOps);
6888 }
6889 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6890 // Most Ops needn't be updated.
6891 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6892 Op.appendToVector(NewExpr);
6893 continue;
6894 }
6895
6896 uint64_t LocationArgIndex = Op.getArg(0);
6897 SCEVDbgValueBuilder *DbgBuilder =
6898 DVIRec.RecoveryExprs[LocationArgIndex].get();
6899     // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6900 // optimise it away. So just translate the argument to the updated
6901 // location index.
6902 if (!DbgBuilder) {
6903 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6904 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6905 "Expected a positive index for the location-op position.");
6906 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6907 continue;
6908 }
6909 // The location has a recovery expression.
6910 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6911 }
6912
6913 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6914 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6915 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6916 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6917 else
6918 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6919 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6920 return true;
6921}
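// For illustration (hypothetical names): if a dbg.value's pre-LSR SCEV differs
// from the chosen IV's SCEV by a constant 12, the loop above emits the cheap
// offset expression (roughly "%lsr.iv + 12", where %lsr.iv stands for the
// post-LSR induction variable); otherwise it falls back to the
// iteration-count based rewrite built from IterCountExpr.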
6922
6923/// Obtain an expression for the iteration count, then attempt to salvage the
6924/// dbg.value intrinsics.
6925 static void DbgRewriteSalvageableDVIs(
6926     llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6927 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6928 if (DVIToUpdate.empty())
6929 return;
6930
6931 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6932 assert(SCEVInductionVar &&
6933 "Anticipated a SCEV for the post-LSR induction variable");
6934
6935 if (const SCEVAddRecExpr *IVAddRec =
6936 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6937 if (!IVAddRec->isAffine())
6938 return;
6939
6940 // Prevent translation using excessive resources.
6941 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6942 return;
6943
6944 // The iteration count is required to recover location values.
6945 SCEVDbgValueBuilder IterCountExpr;
6946 IterCountExpr.pushLocation(LSRInductionVar);
6947 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6948 return;
6949
6950 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6951 << '\n');
6952
6953 for (auto &DVIRec : DVIToUpdate) {
6954 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6955 IterCountExpr);
6956 }
6957 }
6958}
6959
6960/// Identify and cache salvageable DVI locations and expressions along with the
6961/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6962 /// caching and salvaging.
6963 static void DbgGatherSalvagableDVI(
6964     Loop *L, ScalarEvolution &SE,
6965 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6966 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6967 for (const auto &B : L->getBlocks()) {
6968 for (auto &I : *B) {
6969 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6970         // Ensure that the dbg.value is not cached if any of its location
6971         // ops are undef.
6972 if (DbgVal->isKillLocation())
6973 return false;
6974
6975 // Check that the location op SCEVs are suitable for translation to
6976 // DIExpression.
6977 const auto &HasTranslatableLocationOps =
6978 [&](const auto *DbgValToTranslate) -> bool {
6979 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6980 if (!LocOp)
6981 return false;
6982
6983 if (!SE.isSCEVable(LocOp->getType()))
6984 return false;
6985
6986 const SCEV *S = SE.getSCEV(LocOp);
6987 if (SE.containsUndefs(S))
6988 return false;
6989 }
6990 return true;
6991 };
6992
6993 if (!HasTranslatableLocationOps(DbgVal))
6994 return false;
6995
6996 std::unique_ptr<DVIRecoveryRec> NewRec =
6997 std::make_unique<DVIRecoveryRec>(DbgVal);
6998 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6999 // it. Pre-allocating a vector will enable quick lookups of the builder
7000 // later during the salvage.
7001 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
7002 for (const auto LocOp : DbgVal->location_ops()) {
7003 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7004 NewRec->LocationOps.push_back(LocOp);
7005 NewRec->HadLocationArgList = DbgVal->hasArgList();
7006 }
7007 SalvageableDVISCEVs.push_back(std::move(NewRec));
7008 return true;
7009 };
7010 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
7011 if (DVR.isDbgValue() || DVR.isDbgAssign())
7012 ProcessDbgValue(&DVR);
7013 }
7014 auto DVI = dyn_cast<DbgValueInst>(&I);
7015 if (!DVI)
7016 continue;
7017 if (ProcessDbgValue(DVI))
7018 DVIHandles.insert(DVI);
7019 }
7020 }
7021}
7022
7023/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7024 /// any PHI from the loop header is usable, but may have less chance of
7025/// surviving subsequent transforms.
7026 static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
7027                                            const LSRInstance &LSR) {
7028
7029 auto IsSuitableIV = [&](PHINode *P) {
7030 if (!SE.isSCEVable(P->getType()))
7031 return false;
7032 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7033 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7034 return false;
7035 };
7036
7037 // For now, just pick the first IV that was generated and inserted by
7038 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7039 // by subsequent transforms.
7040 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7041 if (!IV)
7042 continue;
7043
7044 // There should only be PHI node IVs.
7045 PHINode *P = cast<PHINode>(&*IV);
7046
7047 if (IsSuitableIV(P))
7048 return P;
7049 }
7050
7051 for (PHINode &P : L.getHeader()->phis()) {
7052 if (IsSuitableIV(&P))
7053 return &P;
7054 }
7055 return nullptr;
7056}
7057
7058 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7059                                DominatorTree &DT, LoopInfo &LI,
7060                                const TargetTransformInfo &TTI,
7061                                AssumptionCache &AC, TargetLibraryInfo &TLI,
7062                                MemorySSA *MSSA) {
7063
7064   // Debug preservation - before we start removing anything, identify which
7065   // DVIs meet the salvageable criteria and store their DIExpression and SCEVs.
7066 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7067   SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
7068   DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7069
7070 bool Changed = false;
7071 std::unique_ptr<MemorySSAUpdater> MSSAU;
7072 if (MSSA)
7073 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7074
7075 // Run the main LSR transformation.
7076 const LSRInstance &Reducer =
7077 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7078 Changed |= Reducer.getChanged();
7079
7080 // Remove any extra phis created by processing inner loops.
7081 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7082 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7083     SmallVector<WeakTrackingVH, 16> DeadInsts;
7084     const DataLayout &DL = L->getHeader()->getDataLayout();
7085 SCEVExpander Rewriter(SE, DL, "lsr", false);
7086#ifndef NDEBUG
7087 Rewriter.setDebugType(DEBUG_TYPE);
7088#endif
7089 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7090 Rewriter.clear();
7091 if (numFolded) {
7092 Changed = true;
7093       RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7094                                                            MSSAU.get());
7095 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7096 }
7097 }
7098 // LSR may at times remove all uses of an induction variable from a loop.
7099 // The only remaining use is the PHI in the exit block.
7100 // When this is the case, if the exit value of the IV can be calculated using
7101 // SCEV, we can replace the exit block PHI with the final value of the IV and
7102 // skip the updates in each loop iteration.
7103 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7104     SmallVector<WeakTrackingVH, 16> DeadInsts;
7105     const DataLayout &DL = L->getHeader()->getDataLayout();
7106 SCEVExpander Rewriter(SE, DL, "lsr", true);
7107 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7108 UnusedIndVarInLoop, DeadInsts);
7109 Rewriter.clear();
7110 if (Rewrites) {
7111 Changed = true;
7112       RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7113                                                            MSSAU.get());
7114 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7115 }
7116 }
7117
7118 if (SalvageableDVIRecords.empty())
7119 return Changed;
7120
7121 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7122 // expressions composed using the derived iteration count.
7123 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7124 for (const auto &L : LI) {
7125 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7126 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7127 else {
7128 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7129 "could not be identified.\n");
7130 }
7131 }
7132
7133 for (auto &Rec : SalvageableDVIRecords)
7134 Rec->clear();
7135 SalvageableDVIRecords.clear();
7136 DVIHandles.clear();
7137 return Changed;
7138}
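// Usage note: this transformation is registered as "loop-reduce" (see the
// INITIALIZE_PASS macros below), so its effect can be inspected in isolation
// with an invocation along the lines of
//   opt -passes=loop-reduce -S input.ll
// (the input file name here is purely illustrative).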
7139
7140bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7141 if (skipLoop(L))
7142 return false;
7143
7144 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7145 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7146 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7147 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7148 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7149 *L->getHeader()->getParent());
7150 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7151 *L->getHeader()->getParent());
7152 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7153 *L->getHeader()->getParent());
7154 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7155 MemorySSA *MSSA = nullptr;
7156 if (MSSAAnalysis)
7157 MSSA = &MSSAAnalysis->getMSSA();
7158 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7159}
7160
7161 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7162                                               LoopStandardAnalysisResults &AR,
7163                                               LPMUpdater &) {
7164 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7165 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7166 return PreservedAnalyses::all();
7167
7168 auto PA = getLoopPassPreservedAnalyses();
7169 if (AR.MSSA)
7170 PA.preserve<MemorySSAAnalysis>();
7171 return PA;
7172}
7173
7174char LoopStrengthReduce::ID = 0;
7175
7176INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7177 "Loop Strength Reduction", false, false)
7178 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7179 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7180 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7181 INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
7182 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7183 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7184INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7185 "Loop Strength Reduction", false, false)
7186
7187Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:533
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:148
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1309
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:512
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs, SmallSet< AssertingVH< DbgValueInst >, 2 > &DVIHandles)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:237
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class recording the (high level) value of a variable.
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1498
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1614
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1489
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1706
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:270
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Value handle that asserts if the Value is deleted.
Definition: ValueHandle.h:264
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:376
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:677
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Conditional or Unconditional Branch instruction.
bool isUnconditional() const
Value * getCondition() const
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1575
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
This is an important base class in LLVM.
Definition: Constant.h:42
static DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
This represents the llvm.dbg.value instruction.
Record of a variable value-assignment, aka a non instruction representation of the dbg....
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
NodeT * getBlock() const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2686
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:367
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:142
bool empty() const
Definition: IVUsers.h:147
void print(raw_ostream &OS) const
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:824
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:174
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:593
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:924
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:981
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:697
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
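A hedged sketch of the PHINode API above; Ty, Header, Preheader, Latch, StartVal and NextVal are assumed placeholders:
  PHINode *IVPhi = PHINode::Create(Ty, /*NumReservedValues=*/2, "lsr.iv",
                                   Header->begin());
  IVPhi->addIncoming(StartVal, Preheader);       // value entering the loop
  IVPhi->addIncoming(NextVal, Latch);            // value arriving via the backedge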
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PointerIntPair - This class implements a pair of a pointer and small integer.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all analyses.
Definition: Analysis.h:117
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getVScale(Type *Ty)
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUnknown(Value *V)
std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
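A hedged sketch of how the ScalarEvolution queries above combine to recognize an affine induction variable; SE, V and L are assumed to be in scope:
  if (SE.isSCEVable(V->getType())) {
    const SCEV *S = SE.getSCEV(V);
    if (const auto *AR = dyn_cast<SCEVAddRecExpr>(S))
      if (AR->getLoop() == L && AR->isAffine()) {
        const SCEV *Step = AR->getStepRecurrence(SE);  // loop-invariant stride
        (void)Step;                                    // a candidate for strength reduction
      }
  }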
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
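A hedged example of why a SetVector suits register lists: it rejects duplicates while preserving insertion order; Regs, S0 and S1 are placeholders:
  SetVector<const SCEV *> Regs;
  Regs.insert(S0);
  Regs.insert(S1);
  Regs.insert(S0);                 // duplicate, ignored; size() remains 2
  for (const SCEV *R : Regs)
    (void)R;                       // visited in insertion order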
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
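A hedged sketch of the two equivalent ways of walking set bits listed above; NumRegs is a placeholder size:
  SmallBitVector BV(NumRegs);
  BV.set(3);
  for (unsigned i : BV.set_bits())
    (void)i;                                      // process register index i
  for (int i = BV.find_first(); i != -1; i = BV.find_next(i))
    (void)i;                                      // same traversal, spelled out explicitly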
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:346
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:435
iterator end() const
Definition: SmallPtrSet.h:460
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:367
iterator begin() const
Definition: SmallPtrSet.h:455
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
void clear()
Definition: SmallSet.h:218
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
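A hedged illustration of the insert-returns-pair idiom shared by SmallPtrSet and SmallSet above; Visited and I are placeholder names:
  SmallPtrSet<Instruction *, 8> Visited;
  if (!Visited.insert(I).second)
    return;                        // already processed, skip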
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition: TypeSize.h:44
An instruction for storing to memory.
Definition: Instructions.h:290
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
bool isProfitableLSRChainElement(Instruction *I) const
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAddScalableImmediate(int64_t Imm) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isNumRegsMajorCostOfLSR() const
Return true if LSR major cost is number of registers.
@ MIM_PostInc
Post-incrementing.
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
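A hedged sketch of the legality query LSR-style code makes with the hook above; every variable is a placeholder standing in for one decomposed addressing formula:
  bool Legal = TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset,
                                         HasBaseReg, Scale, AddrSpace);
  // Formulae whose (BaseGV + BaseOffset + HasBaseReg + Scale * Reg) decomposition
  // fails this check are typically penalized or rejected.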
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static IntegerType * getInt8Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
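A hedged example combining the Type queries above with IntegerType::get; Ctx, SE and Ty are assumed in scope, and the width query is only meaningful when Ty is SCEV-analyzable:
  uint64_t BW = SE.getTypeSizeInBits(Ty);                   // defined for integers and pointers
  IntegerType *IntTy = IntegerType::get(Ctx, (unsigned)BW); // an integer type of the same width
  bool IsPtr = Ty->isPointerTy();                           // pointers keep their pointer type in IR
  (void)IntTy; (void)IsPtr;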
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
op_iterator op_end()
Definition: User.h:236
static ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:501
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
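A hedged sketch of the Value use-list API above; OldV and NewV are placeholder Value pointers:
  for (User *U : OldV->users())
    if (auto *UserInst = dyn_cast<Instruction>(U))
      (void)UserInst;              // inspect each user instruction
  OldV->replaceAllUsesWith(NewV);  // afterwards OldV has no remaining uses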
A Value handle that is allowed to be null.
Definition: ValueHandle.h:144
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Key
PAL metadata keys.
@ Entry
Definition: COFF.h:826
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Reg
All possible values of the reg field in the ModR/M byte.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:147
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:143
constexpr double e
Definition: MathExtras.h:47
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:480
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2060
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2098
char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
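A hedged sketch of the normalize/denormalize pair above; S, L and SE are assumed in scope, and PostIncLoopSet is the usual small set of loops:
  PostIncLoopSet Loops;
  Loops.insert(L);
  if (const SCEV *Norm = normalizeForPostIncUse(S, Loops, SE)) {
    const SCEV *RoundTrip = denormalizeForPostIncUse(Norm, Loops, SE);
    (void)RoundTrip;               // for invertible expressions this equals S again
  }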
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1921
Pass * createLoopStrengthReducePass()
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:555
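A hedged sketch of the permissive cleanup entry point above; it takes weak tracking handles, so entries already erased or replaced are skipped safely. DeadInst, TLI and MSSAU are placeholders:
  SmallVector<WeakTrackingVH, 16> DeadInsts;
  DeadInsts.push_back(DeadInst);   // an Instruction * expected to become trivially dead
  RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI, MSSAU);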
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:325
void initializeLoopStrengthReducePass(PassRegistry &)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1489
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1886
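A hedged sketch of the range-based STLExtras helpers referenced above (find, all_of, any_of, none_of, find_if, is_contained); Regs is a placeholder container of const SCEV * and SomeReg a value to look for:
  bool AnyZero = any_of(Regs, [](const SCEV *S) { return S->isZero(); });
  bool AllZero = all_of(Regs, [](const SCEV *S) { return S->isZero(); });
  auto It = find_if(Regs, [](const SCEV *S) { return isa<SCEVConstant>(S); });
  bool Known = is_contained(Regs, SomeReg);
  (void)AnyZero; (void)AllZero; (void)It; (void)Known;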
@ UnusedIndVarInLoop
Definition: LoopUtils.h:472
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Option class for critical edge splitting.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.