LLVM 22.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
// NOTE(review): every command-line flag below is missing its opening
// "static cl::opt<...> <VariableName>(" line — those lines were dropped when
// this listing was extracted. Restore them from the upstream source; the
// option string and cl::desc below identify each flag.
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
// Override of the target's preferred addressing mode (none/pre/post/all).
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
// Cap on the size of the LSR search space before narrowing kicks in.
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
// Recursion depth bound used when computing a formula's setup cost.
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
// Allow discarding the computed solution when it looks less profitable.
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
// Toggle recognition of vscale-relative immediates (scalable vectors).
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
// In release builds the stress flag is a compile-time constant false.
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
  // Print "[NumUses=<n>]" to OS (debug-build helper; defined below).
244 void print(raw_ostream &OS) const;
  // Print to errs() followed by a newline; callable from a debugger.
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
335 if (Scalable)
336 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
337 return SU;
338 }
339};
340
341// This is needed for the Compare type of std::map when Immediate is used
342// as a key. We don't need it to be fully correct against any value of vscale,
343// just to make sure that vscale-related terms in the map are considered against
344// each other rather than being mixed up and potentially missing opportunities.
345struct KeyOrderTargetImmediate {
346 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
347 if (LHS.isScalable() && !RHS.isScalable())
348 return false;
349 if (!LHS.isScalable() && RHS.isScalable())
350 return true;
351 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
352 }
353};
354
355// This would be nicer if we could be generic instead of directly using size_t,
356// but there doesn't seem to be a type trait for is_orderable or
357// is_lessthan_comparable or similar.
358struct KeyOrderSizeTAndImmediate {
359 bool operator()(const std::pair<size_t, Immediate> &LHS,
360 const std::pair<size_t, Immediate> &RHS) const {
361 size_t LSize = LHS.first;
362 size_t RSize = RHS.first;
363 if (LSize != RSize)
364 return LSize < RSize;
365 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
366 }
367};
368} // end anonymous namespace
369
370#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
371void RegSortData::print(raw_ostream &OS) const {
372 OS << "[NumUses=" << UsedByIndices.count() << ']';
373}
374
375LLVM_DUMP_METHOD void RegSortData::dump() const {
376 print(errs()); errs() << '\n';
377}
378#endif
379
380namespace {
381
382/// Map register candidates to information about how they are used.
383class RegUseTracker {
384 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
385
386 RegUsesTy RegUsesMap;
  // NOTE(review): a member declaration was dropped by extraction here
  // (upstream line 387) — the insertion-ordered register sequence
  // ("RegSequence", presumably a SmallVector of const SCEV *) that the
  // begin()/end() accessors below iterate. Confirm against upstream.
388
389public:
  // Record that use LUIdx references Reg (registers Reg on first sight).
390 void countRegister(const SCEV *Reg, size_t LUIdx);
  // Clear the LUIdx bit for Reg; Reg itself remains tracked.
391 void dropRegister(const SCEV *Reg, size_t LUIdx);
  // Replace use LUIdx's bits with LastLUIdx's, then truncate to LastLUIdx.
392 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
393
394 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
395
396 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
397
398 void clear();
399
  // NOTE(review): the iterator/const_iterator typedefs (upstream lines
  // 400-401) were dropped by extraction; they alias the RegSequence
  // container's iterator types.
402
403 iterator begin() { return RegSequence.begin(); }
404 iterator end() { return RegSequence.end(); }
405 const_iterator begin() const { return RegSequence.begin(); }
406 const_iterator end() const { return RegSequence.end(); }
407};
408
409} // end anonymous namespace
410
411void
412RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
413 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
414 RegSortData &RSD = Pair.first->second;
415 if (Pair.second)
416 RegSequence.push_back(Reg);
417 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
418 RSD.UsedByIndices.set(LUIdx);
419}
420
421void
422RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
423 RegUsesTy::iterator It = RegUsesMap.find(Reg);
424 assert(It != RegUsesMap.end());
425 RegSortData &RSD = It->second;
426 assert(RSD.UsedByIndices.size() > LUIdx);
427 RSD.UsedByIndices.reset(LUIdx);
428}
429
430void
431RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
432 assert(LUIdx <= LastLUIdx);
433
434 // Update RegUses. The data structure is not optimized for this purpose;
435 // we must iterate through it and update each of the bit vectors.
436 for (auto &Pair : RegUsesMap) {
437 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
438 if (LUIdx < UsedByIndices.size())
439 UsedByIndices[LUIdx] =
440 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
441 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
442 }
443}
444
445bool
446RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
447 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
448 if (I == RegUsesMap.end())
449 return false;
450 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
451 int i = UsedByIndices.find_first();
452 if (i == -1) return false;
453 if ((size_t)i != LUIdx) return true;
454 return UsedByIndices.find_next(i) != -1;
455}
456
457const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
458 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
459 assert(I != RegUsesMap.end() && "Unknown register!");
460 return I->second.UsedByIndices;
461}
462
463void RegUseTracker::clear() {
464 RegUsesMap.clear();
465 RegSequence.clear();
466}
467
468namespace {
469
470/// This class holds information that describes a formula for computing
471/// satisfying a use. It may include broken-out immediates and scaled registers.
472struct Formula {
473 /// Global base address used for complex addressing.
474 GlobalValue *BaseGV = nullptr;
475
476 /// Base offset for complex addressing.
477 Immediate BaseOffset = Immediate::getZero();
478
479 /// Whether any complex addressing has a base register.
480 bool HasBaseReg = false;
481
482 /// The scale of any complex addressing.
483 int64_t Scale = 0;
484
485 /// The list of "base" registers for this use. When this is non-empty. The
486 /// canonical representation of a formula is
487 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
488 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
489 /// 3. The reg containing recurrent expr related with currect loop in the
490 /// formula should be put in the ScaledReg.
491 /// #1 enforces that the scaled register is always used when at least two
492 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
493 /// #2 enforces that 1 * reg is reg.
494 /// #3 ensures invariant regs with respect to current loop can be combined
495 /// together in LSR codegen.
496 /// This invariant can be temporarily broken while building a formula.
497 /// However, every formula inserted into the LSRInstance must be in canonical
498 /// form.
  // NOTE(review): the BaseRegs member declaration (upstream line 499) was
  // dropped by extraction — it is the container the comment above documents,
  // presumably a SmallVector of const SCEV *. Confirm against upstream.
500
501 /// The 'scaled' register for this use. This should be non-null when Scale is
502 /// not zero.
503 const SCEV *ScaledReg = nullptr;
504
505 /// An additional constant offset which added near the use. This requires a
506 /// temporary register, but the offset itself can live in an add immediate
507 /// field rather than a register.
508 Immediate UnfoldedOffset = Immediate::getZero();
509
510 Formula() = default;
511
  // Build this formula from S; defined below.
512 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
513
514 bool isCanonical(const Loop &L) const;
515
516 void canonicalize(const Loop &L);
517
518 bool unscale();
519
520 bool hasZeroEnd() const;
521
522 bool countsDownToZero() const;
523
524 size_t getNumRegs() const;
525 Type *getType() const;
526
527 void deleteBaseReg(const SCEV *&S);
528
529 bool referencesReg(const SCEV *S) const;
530 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
531 const RegUseTracker &RegUses) const;
532
533 void print(raw_ostream &OS) const;
534 void dump() const;
535};
536
537} // end anonymous namespace
538
539/// Recursion helper for initialMatch.
// NOTE(review): the two output-parameter lines of the signature (upstream
// lines 541-542, the "Good" and "Bad" vector references filled in below)
// were dropped by extraction. Confirm against upstream.
540static void DoInitialMatch(const SCEV *S, Loop *L,
543 ScalarEvolution &SE) {
544 // Collect expressions which properly dominate the loop header.
545 if (SE.properlyDominates(S, L->getHeader())) {
546 Good.push_back(S);
547 return;
548 }
549
550 // Look at add operands.
551 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
552 for (const SCEV *S : Add->operands())
553 DoInitialMatch(S, L, Good, Bad, SE);
554 return;
555 }
556
557 // Look at addrec operands.
558 const SCEV *Start, *Step;
559 const Loop *ARLoop;
560 if (match(S,
561 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
562 !Start->isZero()) {
  // Split {Start,+,Step} into Start plus a zero-based recurrence so each
  // part can be classified independently.
563 DoInitialMatch(Start, L, Good, Bad, SE);
564 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
565 // FIXME: AR->getNoWrapFlags()
566 ARLoop, SCEV::FlagAnyWrap),
567 L, Good, Bad, SE);
568 return;
569 }
570
571 // Handle a multiplication by -1 (negation) if it didn't fold.
572 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
573 if (Mul->getOperand(0)->isAllOnesValue()) {
  // NOTE(review): the declaration of Ops (upstream line 574, collecting
  // Mul's operands without the leading -1) was dropped by extraction.
575 const SCEV *NewMul = SE.getMulExpr(Ops);
576
  // NOTE(review): the declarations of MyGood/MyBad (upstream lines
  // 577-578) were dropped by extraction.
579 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
580 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
581 SE.getEffectiveSCEVType(NewMul->getType())));
  // Re-apply the negation to everything classified from the un-negated
  // expression.
582 for (const SCEV *S : MyGood)
583 Good.push_back(SE.getMulExpr(NegOne, S));
584 for (const SCEV *S : MyBad)
585 Bad.push_back(SE.getMulExpr(NegOne, S));
586 return;
587 }
588
589 // Ok, we can't do anything interesting. Just stuff the whole thing into a
590 // register and hope for the best.
591 Bad.push_back(S);
592}
593
594/// Incorporate loop-variant parts of S into this Formula, attempting to keep
595/// all loop-invariant and loop-computable values in a single base register.
596void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
  // NOTE(review): the local declarations of the Good/Bad vectors (upstream
  // lines 597-598) were dropped by extraction.
599 DoInitialMatch(S, L, Good, Bad, SE);
  // Fold the loop-invariant parts into a single base register.
600 if (!Good.empty()) {
601 const SCEV *Sum = SE.getAddExpr(Good);
602 if (!Sum->isZero())
603 BaseRegs.push_back(Sum);
  // Record that a base register is conceptually present even if the sum
  // folded to zero.
604 HasBaseReg = true;
605 }
  // Likewise for the loop-variant ("bad") parts.
606 if (!Bad.empty()) {
607 const SCEV *Sum = SE.getAddExpr(Bad);
608 if (!Sum->isZero())
609 BaseRegs.push_back(Sum);
610 HasBaseReg = true;
611 }
612 canonicalize(*L);
613}
614
615static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
616 return SCEVExprContains(S, [&L](const SCEV *S) {
617 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
618 });
619}
620
621/// Check whether or not this formula satisfies the canonical
622/// representation.
623/// \see Formula::BaseRegs.
624bool Formula::isCanonical(const Loop &L) const {
625 assert((Scale == 0 || ScaledReg) &&
626 "ScaledReg must be non-null if Scale is non-zero");
627
  // Without a scaled register, at most one base register is canonical.
628 if (!ScaledReg)
629 return BaseRegs.size() <= 1;
630
631 if (Scale != 1)
632 return true;
633
  // 1*reg alone should have been folded into a plain base register.
634 if (Scale == 1 && BaseRegs.empty())
635 return false;
636
637 if (containsAddRecDependentOnLoop(ScaledReg, L))
638 return true;
639
640 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
641 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
642 // loop, we want to swap the reg in BaseRegs with ScaledReg.
  // NOTE(review): the lambda body (upstream line 644) was dropped by
  // extraction — it tests containsAddRecDependentOnLoop(S, L) for each
  // base register. Confirm against upstream.
643 return none_of(BaseRegs, [&L](const SCEV *S) {
645 });
646}
647
648/// Helper method to morph a formula into its canonical representation.
649/// \see Formula::BaseRegs.
650/// Every formula having more than one base register, must use the ScaledReg
651/// field. Otherwise, we would have to do special cases everywhere in LSR
652/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
653/// On the other hand, 1*reg should be canonicalized into reg.
654void Formula::canonicalize(const Loop &L) {
655 if (isCanonical(L))
656 return;
657
658 if (BaseRegs.empty()) {
659 // No base reg? Use scale reg with scale = 1 as such.
660 assert(ScaledReg && "Expected 1*reg => reg");
661 assert(Scale == 1 && "Expected 1*reg => reg");
662 BaseRegs.push_back(ScaledReg);
663 Scale = 0;
664 ScaledReg = nullptr;
665 return;
666 }
667
668 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
669 if (!ScaledReg) {
670 ScaledReg = BaseRegs.pop_back_val();
671 Scale = 1;
672 }
673
674 // If ScaledReg is an invariant with respect to L, find the reg from
675 // BaseRegs containing the recurrent expr related with Loop L. Swap the
676 // reg with ScaledReg.
677 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
  // NOTE(review): the lambda body (upstream line 679) was dropped by
  // extraction — it tests containsAddRecDependentOnLoop(S, L) for each
  // base register. Confirm against upstream.
678 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
680 });
681 if (I != BaseRegs.end())
682 std::swap(ScaledReg, *I);
683 }
684 assert(isCanonical(L) && "Failed to canonicalize?");
685}
686
687/// Get rid of the scale in the formula.
688/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
689/// \return true if it was possible to get rid of the scale, false otherwise.
690/// \note After this operation the formula may not be in the canonical form.
691bool Formula::unscale() {
692 if (Scale != 1)
693 return false;
694 Scale = 0;
695 BaseRegs.push_back(ScaledReg);
696 ScaledReg = nullptr;
697 return true;
698}
699
700bool Formula::hasZeroEnd() const {
701 if (UnfoldedOffset || BaseOffset)
702 return false;
703 if (BaseRegs.size() != 1 || ScaledReg)
704 return false;
705 return true;
706}
707
708bool Formula::countsDownToZero() const {
709 if (!hasZeroEnd())
710 return false;
711 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
712 const APInt *StepInt;
713 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
714 return false;
715 return StepInt->isNegative();
716}
717
718/// Return the total number of register operands used by this formula. This does
719/// not include register uses implied by non-constant addrec strides.
720size_t Formula::getNumRegs() const {
721 return !!ScaledReg + BaseRegs.size();
722}
723
724/// Return the type of this formula, if it has one, or null otherwise. This type
725/// is meaningless except for the bit size.
726Type *Formula::getType() const {
727 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
728 ScaledReg ? ScaledReg->getType() :
729 BaseGV ? BaseGV->getType() :
730 nullptr;
731}
732
733/// Delete the given base reg from the BaseRegs list.
734void Formula::deleteBaseReg(const SCEV *&S) {
735 if (&S != &BaseRegs.back())
736 std::swap(S, BaseRegs.back());
737 BaseRegs.pop_back();
738}
739
740/// Test if this formula references the given register.
741bool Formula::referencesReg(const SCEV *S) const {
742 return S == ScaledReg || is_contained(BaseRegs, S);
743}
744
745/// Test whether this formula uses registers which are used by uses other than
746/// the use with the given index.
747bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
748 const RegUseTracker &RegUses) const {
749 if (ScaledReg)
750 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
751 return true;
752 for (const SCEV *BaseReg : BaseRegs)
753 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
754 return true;
755 return false;
756}
757
758#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
759void Formula::print(raw_ostream &OS) const {
760 bool First = true;
761 if (BaseGV) {
762 if (!First) OS << " + "; else First = false;
763 BaseGV->printAsOperand(OS, /*PrintType=*/false);
764 }
765 if (BaseOffset.isNonZero()) {
766 if (!First) OS << " + "; else First = false;
767 OS << BaseOffset;
768 }
769 for (const SCEV *BaseReg : BaseRegs) {
770 if (!First) OS << " + "; else First = false;
771 OS << "reg(" << *BaseReg << ')';
772 }
773 if (HasBaseReg && BaseRegs.empty()) {
774 if (!First) OS << " + "; else First = false;
775 OS << "**error: HasBaseReg**";
776 } else if (!HasBaseReg && !BaseRegs.empty()) {
777 if (!First) OS << " + "; else First = false;
778 OS << "**error: !HasBaseReg**";
779 }
780 if (Scale != 0) {
781 if (!First) OS << " + "; else First = false;
782 OS << Scale << "*reg(";
783 if (ScaledReg)
784 OS << *ScaledReg;
785 else
786 OS << "<unknown>";
787 OS << ')';
788 }
789 if (UnfoldedOffset.isNonZero()) {
790 if (!First) OS << " + ";
791 OS << "imm(" << UnfoldedOffset << ')';
792 }
793}
794
795LLVM_DUMP_METHOD void Formula::dump() const {
796 print(errs()); errs() << '\n';
797}
798#endif
799
800/// Return true if the given addrec can be sign-extended without changing its
801/// value.
// NOTE(review): the function signature (upstream line 802, taking the
// addrec AR and ScalarEvolution &SE) and part of the WideTy initializer
// (upstream line 804, an IntegerType::get one bit wider than AR's type)
// were dropped by extraction. Confirm against upstream.
803 Type *WideTy =
  // If sign-extending AR still yields an addrec, the extension folded, so
  // the extension is value-preserving.
805 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
806}
807
808/// Return true if the given add can be sign-extended without changing its
809/// value.
810static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
811 Type *WideTy =
812 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
813 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
814}
815
816/// Return true if the given mul can be sign-extended without changing its
817/// value.
818static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
  // NOTE(review): part of the WideTy initializer (upstream line 820, an
  // IntegerType::get call) was dropped by extraction.
819 Type *WideTy =
  // Width scales with the operand count so the product cannot overflow.
821 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
  // If the extension folds back into a mul, it is value-preserving.
822 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
823}
824
825/// Return an expression for LHS /s RHS, if it can be determined and if the
826/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
827/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
828/// the multiplication may overflow, which is useful when the result will be
829/// used in a context where the most significant bits are ignored.
// NOTE(review): several dyn_cast/declaration lines in this function
// (upstream lines 838, 854, 865, 882, 884, 896, 905, 913 — introducing RC,
// C, AR, Add, Mul, LOps and the Ops vectors) were dropped by extraction.
// Confirm against upstream before compiling.
830static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
831 ScalarEvolution &SE,
832 bool IgnoreSignificantBits = false) {
833 // Handle the trivial case, which works for any SCEV type.
834 if (LHS == RHS)
835 return SE.getConstant(LHS->getType(), 1);
836
837 // Handle a few RHS special cases.
839 if (RC) {
840 const APInt &RA = RC->getAPInt();
841 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
842 // some folding.
843 if (RA.isAllOnes()) {
  // Pointer values cannot be negated.
844 if (LHS->getType()->isPointerTy())
845 return nullptr;
846 return SE.getMulExpr(LHS, RC);
847 }
848 // Handle x /s 1 as x.
849 if (RA == 1)
850 return LHS;
851 }
852
853 // Check for a division of a constant by a constant.
855 if (!RC)
856 return nullptr;
857 const APInt &LA = C->getAPInt();
858 const APInt &RA = RC->getAPInt();
  // Only exact divisions (remainder zero) are representable.
859 if (LA.srem(RA) != 0)
860 return nullptr;
861 return SE.getConstant(LA.sdiv(RA));
862 }
863
864 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
866 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
867 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
868 IgnoreSignificantBits);
869 if (!Step) return nullptr;
870 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
871 IgnoreSignificantBits);
872 if (!Start) return nullptr;
873 // FlagNW is independent of the start value, step direction, and is
874 // preserved with smaller magnitude steps.
875 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
876 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
877 }
878 return nullptr;
879 }
880
881 // Distribute the sdiv over add operands, if the add doesn't overflow.
883 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
  // Every operand must divide exactly for the whole add to divide.
885 for (const SCEV *S : Add->operands()) {
886 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
887 if (!Op) return nullptr;
888 Ops.push_back(Op);
889 }
890 return SE.getAddExpr(Ops);
891 }
892 return nullptr;
893 }
894
895 // Check for a multiply operand that we can pull RHS out of.
897 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
898 // Handle special case C1*X*Y /s C2*X*Y.
899 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
900 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
901 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
902 const SCEVConstant *RC =
903 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
904 if (LC && RC) {
  // If the non-constant factors match exactly, divide the constants.
906 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
907 if (LOps == ROps)
908 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
909 }
910 }
911 }
912
  // Otherwise try to divide exactly one factor of the product by RHS.
914 bool Found = false;
915 for (const SCEV *S : Mul->operands()) {
916 if (!Found)
917 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
918 IgnoreSignificantBits)) {
919 S = Q;
920 Found = true;
921 }
922 Ops.push_back(S);
923 }
924 return Found ? SE.getMulExpr(Ops) : nullptr;
925 }
926 return nullptr;
927 }
928
929 // Otherwise we don't know.
930 return nullptr;
931}
932
933/// If S involves the addition of a constant integer value, return that integer
934/// value, and mutate S to point to a new SCEV with that value excluded.
935static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
936 const APInt *C;
937 if (match(S, m_scev_APInt(C))) {
  // Only constants that fit in 64 bits can become immediates.
938 if (C->getSignificantBits() <= 64) {
939 S = SE.getConstant(S->getType(), 0);
940 return Immediate::getFixed(C->getSExtValue());
941 }
942 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
  // SCEV canonicalizes constants to the front of an add, so only the
  // first operand needs to be examined.
943 SmallVector<const SCEV *, 8> NewOps(Add->operands());
944 Immediate Result = ExtractImmediate(NewOps.front(), SE);
945 if (Result.isNonZero())
946 S = SE.getAddExpr(NewOps);
947 return Result;
948 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
  // Pull the immediate out of the addrec's start value.
949 SmallVector<const SCEV *, 8> NewOps(AR->operands());
950 Immediate Result = ExtractImmediate(NewOps.front(), SE);
951 if (Result.isNonZero())
952 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
953 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
  // NOTE(review): the closing argument of getAddRecExpr (upstream line
  // 954, the wrap-flags argument) was dropped by extraction.
955 return Result;
  // NOTE(review): the condition matching a vscale-scaled constant
  // (upstream line 957) was dropped by extraction.
956 } else if (EnableVScaleImmediates &&
958 S = SE.getConstant(S->getType(), 0);
959 return Immediate::getScalable(C->getSExtValue());
960 }
  // No extractable immediate.
961 return Immediate::getZero();
962}
963
964/// If S involves the addition of a GlobalValue address, return that symbol, and
965/// mutate S to point to a new SCEV with that value excluded.
// NOTE(review): the function signature (upstream line 966, returning
// GlobalValue * and taking "const SCEV *&S, ScalarEvolution &SE") was
// dropped by extraction. Confirm against upstream.
967 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
968 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
969 S = SE.getConstant(GV->getType(), 0);
970 return GV;
971 }
972 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
  // SCEV canonicalizes SCEVUnknowns (like globals) to the back of an add,
  // so only the last operand needs to be examined.
973 SmallVector<const SCEV *, 8> NewOps(Add->operands());
974 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
975 if (Result)
976 S = SE.getAddExpr(NewOps);
977 return Result;
978 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
  // Pull the symbol out of the addrec's start value.
979 SmallVector<const SCEV *, 8> NewOps(AR->operands());
980 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
981 if (Result)
982 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
983 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
  // NOTE(review): the closing argument of getAddRecExpr (upstream line
  // 984, the wrap-flags argument) was dropped by extraction.
985 return Result;
986 }
  // No extractable symbol.
987 return nullptr;
988}
989
/// Returns true if the specified instruction is using the specified value as an
/// address.
///
/// Only pointer operands count: e.g. the value being stored by a store does
/// not make the store an address use of that value.
                         Instruction *Inst, Value *OperandVal) {
  // Loads always use their (sole) pointer operand as an address.
  bool isAddress = isa<LoadInst>(Inst);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    // Only the pointer operand of a store is an address use.
    if (SI->getPointerOperand() == OperandVal)
      isAddress = true;
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    case Intrinsic::memset:
    case Intrinsic::prefetch:
    case Intrinsic::masked_load:
      // These take their pointer as argument 0.
      if (II->getArgOperand(0) == OperandVal)
        isAddress = true;
      break;
    case Intrinsic::masked_store:
      // masked.store's pointer is argument 1 (argument 0 is the value).
      if (II->getArgOperand(1) == OperandVal)
        isAddress = true;
      break;
    case Intrinsic::memmove:
    case Intrinsic::memcpy:
      // Both the destination (0) and source (1) pointers are address uses.
      if (II->getArgOperand(0) == OperandVal ||
          II->getArgOperand(1) == OperandVal)
        isAddress = true;
      break;
    default: {
      // For target-specific memory intrinsics, ask TTI which operand (if any)
      // is the pointer.
      MemIntrinsicInfo IntrInfo;
      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
        if (IntrInfo.PtrVal == OperandVal)
          isAddress = true;
      }
    }
    }
  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
    // Atomic read-modify-write: the pointer operand is an address use.
    if (RMW->getPointerOperand() == OperandVal)
      isAddress = true;
  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    // Compare-exchange: likewise only the pointer operand counts.
    if (CmpX->getPointerOperand() == OperandVal)
      isAddress = true;
  }
  return isAddress;
}
1035
1036/// Return the type of the memory being accessed.
1037static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1038 Instruction *Inst, Value *OperandVal) {
1039 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1040
1041 // First get the type of memory being accessed.
1042 if (Type *Ty = Inst->getAccessType())
1043 AccessTy.MemTy = Ty;
1044
1045 // Then get the pointer address space.
1046 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1047 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1048 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1049 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1050 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1051 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1052 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1053 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1054 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1055 switch (II->getIntrinsicID()) {
1056 case Intrinsic::prefetch:
1057 case Intrinsic::memset:
1058 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1059 AccessTy.MemTy = OperandVal->getType();
1060 break;
1061 case Intrinsic::memmove:
1062 case Intrinsic::memcpy:
1063 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1064 AccessTy.MemTy = OperandVal->getType();
1065 break;
1066 case Intrinsic::masked_load:
1067 AccessTy.AddrSpace =
1068 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1069 break;
1070 case Intrinsic::masked_store:
1071 AccessTy.AddrSpace =
1072 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1073 break;
1074 default: {
1075 MemIntrinsicInfo IntrInfo;
1076 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1077 AccessTy.AddrSpace
1078 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1079 }
1080
1081 break;
1082 }
1083 }
1084 }
1085
1086 return AccessTy;
1087}
1088
1089/// Return true if this AddRec is already a phi in its loop.
1090static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1091 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1092 if (SE.isSCEVable(PN.getType()) &&
1093 (SE.getEffectiveSCEVType(PN.getType()) ==
1094 SE.getEffectiveSCEVType(AR->getType())) &&
1095 SE.getSCEV(&PN) == AR)
1096 return true;
1097 }
1098 return false;
1099}
1100
/// Check if expanding this expression is likely to incur significant cost. This
/// is tricky because SCEV doesn't track which expressions are actually computed
/// by the current IR.
///
/// We currently allow expansion of IV increments that involve adds,
/// multiplication by constants, and AddRecs from existing phis.
///
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
/// obvious multiple of the UDivExpr.
static bool isHighCostExpansion(const SCEV *S,
                                ScalarEvolution &SE) {
  // Zero/One operand expressions
  switch (S->getSCEVType()) {
  case scUnknown:
  case scConstant:
  case scVScale:
    // Leaf expressions are always cheap.
    return false;
  case scTruncate:
    // A cast is as cheap as its operand.
    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
                               Processed, SE);
  case scZeroExtend:
    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
                               Processed, SE);
  case scSignExtend:
    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
                               Processed, SE);
  default:
    break;
  }

  // Analyze each shared subexpression at most once to avoid exponential
  // blow-up on DAG-shaped SCEVs.
  if (!Processed.insert(S).second)
    return false;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // An add is cheap iff all of its operands are cheap.
    for (const SCEV *S : Add->operands()) {
      if (isHighCostExpansion(S, Processed, SE))
        return true;
    }
    return false;
  }

  const SCEV *Op0, *Op1;
  if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
    // Multiplication by a constant is ok
    if (isa<SCEVConstant>(Op0))
      return isHighCostExpansion(Op1, Processed, SE);

    // If we have the value of one operand, check if an existing
    // multiplication already generates this expression.
    if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
      Value *UVal = U->getValue();
      for (User *UR : UVal->users()) {
        // If U is a constant, it may be used by a ConstantExpr.
        if (UI && UI->getOpcode() == Instruction::Mul &&
            SE.isSCEVable(UI->getType())) {
          // The expansion is free if some existing mul instruction already
          // computes exactly this SCEV.
          return SE.getSCEV(UI) == S;
        }
      }
    }
  }

  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    // An addrec that already has a phi costs nothing extra to reuse.
    if (isExistingPhi(AR, SE))
      return false;
  }

  // For now, consider any other type of expression (div/mul/min/max) high cost.
  return true;
}
1172
1173namespace {
1174
1175class LSRUse;
1176
1177} // end anonymous namespace
1178
1179/// Check if the addressing mode defined by \p F is completely
1180/// folded in \p LU at isel time.
1181/// This includes address-mode folding and special icmp tricks.
1182/// This function returns true if \p LU can accommodate what \p F
1183/// defines and up to 1 base + 1 scaled + offset.
1184/// In other words, if \p F has several base registers, this function may
1185/// still return true. Therefore, users still need to account for
1186/// additional base registers and/or unfolded offsets to derive an
1187/// accurate cost model.
1188static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1189 const LSRUse &LU, const Formula &F);
1190
1191// Get the cost of the scaling factor used in F for LU.
1192static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1193 const LSRUse &LU, const Formula &F,
1194 const Loop &L);
1195
1196namespace {
1197
/// This class is used to measure and compare candidate formulae.
class Cost {
  const Loop *L = nullptr;                  // Loop the formula is rated for.
  ScalarEvolution *SE = nullptr;            // Used by register rating.
  const TargetTransformInfo *TTI = nullptr; // Target cost queries.
  TargetTransformInfo::LSRCost C;           // The accumulated metrics.

public:
  Cost() = delete;
  // Constructor: all metrics start at zero.
  Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
      L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
    C.Insns = 0;
    C.NumRegs = 0;
    C.AddRecCost = 0;
    C.NumIVMuls = 0;
    C.NumBaseAdds = 0;
    C.ImmCost = 0;
    C.SetupCost = 0;
    C.ScaleCost = 0;
  }

  /// Return true if this cost is strictly better than \p Other.
  bool isLess(const Cost &Other) const;

  /// Saturate every metric to its losing (maximum) value.
  void Lose();

#ifndef NDEBUG
  // Once any of the metrics loses, they must all remain losers.
  // Either no metric is all-ones, or every metric is all-ones.
  bool isValid() {
    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
        || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
             & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
  }
#endif

  // NumRegs is used as the sentinel metric for a lost cost.
  bool isLoser() {
    assert(isValid() && "invalid cost");
    return C.NumRegs == ~0u;
  }

  /// Accumulate the cost of using formula \p F for use \p LU into this Cost.
  void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                   const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
                   bool HardwareLoopProfitable,
                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);

  void print(raw_ostream &OS) const;
  void dump() const;

private:
  // Tally the quantities contributed by a single register expression.
  void RateRegister(const Formula &F, const SCEV *Reg,
                    SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
                    bool HardwareLoopProfitable);
  // Record a register and rate it if not yet seen; may mark it a loser.
  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                           SmallPtrSetImpl<const SCEV *> &Regs,
                           const LSRUse &LU, bool HardwareLoopProfitable,
                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
};
1257
/// An operand value in an instruction which is to be replaced with some
/// equivalent, possibly strength-reduced, replacement.
struct LSRFixup {
  /// The instruction which will be updated.
  Instruction *UserInst = nullptr;

  /// The operand of the instruction which will be replaced. The operand may be
  /// used more than once; every instance will be replaced.
  Value *OperandValToReplace = nullptr;

  /// If this user is to use the post-incremented value of an induction
  /// variable, this set is non-empty and holds the loops associated with the
  /// induction variable.
  PostIncLoopSet PostIncLoops;

  /// A constant offset to be added to the LSRUse expression. This allows
  /// multiple fixups to share the same LSRUse with different offsets, for
  /// example in an unrolled loop.
  Immediate Offset = Immediate::getZero();

  LSRFixup() = default;

  /// Return true if this fixup's use of the value occurs entirely outside of
  /// the given loop.
  bool isUseFullyOutsideLoop(const Loop *L) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};
1285
/// This class holds the state that LSR keeps for each use in IVUsers, as well
/// as uses invented by LSR itself. It includes information about what kinds of
/// things can be folded into the user, information about the user itself, and
/// information about how the use may be satisfied. TODO: Represent multiple
/// users of the same expression in common?
class LSRUse {
  // Set of register-key vectors used to reject duplicate formulae.
  DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;

public:
  /// An enum for a kind of use, indicating what types of scaled and immediate
  /// operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;

  KindType Kind;
  MemAccessTy AccessTy;

  /// The list of operands which are to be replaced.

  /// Keep track of the min and max offsets of the fixups.
  Immediate MinOffset = Immediate::getFixedMax();
  Immediate MaxOffset = Immediate::getFixedMin();

  /// This records whether all of the fixups using this LSRUse are outside of
  /// the loop, in which case some special-case heuristics may be used.
  bool AllFixupsOutsideLoop = true;

  /// This records whether all of the fixups using this LSRUse are unconditional
  /// within the loop, meaning they will be executed on every path to the loop
  /// latch. This includes fixups before early exits.
  bool AllFixupsUnconditional = true;

  /// RigidFormula is set to true to guarantee that this use will be associated
  /// with a single formula--the one that initially matched. Some SCEV
  /// expressions cannot be expanded. This allows LSR to consider the registers
  /// used by those expressions without the need to expand them later after
  /// changing the formula.
  bool RigidFormula = false;

  /// This records the widest use type for any fixup using this
  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
  /// fixup widths to be equivalent, because the narrower one may be relying on
  /// the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType = nullptr;

  /// A list of ways to build a value that can satisfy this user. After the
  /// list is populated, one of these is selected heuristically and used to
  /// formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}

  /// Append a default-constructed fixup and return a reference to it.
  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  /// Append \p f, updating the tracked min/max fixup offsets.
  void pushFixup(LSRFixup &f) {
    Fixups.push_back(f);
    if (Immediate::isKnownGT(f.Offset, MaxOffset))
      MaxOffset = f.Offset;
    if (Immediate::isKnownLT(f.Offset, MinOffset))
      MinOffset = f.Offset;
  }

  /// Return true if a formula with the same register set was already inserted.
  bool HasFormulaWithSameRegs(const Formula &F) const;
  /// Fraction of this use's formulae that do not reference \p Reg.
  float getNotSelectedProbability(const SCEV *Reg) const;
  /// Insert \p F if it is not a duplicate; return true if inserted.
  bool InsertFormula(const Formula &F, const Loop &L);
  /// Remove \p F from the formula list (order is not preserved).
  void DeleteFormula(Formula &F);
  /// Rebuild Regs from the remaining formulae and update \p Reguses.
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  void print(raw_ostream &OS) const;
  void dump() const;
};
1371
1372} // end anonymous namespace
1373
1374static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1375 LSRUse::KindType Kind, MemAccessTy AccessTy,
1376 GlobalValue *BaseGV, Immediate BaseOffset,
1377 bool HasBaseReg, int64_t Scale,
1378 Instruction *Fixup = nullptr);
1379
/// Rough estimate of how many setup instructions \p Reg needs in the
/// preheader, recursing through subexpressions up to \p Depth levels.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
    return 1;
  // Stop charging once the recursion budget is exhausted.
  if (Depth == 0)
    return 0;
  // An addrec's setup cost is that of materializing its start value.
  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
    return getSetupCost(S->getStart(), Depth - 1);
  // Casts are free here; charge only for the operand.
  if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
    return getSetupCost(S->getOperand(), Depth - 1);
  // N-ary expressions: sum the cost of all operands.
  if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
    return std::accumulate(S->operands().begin(), S->operands().end(), 0,
                           [&](unsigned i, const SCEV *Reg) {
                             return i + getSetupCost(Reg, Depth - 1);
                           });
  // Division: both sides contribute.
  if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
    return getSetupCost(S->getLHS(), Depth - 1) +
           getSetupCost(S->getRHS(), Depth - 1);
  return 0;
}
1399
/// Tally up interesting quantities from the given register.
///
/// Updates C.NumRegs, C.AddRecCost, C.SetupCost and C.NumIVMuls; calls Lose()
/// when the register would require adding IVs for a sibling loop.
void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                        SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
                        bool HardwareLoopProfitable) {
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
    // If this is an addrec for another loop, it should be an invariant
    // with respect to L since L is the innermost loop (at least
    // for now LSR only handles innermost loops).
    if (AR->getLoop() != L) {
      // If the AddRec exists, consider its register free and leave it alone.
      if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
        return;

      // It is bad to allow LSR for current loop to add induction variables
      // for its sibling loops.
      if (!AR->getLoop()->contains(L)) {
        Lose();
        return;
      }

      // Otherwise, it will be an invariant with respect to Loop L.
      ++C.NumRegs;
      return;
    }

    // Default cost of one per-iteration update for the addrec.
    unsigned LoopCost = 1;
    if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
        TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
      const SCEV *Start;
      const APInt *Step;
      if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
        // If the step size matches the base offset, we could use pre-indexed
        // addressing.
        bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
                           F.BaseOffset.isFixed() &&
                           *Step == F.BaseOffset.getFixedValue();
        bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
                            !isa<SCEVConstant>(Start) &&
                            SE->isLoopInvariant(Start, L);
        // We can only pre or post index when the load/store is unconditional.
        if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
          LoopCost = 0;
      }
    }

    // If the loop counts down to zero and we'll be using a hardware loop then
    // the addrec will be combined into the hardware loop instruction.
    if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
        HardwareLoopProfitable)
      LoopCost = 0;
    C.AddRecCost += LoopCost;

    // Add the step value register, if it needs one.
    // TODO: The non-affine case isn't precisely modeled here.
    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
      if (!Regs.count(AR->getOperand(1))) {
        RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
        if (isLoser())
          return;
      }
    }
  }
  ++C.NumRegs;

  // Rough heuristic; favor registers which don't require extra setup
  // instructions in the preheader.
  C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
  // Ensure we don't, even with the recursion limit, produce invalid costs.
  C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);

  C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
}
1473
1474/// Record this register in the set. If we haven't seen it before, rate
1475/// it. Optional LoserRegs provides a way to declare any formula that refers to
1476/// one of those regs an instant loser.
1477void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1478 SmallPtrSetImpl<const SCEV *> &Regs,
1479 const LSRUse &LU, bool HardwareLoopProfitable,
1480 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1481 if (LoserRegs && LoserRegs->count(Reg)) {
1482 Lose();
1483 return;
1484 }
1485 if (Regs.insert(Reg).second) {
1486 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1487 if (LoserRegs && isLoser())
1488 LoserRegs->insert(Reg);
1489 }
1490}
1491
/// Accumulate the cost of using formula \p F for use \p LU into this Cost.
/// The formula must be canonical. Registers already present in \p VisitedRegs
/// make the formula an immediate loser.
void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
                       const DenseSet<const SCEV *> &VisitedRegs,
                       const LSRUse &LU, bool HardwareLoopProfitable,
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  if (isLoser())
    return;
  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
  // Tally up the registers.
  // Snapshot metrics so instruction deltas can be computed at the end.
  unsigned PrevAddRecCost = C.AddRecCost;
  unsigned PrevNumRegs = C.NumRegs;
  unsigned PrevNumBaseAdds = C.NumBaseAdds;
  if (const SCEV *ScaledReg = F.ScaledReg) {
    if (VisitedRegs.count(ScaledReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }
  for (const SCEV *BaseReg : F.BaseRegs) {
    if (VisitedRegs.count(BaseReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
                        LoserRegs);
    if (isLoser())
      return;
  }

  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.getNumRegs();
  if (NumBaseParts > 1)
    // Do not count the base and a possible second register if the target
    // allows to fold 2 registers.
    C.NumBaseAdds +=
        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
  C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());

  // Accumulate non-free scaling amounts.
  C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();

  // Tally up the non-zero immediates.
  for (const LSRFixup &Fixup : LU.Fixups) {
    if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
      Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
      if (F.BaseGV)
        C.ImmCost += 64; // Handle symbolic values conservatively.
                         // TODO: This should probably be the pointer size.
      else if (Offset.isNonZero())
        C.ImmCost +=
            APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();

      // Check with target if this offset with this instruction is
      // specifically not supported.
      if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
          !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                                Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
        C.NumBaseAdds++;
    } else {
      // Incompatible immediate type, increase cost to avoid using
      C.ImmCost += 2048;
    }
  }

  // If we don't count instruction cost exit here.
  if (!InsnsCost) {
    assert(isValid() && "invalid cost");
    return;
  }

  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
  // additional instruction (at least fill).
  // TODO: Need distinguish register class?
  unsigned TTIRegNum = TTI->getNumberOfRegisters(
      TTI->getRegisterClassForType(false, F.getType())) - 1;
  if (C.NumRegs > TTIRegNum) {
    // Cost already exceeded TTIRegNum, then only newly added register can add
    // new instructions.
    if (PrevNumRegs > TTIRegNum)
      C.Insns += (C.NumRegs - PrevNumRegs);
    else
      C.Insns += (C.NumRegs - TTIRegNum);
  }

  // If ICmpZero formula ends with not 0, it could not be replaced by
  // just add or sub. We'll need to compare final result of AddRec.
  // That means we'll need an additional instruction. But if the target can
  // macro-fuse a compare with a branch, don't count this extra instruction.
  // For -10 + {0, +, 1}:
  // i = i + 1;
  // cmp i, 10
  //
  // For {-10, +, 1}:
  // i = i + 1;
  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
      !TTI->canMacroFuseCmp())
    C.Insns++;
  // Each new AddRec adds 1 instruction to calculation.
  C.Insns += (C.AddRecCost - PrevAddRecCost);

  // BaseAdds adds instructions for unfolded registers.
  if (LU.Kind != LSRUse::ICmpZero)
    C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
  assert(isValid() && "invalid cost");
}
1600
1601/// Set this cost to a losing value.
1602void Cost::Lose() {
1603 C.Insns = std::numeric_limits<unsigned>::max();
1604 C.NumRegs = std::numeric_limits<unsigned>::max();
1605 C.AddRecCost = std::numeric_limits<unsigned>::max();
1606 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1607 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1608 C.ImmCost = std::numeric_limits<unsigned>::max();
1609 C.SetupCost = std::numeric_limits<unsigned>::max();
1610 C.ScaleCost = std::numeric_limits<unsigned>::max();
1611}
1612
1613/// Choose the lower cost.
1614bool Cost::isLess(const Cost &Other) const {
1615 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1616 C.Insns != Other.C.Insns)
1617 return C.Insns < Other.C.Insns;
1618 return TTI->isLSRCostLess(C, Other.C);
1619}
1620
1621#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1622void Cost::print(raw_ostream &OS) const {
1623 if (InsnsCost)
1624 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1625 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1626 if (C.AddRecCost != 0)
1627 OS << ", with addrec cost " << C.AddRecCost;
1628 if (C.NumIVMuls != 0)
1629 OS << ", plus " << C.NumIVMuls << " IV mul"
1630 << (C.NumIVMuls == 1 ? "" : "s");
1631 if (C.NumBaseAdds != 0)
1632 OS << ", plus " << C.NumBaseAdds << " base add"
1633 << (C.NumBaseAdds == 1 ? "" : "s");
1634 if (C.ScaleCost != 0)
1635 OS << ", plus " << C.ScaleCost << " scale cost";
1636 if (C.ImmCost != 0)
1637 OS << ", plus " << C.ImmCost << " imm cost";
1638 if (C.SetupCost != 0)
1639 OS << ", plus " << C.SetupCost << " setup cost";
1640}
1641
1642LLVM_DUMP_METHOD void Cost::dump() const {
1643 print(errs()); errs() << '\n';
1644}
1645#endif
1646
1647/// Test whether this fixup always uses its value outside of the given loop.
1648bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1649 // PHI nodes use their value in their incoming blocks.
1650 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1651 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1652 if (PN->getIncomingValue(i) == OperandValToReplace &&
1653 L->contains(PN->getIncomingBlock(i)))
1654 return false;
1655 return true;
1656 }
1657
1658 return !L->contains(UserInst);
1659}
1660
1661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1662void LSRFixup::print(raw_ostream &OS) const {
1663 OS << "UserInst=";
1664 // Store is common and interesting enough to be worth special-casing.
1665 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1666 OS << "store ";
1667 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1668 } else if (UserInst->getType()->isVoidTy())
1669 OS << UserInst->getOpcodeName();
1670 else
1671 UserInst->printAsOperand(OS, /*PrintType=*/false);
1672
1673 OS << ", OperandValToReplace=";
1674 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1675
1676 for (const Loop *PIL : PostIncLoops) {
1677 OS << ", PostIncLoop=";
1678 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1679 }
1680
1681 if (Offset.isNonZero())
1682 OS << ", Offset=" << Offset;
1683}
1684
1685LLVM_DUMP_METHOD void LSRFixup::dump() const {
1686 print(errs()); errs() << '\n';
1687}
1688#endif
1689
/// Test whether this use has a formula which has the same registers as the
/// given formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
  // Key is the formula's register set (base registers from the declaration
  // above, plus ScaledReg when present).
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  llvm::sort(Key);
  return Uniquifier.count(Key);
}
1699
1700/// The function returns a probability of selecting formula without Reg.
1701float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1702 unsigned FNum = 0;
1703 for (const Formula &F : Formulae)
1704 if (F.referencesReg(Reg))
1705 FNum++;
1706 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1707}
1708
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
  assert(F.isCanonical(L) && "Invalid canonical representation");

  // A rigid use keeps only its original formula.
  if (!Formulae.empty() && RigidFormula)
    return false;

  // Key is the formula's register set, used to detect duplicates.
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  llvm::sort(Key);

  // Reject a formula whose register set was already seen.
  if (!Uniquifier.insert(Key).second)
    return false;

  // Using a register to hold the value of 0 is not profitable.
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
         "Zero allocated in a scaled register!");
#ifndef NDEBUG
  for (const SCEV *BaseReg : F.BaseRegs)
    assert(!BaseReg->isZero() && "Zero allocated in a base register!");
#endif

  // Add the formula to the list.
  Formulae.push_back(F);

  // Record registers now being used by this use.
  Regs.insert_range(F.BaseRegs);
  if (F.ScaledReg)
    Regs.insert(F.ScaledReg);

  return true;
}
1743
1744/// Remove the given formula from this use's list.
1745void LSRUse::DeleteFormula(Formula &F) {
1746 if (&F != &Formulae.back())
1747 std::swap(F, Formulae.back());
1748 Formulae.pop_back();
1749}
1750
1751/// Recompute the Regs field, and update RegUses.
1752void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1753 // Now that we've filtered out some formulae, recompute the Regs set.
1754 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1755 Regs.clear();
1756 for (const Formula &F : Formulae) {
1757 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1758 Regs.insert_range(F.BaseRegs);
1759 }
1760
1761 // Update the RegTracker.
1762 for (const SCEV *S : OldRegs)
1763 if (!Regs.count(S))
1764 RegUses.dropRegister(S, LUIdx);
1765}
1766
1767#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1768void LSRUse::print(raw_ostream &OS) const {
1769 OS << "LSR Use: Kind=";
1770 switch (Kind) {
1771 case Basic: OS << "Basic"; break;
1772 case Special: OS << "Special"; break;
1773 case ICmpZero: OS << "ICmpZero"; break;
1774 case Address:
1775 OS << "Address of ";
1776 if (AccessTy.MemTy->isPointerTy())
1777 OS << "pointer"; // the full pointer type could be really verbose
1778 else {
1779 OS << *AccessTy.MemTy;
1780 }
1781
1782 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1783 }
1784
1785 OS << ", Offsets={";
1786 bool NeedComma = false;
1787 for (const LSRFixup &Fixup : Fixups) {
1788 if (NeedComma) OS << ',';
1789 OS << Fixup.Offset;
1790 NeedComma = true;
1791 }
1792 OS << '}';
1793
1794 if (AllFixupsOutsideLoop)
1795 OS << ", all-fixups-outside-loop";
1796
1797 if (AllFixupsUnconditional)
1798 OS << ", all-fixups-unconditional";
1799
1800 if (WidestFixupType)
1801 OS << ", widest fixup type: " << *WidestFixupType;
1802}
1803
1804LLVM_DUMP_METHOD void LSRUse::dump() const {
1805 print(errs()); errs() << '\n';
1806}
1807#endif
1808
                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                 bool HasBaseReg, int64_t Scale,
                                 Instruction *Fixup /* = nullptr */) {
  switch (Kind) {
  case LSRUse::Address: {
    // Split the offset into its fixed and scalable (vscale-relative) parts
    // before asking the target about the addressing mode.
    int64_t FixedOffset =
        BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
    int64_t ScalableOffset =
        BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
                                     HasBaseReg, Scale, AccessTy.AddrSpace,
                                     Fixup, ScalableOffset);
  }
  case LSRUse::ICmpZero:
    // There's not even a target hook for querying whether it would be legal to
    // fold a GV into an ICmp.
    if (BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (Scale != 0 && Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (BaseOffset.isNonZero()) {
      // We don't have an interface to query whether the target supports
      // icmpzero against scalable quantities yet.
      if (BaseOffset.isScalable())
        return false;

      // We have one of:
      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
      // Offs is the ICmp immediate.
      if (Scale == 0)
        // The cast does the right thing with
        // std::numeric_limits<int64_t>::min().
        BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
      return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
    }

    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !BaseGV && Scale == 0 && BaseOffset.isZero();

  case LSRUse::Special:
    // Special case Basic to handle -1 scales.
    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}
1872
// Overload of isAMCompletelyFolded that checks legality of the addressing
// mode across the whole [MinOffset, MaxOffset] range of offsets that the use
// may be accessed with, not just a single BaseOffset.
// NOTE(review): the opening line of this signature (original line 1873,
// presumably "static bool isAMCompletelyFolded(const TargetTransformInfo
// &TTI," — confirm against upstream) was dropped by the HTML-to-text
// extraction of this listing.
1874 Immediate MinOffset, Immediate MaxOffset,
1875 LSRUse::KindType Kind, MemAccessTy AccessTy,
1876 GlobalValue *BaseGV, Immediate BaseOffset,
1877 bool HasBaseReg, int64_t Scale) {
// A nonzero BaseOffset must agree in scalable-ness with both range endpoints;
// mixing scalable and fixed immediates is not representable here.
1878 if (BaseOffset.isNonZero() &&
1879 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1880 BaseOffset.isScalable() != MaxOffset.isScalable()))
1881 return false;
1882 // Check for overflow.
1883 int64_t Base = BaseOffset.getKnownMinValue();
1884 int64_t Min = MinOffset.getKnownMinValue();
1885 int64_t Max = MaxOffset.getKnownMinValue();
// The unsigned addition followed by a signed comparison detects signed
// overflow of Base + Min without invoking undefined behavior.
1886 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1887 return false;
1888 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1889 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1890 return false;
1891 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1892
// Legal only if both extreme offsets fold into the addressing mode.
1893 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1894 HasBaseReg, Scale) &&
1895 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1896 HasBaseReg, Scale);
1897 }
1898
// Overload of isAMCompletelyFolded taking a Formula: forwards the formula's
// components (BaseGV, BaseOffset, HasBaseReg, Scale) to the range-checked
// overload above.
// NOTE(review): the opening line of this signature (original line 1899) was
// dropped by the HTML-to-text extraction of this listing.
1900 Immediate MinOffset, Immediate MaxOffset,
1901 LSRUse::KindType Kind, MemAccessTy AccessTy,
1902 const Formula &F, const Loop &L) {
1903 // For the purpose of isAMCompletelyFolded either having a canonical formula
1904 // or a scale not equal to zero is correct.
1905 // Problems may arise from non canonical formulae having a scale == 0.
1906 // Strictly speaking it would best to just rely on canonical formulae.
1907 // However, when we generate the scaled formulae, we first check that the
1908 // scaling factor is profitable before computing the actual ScaledReg for
1909 // compile time sake.
1910 assert((F.isCanonical(L) || F.Scale != 0));
1911 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1912 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1913 }
1914
1915/// Test whether we know how to expand the current formula.
// A use is legal either when the addressing mode folds completely, or when a
// Scale == 1 formula can be rewritten as an extra base register (second call
// with HasBaseReg=true, Scale=0).
1916 static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1917 Immediate MaxOffset, LSRUse::KindType Kind,
1918 MemAccessTy AccessTy, GlobalValue *BaseGV,
1919 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1920 // We know how to expand completely foldable formulae.
1921 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1922 BaseOffset, HasBaseReg, Scale) ||
1923 // Or formulae that use a base register produced by a sum of base
1924 // registers.
1925 (Scale == 1 &&
1926 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1927 BaseGV, BaseOffset, true, 0));
1928 }
1929
// Convenience overload of isLegalUse that unpacks a Formula's components.
1930 static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1931 Immediate MaxOffset, LSRUse::KindType Kind,
1932 MemAccessTy AccessTy, const Formula &F) {
1933 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1934 F.BaseOffset, F.HasBaseReg, F.Scale);
1935 }
1936
// Query the target whether Offset can be folded as an add immediate,
// dispatching to the scalable-immediate hook for scalable offsets.
// NOTE(review): the opening line of this signature (original line 1937,
// presumably "static bool isLegalAddImmediate(const TargetTransformInfo
// &TTI," — confirm against upstream) was dropped by the extraction.
1938 Immediate Offset) {
1939 if (Offset.isScalable())
1940 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1941
1942 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1943 }
1944
// Overload of isAMCompletelyFolded for a whole LSRUse. If the target wants to
// see the concrete user instructions (LSRWithInstrQueries), every fixup of an
// Address use is checked individually with its own offset; otherwise the
// [MinOffset, MaxOffset] range overload is used.
// NOTE(review): the opening line of this signature (original line 1945) was
// dropped by the extraction of this listing.
1946 const LSRUse &LU, const Formula &F) {
1947 // Target may want to look at the user instructions.
1948 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1949 for (const LSRFixup &Fixup : LU.Fixups)
1950 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1951 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1952 F.Scale, Fixup.UserInst))
1953 return false;
1954 return true;
1955 }
1956
1957 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1958 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1959 F.Scale);
1960 }
1961
// Return the target cost of using F's scaling factor for this use: 0 when
// there is no scale or the use folds completely, 1 when it does not fold and
// Scale != 1, otherwise the target's reported scaling-factor cost for Address
// uses (checked at both offset extremes).
// NOTE(review): the opening line of this signature (original line 1962,
// presumably returning InstructionCost — confirm against upstream) was
// dropped by the extraction of this listing.
1963 const LSRUse &LU, const Formula &F,
1964 const Loop &L) {
1965 if (!F.Scale)
1966 return 0;
1967
1968 // If the use is not completely folded in that instruction, we will have to
1969 // pay an extra cost only for scale != 1.
1970 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1971 LU.AccessTy, F, L))
1972 return F.Scale != 1;
1973
1974 switch (LU.Kind) {
1975 case LSRUse::Address: {
1976 // Check the scaling factor cost with both the min and max offsets.
// Only one of the scalable/fixed pairs is populated, matching the
// scalable-ness of F.BaseOffset; the other stays 0.
1977 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1978 if (F.BaseOffset.isScalable()) {
1979 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1980 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1981 } else {
1982 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1983 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1984 }
1985 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1986 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1987 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1988 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1989 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1990 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1991
1992 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1993 "Legal addressing mode has an illegal cost!");
// Be conservative: report the worse of the two endpoint costs.
1994 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1995 }
1996 case LSRUse::ICmpZero:
1997 case LSRUse::Basic:
1998 case LSRUse::Special:
1999 // The use is completely folded, i.e., everything is folded into the
2000 // instruction.
2001 return 0;
2002 }
2003
2004 llvm_unreachable("Invalid LSRUse Kind!");
2005 }
2006
// Test whether a (BaseGV, BaseOffset) pair is foldable regardless of which
// registers the formula ends up using, by checking the most conservative
// addressing mode shape (base reg + scale + immediate).
// NOTE(review): the opening line of this signature (original line 2007,
// presumably "static bool isAlwaysFoldable(const TargetTransformInfo &TTI,")
// was dropped by the extraction of this listing.
2008 LSRUse::KindType Kind, MemAccessTy AccessTy,
2009 GlobalValue *BaseGV, Immediate BaseOffset,
2010 bool HasBaseReg) {
2011 // Fast-path: zero is always foldable.
2012 if (BaseOffset.isZero() && !BaseGV)
2013 return true;
2014
2015 // Conservatively, create an address with an immediate and a
2016 // base and a scale.
// ICmpZero uses a -1 scale (the scaled register moves to the other icmp
// operand); every other kind assumes a scale of 1.
2017 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2018
2019 // Canonicalize a scale of 1 to a base register if the formula doesn't
2020 // already have a base register.
2021 if (!HasBaseReg && Scale == 1) {
2022 Scale = 0;
2023 HasBaseReg = true;
2024 }
2025
2026 // FIXME: Try with + without a scale? Maybe based on TTI?
2027 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2028 // default for many architectures, not just AArch64 SVE. More investigation
2029 // needed later to determine if this should be used more widely than just
2030 // on scalable types.
2031 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2032 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2033 Scale = 0;
2034
2035 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2036 HasBaseReg, Scale);
2037 }
2038
// SCEV-based overload of isAlwaysFoldable: peel an immediate and a symbol off
// S and test whether the remainder is empty and the pieces fold.
// NOTE(review): the opening line of this signature (original line 2039,
// presumably "static bool isAlwaysFoldable(const TargetTransformInfo &TTI,")
// was dropped by the extraction of this listing.
2040 ScalarEvolution &SE, Immediate MinOffset,
2041 Immediate MaxOffset, LSRUse::KindType Kind,
2042 MemAccessTy AccessTy, const SCEV *S,
2043 bool HasBaseReg) {
2044 // Fast-path: zero is always foldable.
2045 if (S->isZero()) return true;
2046
2047 // Conservatively, create an address with an immediate and a
2048 // base and a scale.
// ExtractImmediate/ExtractSymbol appear to strip the extracted parts out of
// S (the follow-up isZero() test only makes sense if S was rewritten) —
// confirm against their definitions earlier in this file.
2049 Immediate BaseOffset = ExtractImmediate(S, SE);
2050 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2051
2052 // If there's anything else involved, it's not foldable.
2053 if (!S->isZero()) return false;
2054
2055 // Fast-path: zero is always foldable.
2056 if (BaseOffset.isZero() && !BaseGV)
2057 return true;
2058
// Scalable immediates are not handled by this conservative query.
2059 if (BaseOffset.isScalable())
2060 return false;
2061
2062 // Conservatively, create an address with an immediate and a
2063 // base and a scale.
2064 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2065
2066 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2067 BaseOffset, HasBaseReg, Scale);
2068 }
2069
2070namespace {
2071
2072/// An individual increment in a Chain of IV increments. Relate an IV user to
2073/// an expression that computes the IV it uses from the IV used by the previous
2074/// link in the Chain.
2075///
2076/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2077/// original IVOperand. The head of the chain's IVOperand is only valid during
2078/// chain collection, before LSR replaces IV users. During chain generation,
2079/// IncExpr can be used to find the new IVOperand that computes the same
2080/// expression.
2081 struct IVInc {
// The instruction that consumes the IV value.
2082 Instruction *UserInst;
// The IV value consumed by UserInst (head-of-chain: see class comment).
2083 Value* IVOperand;
// SCEV delta from the previous link's IV (absolute for the chain head).
2084 const SCEV *IncExpr;
2085
2086 IVInc(Instruction *U, Value *O, const SCEV *E)
2087 : UserInst(U), IVOperand(O), IncExpr(E) {}
2088 };
2089
2090 // The list of IV increments in program order. We typically add the head of a
2091 // chain without finding subsequent links.
2092 struct IVChain {
// Incs[0] is the head of the chain; subsequent entries are the increments.
// NOTE(review): this member declaration (original line 2093) was dropped by
// the HTML-to-text extraction; restored here — it is required by Incs(1,
// Head), begin()/end(), hasIncs(), add() and tailUserInst() below.
2093 SmallVector<IVInc, 1> Incs;
2094 const SCEV *ExprBase = nullptr;
2095
2096 IVChain() = default;
2097 IVChain(const IVInc &Head, const SCEV *Base)
2098 : Incs(1, Head), ExprBase(Base) {}
2099
2100 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2101
2102 // Return the first increment in the chain.
// The head entry is not itself an increment, so iteration starts at
// std::next(Incs.begin()).
2103 const_iterator begin() const {
2104 assert(!Incs.empty());
2105 return std::next(Incs.begin());
2106 }
2107 const_iterator end() const {
2108 return Incs.end();
2109 }
2110
2111 // Returns true if this chain contains any increments.
// i.e. at least one entry beyond the head.
2112 bool hasIncs() const { return Incs.size() >= 2; }
2113
2114 // Add an IVInc to the end of this chain.
2115 void add(const IVInc &X) { Incs.push_back(X); }
2116
2117 // Returns the last UserInst in the chain.
2118 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2119
2120 // Returns true if IncExpr can be profitably added to this chain.
2121 bool isProfitableIncrement(const SCEV *OperExpr,
2122 const SCEV *IncExpr,
2123 ScalarEvolution&);
2124 };
2125
2126/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2127/// between FarUsers that definitely cross IV increments and NearUsers that may
2128/// be used between IV increments.
2129 struct ChainUsers {
// Users known to be separated from the chain head by at least one increment.
2130 SmallPtrSet<Instruction*, 4> FarUsers;
// Users that may execute between increments and thus see the pre-inc value.
2131 SmallPtrSet<Instruction*, 4> NearUsers;
2132 };
2133
2134/// This class holds state for the main loop strength reduction logic.
// NOTE(review): several lines of this class were dropped by the HTML-to-text
// extraction of this listing (original lines 2145, 2172, 2183, 2276, 2284);
// each gap is flagged below. Confirm their content against upstream.
2135 class LSRInstance {
// Analyses and context, injected by the constructor.
2136 IVUsers &IU;
2137 ScalarEvolution &SE;
2138 DominatorTree &DT;
2139 LoopInfo &LI;
2140 AssumptionCache &AC;
2141 TargetLibraryInfo &TLI;
2142 const TargetTransformInfo &TTI;
2143 Loop *const L;
2144 MemorySSAUpdater *MSSAU;
// NOTE(review): original line 2145 was dropped by the extraction here.
2146 mutable SCEVExpander Rewriter;
2147 bool Changed = false;
2148 bool HardwareLoopProfitable = false;
2149
2150 /// This is the insert position that the current loop's induction variable
2151 /// increment should be placed. In simple loops, this is the latch block's
2152 /// terminator. But in more complicated cases, this is a position which will
2153 /// dominate all the in-loop post-increment users.
2154 Instruction *IVIncInsertPos = nullptr;
2155
2156 /// Interesting factors between use strides.
2157 ///
2158 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2159 /// default, a SmallDenseSet, because we need to use the full range of
2160 /// int64_ts, and there's currently no good way of doing that with
2161 /// SmallDenseSet.
2162 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2163
2164 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2165 /// the solution is not profitable.
2166 Cost BaselineCost;
2167
2168 /// Interesting use types, to facilitate truncation reuse.
2169 SmallSetVector<Type *, 4> Types;
2170
2171 /// The list of interesting uses.
// NOTE(review): the declaration on original line 2172 (presumably the
// container of LSRUse entries) was dropped by the extraction here.
2173
2174 /// Track which uses use which register candidates.
2175 RegUseTracker RegUses;
2176
2177 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2178 // have more than a few IV increment chains in a loop. Missing a Chain falls
2179 // back to normal LSR behavior for those uses.
2180 static const unsigned MaxChains = 8;
2181
2182 /// IV users can form a chain of IV increments.
// NOTE(review): the declaration on original line 2183 (presumably the
// container of IVChain entries) was dropped by the extraction here.
2184
2185 /// IV users that belong to profitable IVChains.
2186 SmallPtrSet<Use*, MaxChains> IVIncSet;
2187
2188 /// Induction variables that were generated and inserted by the SCEV Expander.
2189 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2190
2191 // Inserting instructions in the loop and using them as PHI's input could
2192 // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
2193 // corresponding incoming block is not loop exiting). So collect all such
2194 // instructions to form LCSSA for them later.
2195 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2196
// Early IV optimizations that run before the main LSR algorithm.
2197 void OptimizeShadowIV();
2198 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2199 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2200 void OptimizeLoopTermCond();
2201
// IV-increment chain collection and code generation.
2202 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2203 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2204 void FinalizeChain(IVChain &Chain);
2205 void CollectChains();
2206 void GenerateIVChain(const IVChain &Chain,
2207 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2208
2209 void CollectInterestingTypesAndFactors();
2210 void CollectFixupsAndInitialFormulae();
2211
2212 // Support for sharing of LSRUses between LSRFixups.
2213 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2214 UseMapTy UseMap;
2215
2216 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2217 LSRUse::KindType Kind, MemAccessTy AccessTy);
2218
2219 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2220 MemAccessTy AccessTy);
2221
2222 void DeleteUse(LSRUse &LU, size_t LUIdx);
2223
2224 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2225
2226 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2227 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2228 void CountRegisters(const Formula &F, size_t LUIdx);
2229 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2230 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2231
2232 void CollectLoopInvariantFixupsAndFormulae();
2233
// Formula-generation passes; each derives new candidate formulae from Base.
2234 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2235 unsigned Depth = 0);
2236
2237 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2238 const Formula &Base, unsigned Depth,
2239 size_t Idx, bool IsScaledReg = false);
2240 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2241 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2242 const Formula &Base, size_t Idx,
2243 bool IsScaledReg = false);
2244 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2245 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2246 const Formula &Base,
2247 const SmallVectorImpl<Immediate> &Worklist,
2248 size_t Idx, bool IsScaledReg = false);
2249 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2250 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2251 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2252 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2253 void GenerateCrossUseConstantOffsets();
2254 void GenerateAllReuseFormulae();
2255
2256 void FilterOutUndesirableDedicatedRegisters();
2257
// Heuristics that prune the formula search space before solving.
2258 size_t EstimateSearchSpaceComplexity() const;
2259 void NarrowSearchSpaceByDetectingSupersets();
2260 void NarrowSearchSpaceByCollapsingUnrolledCode();
2261 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2262 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2263 void NarrowSearchSpaceByFilterPostInc();
2264 void NarrowSearchSpaceByDeletingCostlyFormulas();
2265 void NarrowSearchSpaceByPickingWinnerRegs();
2266 void NarrowSearchSpaceUsingHeuristics();
2267
// Solver: pick one formula per use minimizing overall cost.
2268 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2269 Cost &SolutionCost,
2270 SmallVectorImpl<const Formula *> &Workspace,
2271 const Cost &CurCost,
2272 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2273 DenseSet<const SCEV *> &VisitedRegs) const;
2274 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2275
// NOTE(review): original line 2276 (presumably the BasicBlock::iterator
// return type of HoistInsertPosition) was dropped by the extraction here.
2277 HoistInsertPosition(BasicBlock::iterator IP,
2278 const SmallVectorImpl<Instruction *> &Inputs) const;
2279 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2280 const LSRFixup &LF,
2281 const LSRUse &LU) const;
2282
2283 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
// NOTE(review): original line 2284 (presumably a BasicBlock::iterator IP
// parameter of Expand) was dropped by the extraction here.
2285 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2286 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2287 const Formula &F,
2288 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2289 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2290 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2291 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2292
2293 public:
2294 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2295 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2296 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2297
// Whether this LSRInstance changed the IR.
2298 bool getChanged() const { return Changed; }
2299 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2300 return ScalarEvolutionIVs;
2301 }
2302
// Debug printing helpers.
2303 void print_factors_and_types(raw_ostream &OS) const;
2304 void print_fixups(raw_ostream &OS) const;
2305 void print_uses(raw_ostream &OS) const;
2306 void print(raw_ostream &OS) const;
2307 void dump() const;
2308 };
2309
2310} // end anonymous namespace
2311
2312/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2313/// the cast operation.
// Creates a parallel floating-point "shadow" IV so the per-iteration
// int-to-float cast can be removed. At most one shadow IV is created per
// invocation (the loop breaks after the first success).
// NOTE(review): original lines 2387, 2396 and 2398 were dropped by the
// HTML-to-text extraction of this listing; they are restored below, each
// being fully determined by the surrounding null-check logic.
2314 void LSRInstance::OptimizeShadowIV() {
2315 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2316 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2317 return;
2318
// Iterate with an explicit iterator: the current user may be erased below,
// so advance UI before inspecting the candidate.
2319 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2320 UI != E; /* empty */) {
2321 IVUsers::const_iterator CandidateUI = UI;
2322 ++UI;
2323 Instruction *ShadowUse = CandidateUI->getUser();
2324 Type *DestTy = nullptr;
2325 bool IsSigned = false;
2326
2327 /* If shadow use is a int->float cast then insert a second IV
2328 to eliminate this cast.
2329
2330 for (unsigned i = 0; i < n; ++i)
2331 foo((double)i);
2332
2333 is transformed into
2334
2335 double d = 0.0;
2336 for (unsigned i = 0; i < n; ++i, ++d)
2337 foo(d);
2338 */
2339 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2340 IsSigned = false;
2341 DestTy = UCast->getDestTy();
2342 }
2343 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2344 IsSigned = true;
2345 DestTy = SCast->getDestTy();
2346 }
2347 if (!DestTy) continue;
2348
2349 // If target does not support DestTy natively then do not apply
2350 // this transformation.
2351 if (!TTI.isTypeLegal(DestTy)) continue;
2352
2353 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2354 if (!PH) continue;
2355 if (PH->getNumIncomingValues() != 2) continue;
2356
2357 // If the calculation in integers overflows, the result in FP type will
2358 // differ. So we only can do this transformation if we are guaranteed to not
2359 // deal with overflowing values
2360 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2361 if (!AR) continue;
2362 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2363 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2364
// The FP type must be able to represent every IV value exactly: the source
// width must fit in the destination's mantissa.
2365 Type *SrcTy = PH->getType();
2366 int Mantissa = DestTy->getFPMantissaWidth();
2367 if (Mantissa == -1) continue;
2368 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2369 continue;
2370
2371 unsigned Entry, Latch;
2372 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2373 Entry = 0;
2374 Latch = 1;
2375 } else {
2376 Entry = 1;
2377 Latch = 0;
2378 }
2379
2380 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2381 if (!Init) continue;
2382 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2383 (double)Init->getSExtValue() :
2384 (double)Init->getZExtValue());
2385
2386 BinaryOperator *Incr =
2387 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2388 if (!Incr) continue;
2389 if (Incr->getOpcode() != Instruction::Add
2390 && Incr->getOpcode() != Instruction::Sub)
2391 continue;
2392
2393 /* Initialize new IV, double d = 0.0 in above example. */
// The step constant is whichever operand of the increment is not the PHI.
2394 ConstantInt *C = nullptr;
2395 if (Incr->getOperand(0) == PH)
2396 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2397 else if (Incr->getOperand(1) == PH)
2398 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2399 else
2400 continue;
2401
2402 if (!C) continue;
2403
2404 // Ignore negative constants, as the code below doesn't handle them
2405 // correctly. TODO: Remove this restriction.
2406 if (!C->getValue().isStrictlyPositive())
2407 continue;
2408
2409 /* Add new PHINode. */
2410 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2411 NewPH->setDebugLoc(PH->getDebugLoc());
2412
2413 /* create new increment. '++d' in above example. */
2414 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2415 BinaryOperator *NewIncr = BinaryOperator::Create(
2416 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2417 : Instruction::FSub,
2418 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2419 NewIncr->setDebugLoc(Incr->getDebugLoc());
2420
2421 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2422 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2423
2424 /* Remove cast operation */
2425 ShadowUse->replaceAllUsesWith(NewPH);
2426 ShadowUse->eraseFromParent();
2427 Changed = true;
2428 break;
2429 }
2430 }
2431
2432/// If Cond has an operand that is an expression of an IV, set the IV user and
2433/// stride information and return true, otherwise return false.
// Linear scan over all recorded IV uses; CondUse is only written on success.
2434 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2435 for (IVStrideUse &U : IU)
2436 if (U.getUser() == Cond) {
2437 // NOTE: we could handle setcc instructions with multiple uses here, but
2438 // InstCombine does it as well for simple uses, it's not clear that it
2439 // occurs enough in real life to handle.
2440 CondUse = &U;
2441 return true;
2442 }
2443 return false;
2444 }
2445
2446/// Rewrite the loop's terminating condition if it uses a max computation.
2447///
2448/// This is a narrow solution to a specific, but acute, problem. For loops
2449/// like this:
2450///
2451///   i = 0;
2452///   do {
2453///     p[i] = 0.0;
2454///   } while (++i < n);
2455///
2456/// the trip count isn't just 'n', because 'n' might not be positive. And
2457/// unfortunately this can come up even for loops where the user didn't use
2458/// a C do-while loop. For example, seemingly well-behaved top-test loops
2459/// will commonly be lowered like this:
2460///
2461///   if (n > 0) {
2462///     i = 0;
2463///     do {
2464///       p[i] = 0.0;
2465///     } while (++i < n);
2466///   }
2467///
2468/// and then it's possible for subsequent optimization to obscure the if
2469/// test in such a way that indvars can't find it.
2470///
2471/// When indvars can't find the if test in loops like this, it creates a
2472/// max expression, which allows it to give the loop a canonical
2473/// induction variable:
2474///
2475///   i = 0;
2476///   max = n < 1 ? 1 : n;
2477///   do {
2478///     p[i] = 0.0;
2479///   } while (++i != max);
2480///
2481/// Canonical induction variables are necessary because the loop passes
2482/// are designed around them. The most obvious example of this is the
2483/// LoopInfo analysis, which doesn't remember trip count values. It
2484/// expects to be able to rediscover the trip count each time it is
2485/// needed, and it does this using a simple analysis that only succeeds if
2486/// the loop has a canonical induction variable.
2487///
2488/// However, when it comes time to generate code, the maximum operation
2489/// can be quite costly, especially if it's inside of an outer loop.
2490///
2491/// This function solves this problem by detecting this type of loop and
2492/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2493/// the instructions for the maximum computation.
// NOTE(review): original lines 2549 and 2594 were dropped by the HTML-to-text
// extraction of this listing; both gaps are flagged inline below.
2494 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2495 // Check that the loop matches the pattern we're looking for.
2496 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2497 Cond->getPredicate() != CmpInst::ICMP_NE)
2498 return Cond;
2499
2500 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2501 if (!Sel || !Sel->hasOneUse()) return Cond;
2502
2503 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2504 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2505 return Cond;
2506 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2507
2508 // Add one to the backedge-taken count to get the trip count.
2509 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2510 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2511
2512 // Check for a max calculation that matches the pattern. There's no check
2513 // for ICMP_ULE here because the comparison would be with zero, which
2514 // isn't interesting.
2515 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2516 const SCEVNAryExpr *Max = nullptr;
2517 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2518 Pred = ICmpInst::ICMP_SLE;
2519 Max = S;
2520 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2521 Pred = ICmpInst::ICMP_SLT;
2522 Max = S;
2523 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2524 Pred = ICmpInst::ICMP_ULT;
2525 Max = U;
2526 } else {
2527 // No match; bail.
2528 return Cond;
2529 }
2530
2531 // To handle a max with more than two operands, this optimization would
2532 // require additional checking and setup.
2533 if (Max->getNumOperands() != 2)
2534 return Cond;
2535
2536 const SCEV *MaxLHS = Max->getOperand(0);
2537 const SCEV *MaxRHS = Max->getOperand(1);
2538
2539 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2540 // for a comparison with 1. For <= and >=, a comparison with zero.
2541 if (!MaxLHS ||
2542 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2543 return Cond;
2544
2545 // Check the relevant induction variable for conformance to
2546 // the pattern.
2547 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2548 if (!match(IV,
// NOTE(review): original line 2549 — the SCEV pattern argument of this
// match() call — was dropped by the extraction here. The assert below
// implies it matched an addrec on loop L.
2550 return Cond;
2551
2552 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2553 "Loop condition operand is an addrec in a different loop!");
2554
2555 // Check the right operand of the select, and remember it, as it will
2556 // be used in the new comparison instruction.
2557 Value *NewRHS = nullptr;
2558 if (ICmpInst::isTrueWhenEqual(Pred)) {
2559 // Look for n+1, and grab n.
2560 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2561 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2562 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2563 NewRHS = BO->getOperand(0);
2564 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2565 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2566 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2567 NewRHS = BO->getOperand(0);
2568 if (!NewRHS)
2569 return Cond;
2570 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2571 NewRHS = Sel->getOperand(1);
2572 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2573 NewRHS = Sel->getOperand(2);
2574 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2575 NewRHS = SU->getValue();
2576 else
2577 // Max doesn't match expected pattern.
2578 return Cond;
2579
2580 // Determine the new comparison opcode. It may be signed or unsigned,
2581 // and the original comparison may be either equality or inequality.
2582 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2583 Pred = CmpInst::getInversePredicate(Pred);
2584
2585 // Ok, everything looks ok to change the condition into an SLT or SGE and
2586 // delete the max calculation.
2587 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2588 Cond->getOperand(0), NewRHS, "scmp");
2589
2590 // Delete the max calculation instructions.
2591 NewCond->setDebugLoc(Cond->getDebugLoc());
2592 Cond->replaceAllUsesWith(NewCond);
2593 CondUse->setUser(NewCond);
// NOTE(review): original line 2594 was dropped by the extraction here; it
// evidently defined Cmp (used below), presumably the select's compare
// operand — confirm against upstream.
2595 Cond->eraseFromParent();
2596 Sel->eraseFromParent();
2597 if (Cmp->use_empty()) {
2598 salvageDebugInfo(*Cmp);
2599 Cmp->eraseFromParent();
2600 }
2601 return NewCond;
2602 }
2603
2604/// Change loop terminating condition to use the postinc iv when possible.
2605void
2606LSRInstance::OptimizeLoopTermCond() {
2607 SmallPtrSet<Instruction *, 4> PostIncs;
2608
2609 // We need a different set of heuristics for rotated and non-rotated loops.
2610 // If a loop is rotated then the latch is also the backedge, so inserting
2611 // post-inc expressions just before the latch is ideal. To reduce live ranges
2612 // it also makes sense to rewrite terminating conditions to use post-inc
2613 // expressions.
2614 //
2615 // If the loop is not rotated then the latch is not a backedge; the latch
2616 // check is done in the loop head. Adding post-inc expressions before the
2617 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2618 // in the loop body. In this case we do *not* want to use post-inc expressions
2619 // in the latch check, and we want to insert post-inc expressions before
2620 // the backedge.
2621 BasicBlock *LatchBlock = L->getLoopLatch();
2622 SmallVector<BasicBlock*, 8> ExitingBlocks;
2623 L->getExitingBlocks(ExitingBlocks);
2624 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2625 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2626 IVIncInsertPos = LatchBlock->getTerminator();
2627 return;
2628 }
2629
2630 // Otherwise treat this as a rotated loop.
2631 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2632 // Get the terminating condition for the loop if possible. If we
2633 // can, we want to change it to use a post-incremented version of its
2634 // induction variable, to allow coalescing the live ranges for the IV into
2635 // one register value.
2636
2637 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2638 if (!TermBr)
2639 continue;
2640 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2641 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2642 continue;
2643
2644 // Search IVUsesByStride to find Cond's IVUse if there is one.
2645 IVStrideUse *CondUse = nullptr;
2646 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2647 if (!FindIVUserForCond(Cond, CondUse))
2648 continue;
2649
2650 // If the trip count is computed in terms of a max (due to ScalarEvolution
2651 // being unable to find a sufficient guard, for example), change the loop
2652 // comparison to use SLT or ULT instead of NE.
2653 // One consequence of doing this now is that it disrupts the count-down
2654 // optimization. That's not always a bad thing though, because in such
2655 // cases it may still be worthwhile to avoid a max.
2656 Cond = OptimizeMax(Cond, CondUse);
2657
2658 // If this exiting block dominates the latch block, it may also use
2659 // the post-inc value if it won't be shared with other uses.
2660 // Check for dominance.
2661 if (!DT.dominates(ExitingBlock, LatchBlock))
2662 continue;
2663
2664 // Conservatively avoid trying to use the post-inc value in non-latch
2665 // exits if there may be pre-inc users in intervening blocks.
2666 if (LatchBlock != ExitingBlock)
2667 for (const IVStrideUse &UI : IU)
2668 // Test if the use is reachable from the exiting block. This dominator
2669 // query is a conservative approximation of reachability.
2670 if (&UI != CondUse &&
2671 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2672 // Conservatively assume there may be reuse if the quotient of their
2673 // strides could be a legal scale.
2674 const SCEV *A = IU.getStride(*CondUse, L);
2675 const SCEV *B = IU.getStride(UI, L);
2676 if (!A || !B) continue;
2677 if (SE.getTypeSizeInBits(A->getType()) !=
2678 SE.getTypeSizeInBits(B->getType())) {
2679 if (SE.getTypeSizeInBits(A->getType()) >
2680 SE.getTypeSizeInBits(B->getType()))
2681 B = SE.getSignExtendExpr(B, A->getType());
2682 else
2683 A = SE.getSignExtendExpr(A, B->getType());
2684 }
2685 if (const SCEVConstant *D =
2687 const ConstantInt *C = D->getValue();
2688 // Stride of one or negative one can have reuse with non-addresses.
2689 if (C->isOne() || C->isMinusOne())
2690 goto decline_post_inc;
2691 // Avoid weird situations.
2692 if (C->getValue().getSignificantBits() >= 64 ||
2693 C->getValue().isMinSignedValue())
2694 goto decline_post_inc;
2695 // Check for possible scaled-address reuse.
2696 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2697 MemAccessTy AccessTy =
2698 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2699 int64_t Scale = C->getSExtValue();
2700 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2701 /*BaseOffset=*/0,
2702 /*HasBaseReg=*/true, Scale,
2703 AccessTy.AddrSpace))
2704 goto decline_post_inc;
2705 Scale = -Scale;
2706 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2707 /*BaseOffset=*/0,
2708 /*HasBaseReg=*/true, Scale,
2709 AccessTy.AddrSpace))
2710 goto decline_post_inc;
2711 }
2712 }
2713 }
2714
2715 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2716 << *Cond << '\n');
2717
2718 // It's possible for the setcc instruction to be anywhere in the loop, and
2719 // possible for it to have multiple users. If it is not immediately before
2720 // the exiting block branch, move it.
2721 if (Cond->getNextNode() != TermBr) {
2722 if (Cond->hasOneUse()) {
2723 Cond->moveBefore(TermBr->getIterator());
2724 } else {
2725 // Clone the terminating condition and insert into the loopend.
2726 ICmpInst *OldCond = Cond;
2727 Cond = cast<ICmpInst>(Cond->clone());
2728 Cond->setName(L->getHeader()->getName() + ".termcond");
2729 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2730
2731 // Clone the IVUse, as the old use still exists!
2732 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2733 TermBr->replaceUsesOfWith(OldCond, Cond);
2734 }
2735 }
2736
2737 // If we get to here, we know that we can transform the setcc instruction to
2738 // use the post-incremented version of the IV, allowing us to coalesce the
2739 // live ranges for the IV correctly.
2740 CondUse->transformToPostInc(L);
2741 Changed = true;
2742
2743 PostIncs.insert(Cond);
2744 decline_post_inc:;
2745 }
2746
2747 // Determine an insertion point for the loop induction variable increment. It
2748 // must dominate all the post-inc comparisons we just set up, and it must
2749 // dominate the loop latch edge.
2750 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2751 for (Instruction *Inst : PostIncs)
2752 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2753}
2754
2755/// Determine if the given use can accommodate a fixup at the given offset and
2756/// other details. If so, update the use and return true.
2757bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2758 bool HasBaseReg, LSRUse::KindType Kind,
2759 MemAccessTy AccessTy) {
2760 Immediate NewMinOffset = LU.MinOffset;
2761 Immediate NewMaxOffset = LU.MaxOffset;
2762 MemAccessTy NewAccessTy = AccessTy;
2763
2764 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2765 // something conservative, however this can pessimize in the case that one of
2766 // the uses will have all its uses outside the loop, for example.
2767 if (LU.Kind != Kind)
2768 return false;
2769
2770 // Check for a mismatched access type, and fall back conservatively as needed.
2771 // TODO: Be less conservative when the type is similar and can use the same
2772 // addressing modes.
2773 if (Kind == LSRUse::Address) {
2774 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2775 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2776 AccessTy.AddrSpace);
2777 }
2778 }
2779
2780 // Conservatively assume HasBaseReg is true for now.
2781 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2782 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2783 LU.MaxOffset - NewOffset, HasBaseReg))
2784 return false;
2785 NewMinOffset = NewOffset;
2786 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2787 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2788 NewOffset - LU.MinOffset, HasBaseReg))
2789 return false;
2790 NewMaxOffset = NewOffset;
2791 }
2792
2793 // FIXME: We should be able to handle some level of scalable offset support
2794 // for 'void', but in order to get basic support up and running this is
2795 // being left out.
2796 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2797 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2798 return false;
2799
2800 // Update the use.
2801 LU.MinOffset = NewMinOffset;
2802 LU.MaxOffset = NewMaxOffset;
2803 LU.AccessTy = NewAccessTy;
2804 return true;
2805}
2806
2807/// Return an LSRUse index and an offset value for a fixup which needs the given
2808/// expression, with the given kind and optional access type. Either reuse an
2809/// existing use or create a new one, as needed.
2810std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2811 LSRUse::KindType Kind,
2812 MemAccessTy AccessTy) {
2813 const SCEV *Copy = Expr;
2814 Immediate Offset = ExtractImmediate(Expr, SE);
2815
2816 // Basic uses can't accept any offset, for example.
2817 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2818 Offset, /*HasBaseReg=*/ true)) {
2819 Expr = Copy;
2820 Offset = Immediate::getFixed(0);
2821 }
2822
2823 std::pair<UseMapTy::iterator, bool> P =
2824 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2825 if (!P.second) {
2826 // A use already existed with this base.
2827 size_t LUIdx = P.first->second;
2828 LSRUse &LU = Uses[LUIdx];
2829 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2830 // Reuse this use.
2831 return std::make_pair(LUIdx, Offset);
2832 }
2833
2834 // Create a new use.
2835 size_t LUIdx = Uses.size();
2836 P.first->second = LUIdx;
2837 Uses.push_back(LSRUse(Kind, AccessTy));
2838 LSRUse &LU = Uses[LUIdx];
2839
2840 LU.MinOffset = Offset;
2841 LU.MaxOffset = Offset;
2842 return std::make_pair(LUIdx, Offset);
2843}
2844
2845/// Delete the given use from the Uses list.
2846void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2847 if (&LU != &Uses.back())
2848 std::swap(LU, Uses.back());
2849 Uses.pop_back();
2850
2851 // Update RegUses.
2852 RegUses.swapAndDropUse(LUIdx, Uses.size());
2853}
2854
2855 /// Look for a use distinct from OrigLU which has a formula with the same
2856 /// registers as the given formula.
2857LSRUse *
2858LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2859 const LSRUse &OrigLU) {
2860 // Search all uses for the formula. This could be more clever.
2861 for (LSRUse &LU : Uses) {
2862 // Check whether this use is close enough to OrigLU, to see whether it's
2863 // worthwhile looking through its formulae.
2864 // Ignore ICmpZero uses because they may contain formulae generated by
2865 // GenerateICmpZeroScales, in which case adding fixup offsets may
2866 // be invalid.
2867 if (&LU != &OrigLU &&
2868 LU.Kind != LSRUse::ICmpZero &&
2869 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2870 LU.WidestFixupType == OrigLU.WidestFixupType &&
2871 LU.HasFormulaWithSameRegs(OrigF)) {
2872 // Scan through this use's formulae.
2873 for (const Formula &F : LU.Formulae) {
2874 // Check to see if this formula has the same registers and symbols
2875 // as OrigF.
2876 if (F.BaseRegs == OrigF.BaseRegs &&
2877 F.ScaledReg == OrigF.ScaledReg &&
2878 F.BaseGV == OrigF.BaseGV &&
2879 F.Scale == OrigF.Scale &&
2880 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2881 if (F.BaseOffset.isZero())
2882 return &LU;
2883 // This is the formula where all the registers and symbols matched;
2884 // there aren't going to be any others. Since we declined it, we
2885 // can skip the rest of the formulae and proceed to the next LSRUse.
2886 break;
2887 }
2888 }
2889 }
2890 }
2891
2892 // Nothing looked good.
2893 return nullptr;
2894}
2895
/// Populate Types with the effective SCEV type of every IV-user expression,
/// and Factors with small integer quotients between pairs of AddRec strides
/// for this loop.
2896 void LSRInstance::CollectInterestingTypesAndFactors() {
2897 SmallSetVector<const SCEV *, 4> Strides;
2898 
2899 // Collect interesting types and strides.
// NOTE(review): `Worklist` is used below but its declaration is not visible
// in this chunk — presumably a SmallVector<const SCEV *, N> declared here.
// Verify against upstream.
2901 for (const IVStrideUse &U : IU) {
2902 const SCEV *Expr = IU.getExpr(U);
2903 if (!Expr)
2904 continue;
2905 
2906 // Collect interesting types.
2907 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2908 
2909 // Add strides for mentioned loops.
// Walk the expression tree: AddRecs for this loop contribute their step,
// and the walk continues into AddRec start values and add operands.
2910 Worklist.push_back(Expr);
2911 do {
2912 const SCEV *S = Worklist.pop_back_val();
2913 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2914 if (AR->getLoop() == L)
2915 Strides.insert(AR->getStepRecurrence(SE));
2916 Worklist.push_back(AR->getStart());
2917 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2918 append_range(Worklist, Add->operands());
2919 }
2920 } while (!Worklist.empty());
2921 }
2922 
2923 // Compute interesting factors from the set of interesting strides.
// Each unordered pair of distinct strides is considered exactly once.
2924 for (SmallSetVector<const SCEV *, 4>::const_iterator
2925 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2926 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2927 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2928 const SCEV *OldStride = *I;
2929 const SCEV *NewStride = *NewStrideIter;
2930 
// Sign-extend the narrower stride so both sides of the division have the
// same bit width.
2931 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2932 SE.getTypeSizeInBits(NewStride->getType())) {
2933 if (SE.getTypeSizeInBits(OldStride->getType()) >
2934 SE.getTypeSizeInBits(NewStride->getType()))
2935 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2936 else
2937 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2938 }
// Record the exact quotient (in either direction) when it is a non-zero
// constant fitting in 64 bits.
2939 if (const SCEVConstant *Factor =
2940 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2941 SE, true))) {
2942 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2943 Factors.insert(Factor->getAPInt().getSExtValue());
2944 } else if (const SCEVConstant *Factor =
// NOTE(review): a line appears to be missing here — presumably the
// `dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,` half of the
// reversed division. Verify against upstream.
2946 NewStride,
2947 SE, true))) {
2948 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2949 Factors.insert(Factor->getAPInt().getSExtValue());
2950 }
2951 }
2952 
2953 // If all uses use the same type, don't bother looking for truncation-based
2954 // reuse.
2955 if (Types.size() == 1)
2956 Types.clear();
2957 
2958 LLVM_DEBUG(print_factors_and_types(dbgs()));
2959 }
2960
2961/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2962/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2963/// IVStrideUses, we could partially skip this.
2964 static User::op_iterator
// NOTE(review): the line declaring the parameters appears to be missing from
// this chunk — presumably
// `findIVOperand(User::op_iterator OI, User::op_iterator OE,`.
2966 Loop *L, ScalarEvolution &SE) {
// Scan [OI, OE) for the first operand that is an instruction whose SCEV is
// an AddRec belonging to this loop; return OE if none is found.
2967 for(; OI != OE; ++OI) {
2968 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
// Non-SCEVable operands (e.g. non-integer, non-pointer types) can't be IVs.
2969 if (!SE.isSCEVable(Oper->getType()))
2970 continue;
2971 
2972 if (const SCEVAddRecExpr *AR =
// NOTE(review): a line appears to be missing here — presumably
// `dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {`. Verify against upstream.
2974 if (AR->getLoop() == L)
2975 break;
2976 }
2977 }
2978 }
2979 return OI;
2980 }
2981
2982/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2983/// a convenient helper.
// NOTE(review): the function's signature line appears to be missing from this
// chunk — presumably `static Value *getWideOperand(Value *Oper) {`. Verify
// against upstream.
// If the operand is a truncate of a wider IV value, peek through the trunc to
// the wide value; otherwise return the operand unchanged.
2985 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2986 return Trunc->getOperand(0);
2987 return Oper;
2988 }
2989
2990/// Return an approximation of this SCEV expression's "base", or NULL for any
2991/// constant. Returning the expression itself is conservative. Returning a
2992/// deeper subexpression is more precise and valid as long as it isn't less
2993/// complex than another subexpression. For expressions involving multiple
2994/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2995/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2996/// IVInc==b-a.
2997///
2998/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2999/// SCEVUnknown, we simply return the rightmost SCEV operand.
3000static const SCEV *getExprBase(const SCEV *S) {
3001 switch (S->getSCEVType()) {
3002 default: // including scUnknown.
3003 return S;
3004 case scConstant:
3005 case scVScale:
3006 return nullptr;
3007 case scTruncate:
3008 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3009 case scZeroExtend:
3010 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3011 case scSignExtend:
3012 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3013 case scAddExpr: {
3014 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3015 // there's nothing more complex.
3016 // FIXME: not sure if we want to recognize negation.
3017 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3018 for (const SCEV *SubExpr : reverse(Add->operands())) {
3019 if (SubExpr->getSCEVType() == scAddExpr)
3020 return getExprBase(SubExpr);
3021
3022 if (SubExpr->getSCEVType() != scMulExpr)
3023 return SubExpr;
3024 }
3025 return S; // all operands are scaled, be conservative.
3026 }
3027 case scAddRecExpr:
3028 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3029 }
3030 llvm_unreachable("Unknown SCEV kind!");
3031}
3032
3033/// Return true if the chain increment is profitable to expand into a loop
3034/// invariant value, which may require its own register. A profitable chain
3035/// increment will be an offset relative to the same base. We allow such offsets
3036/// to potentially be used as chain increment as long as it's not obviously
3037/// expensive to expand using real instructions.
3038bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3039 const SCEV *IncExpr,
3040 ScalarEvolution &SE) {
3041 // Aggressively form chains when -stress-ivchain.
3042 if (StressIVChain)
3043 return true;
3044
3045 // Do not replace a constant offset from IV head with a nonconstant IV
3046 // increment.
3047 if (!isa<SCEVConstant>(IncExpr)) {
3048 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3049 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3050 return false;
3051 }
3052
3053 SmallPtrSet<const SCEV*, 8> Processed;
3054 return !isHighCostExpansion(IncExpr, Processed, SE);
3055}
3056
3057/// Return true if the number of registers needed for the chain is estimated to
3058/// be less than the number required for the individual IV users. First prohibit
3059/// any IV users that keep the IV live across increments (the Users set should
3060/// be empty). Next count the number and type of increments in the chain.
3061///
3062/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3063 /// effectively use postinc addressing modes. Only consider it profitable if the
3064/// increments can be computed in fewer registers when chained.
3065///
3066/// TODO: Consider IVInc free if it's already used in another chains.
3067 static bool isProfitableChain(IVChain &Chain,
// NOTE(review): a parameter line appears to be missing here — presumably
// `SmallPtrSetImpl<Instruction *> &Users,` (the set of users that keep the IV
// live across increments). Verify against upstream.
3069 ScalarEvolution &SE,
3070 const TargetTransformInfo &TTI) {
// Aggressively form chains when -stress-ivchain.
3071 if (StressIVChain)
3072 return true;
3073 
3074 if (!Chain.hasIncs())
3075 return false;
3076 
// Any remaining user keeps the IV live across increments, which defeats the
// point of chaining; bail out.
3077 if (!Users.empty()) {
3078 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3079 for (Instruction *Inst
3080 : Users) { dbgs() << " " << *Inst << "\n"; });
3081 return false;
3082 }
3083 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3084 
3085 // The chain itself may require a register, so initialize cost to 1.
3086 int cost = 1;
3087 
3088 // A complete chain likely eliminates the need for keeping the original IV in
3089 // a register. LSR does not currently know how to form a complete chain unless
3090 // the header phi already exists.
3091 if (isa<PHINode>(Chain.tailUserInst())
3092 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3093 --cost;
3094 }
3095 const SCEV *LastIncExpr = nullptr;
3096 unsigned NumConstIncrements = 0;
3097 unsigned NumVarIncrements = 0;
3098 unsigned NumReusedIncrements = 0;
3099 
// The target may declare certain chain elements unconditionally profitable.
3100 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3101 return true;
3102 
// Classify every increment as zero, constant, reused-variable, or fresh
// variable, tallying each category for the cost estimate below.
3103 for (const IVInc &Inc : Chain) {
3104 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3105 return true;
3106 if (Inc.IncExpr->isZero())
3107 continue;
3108 
3109 // Incrementing by zero or some constant is neutral. We assume constants can
3110 // be folded into an addressing mode or an add's immediate operand.
3111 if (isa<SCEVConstant>(Inc.IncExpr)) {
3112 ++NumConstIncrements;
3113 continue;
3114 }
3115 
3116 if (Inc.IncExpr == LastIncExpr)
3117 ++NumReusedIncrements;
3118 else
3119 ++NumVarIncrements;
3120 
3121 LastIncExpr = Inc.IncExpr;
3122 }
3123 // An IV chain with a single increment is handled by LSR's postinc
3124 // uses. However, a chain with multiple increments requires keeping the IV's
3125 // value live longer than it needs to be if chained.
3126 if (NumConstIncrements > 1)
3127 --cost;
3128 
3129 // Materializing increment expressions in the preheader that didn't exist in
3130 // the original code may cost a register. For example, sign-extended array
3131 // indices can produce ridiculous increments like this:
3132 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3133 cost += NumVarIncrements;
3134 
3135 // Reusing variable increments likely saves a register to hold the multiple of
3136 // the stride.
3137 cost -= NumReusedIncrements;
3138 
3139 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3140 << "\n");
3141 
// Profitable only when the estimated net register cost is negative.
3142 return cost < 0;
3143 }
3144
3145/// Add this IV user to an existing chain or make it the head of a new chain.
3146 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3147 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3148 // When IVs are used as types of varying widths, they are generally converted
3149 // to a wider type with some uses remaining narrow under a (free) trunc.
3150 Value *const NextIV = getWideOperand(IVOper);
3151 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3152 const SCEV *const OperExprBase = getExprBase(OperExpr);
3153 
3154 // Visit all existing chains. Check if its IVOper can be computed as a
3155 // profitable loop invariant increment from the last link in the Chain.
3156 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3157 const SCEV *LastIncExpr = nullptr;
3158 for (; ChainIdx < NChains; ++ChainIdx) {
3159 IVChain &Chain = IVChainVec[ChainIdx];
3160 
3161 // Prune the solution space aggressively by checking that both IV operands
3162 // are expressions that operate on the same unscaled SCEVUnknown. This
3163 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3164 // first avoids creating extra SCEV expressions.
3165 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3166 continue;
3167 
3168 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand)
3169 if (PrevIV->getType() != NextIV->getType())
3170 continue;
3171 
3172 // A phi node terminates a chain.
3173 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3174 continue;
3175 
3176 // The increment must be loop-invariant so it can be kept in a register.
3177 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3178 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3179 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3180 continue;
3181 
// First chain whose increment passes the profitability check wins.
3182 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3183 LastIncExpr = IncExpr;
3184 break;
3185 }
3186 }
3187 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3188 // bother for phi nodes, because they must be last in the chain.
3189 if (ChainIdx == NChains) {
3190 if (isa<PHINode>(UserInst))
3191 return;
3192 if (NChains >= MaxChains && !StressIVChain) {
3193 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3194 return;
3195 }
3196 LastIncExpr = OperExpr;
3197 // IVUsers may have skipped over sign/zero extensions. We don't currently
3198 // attempt to form chains involving extensions unless they can be hoisted
3199 // into this loop's AddRec.
3200 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3201 return;
3202 ++NChains;
3203 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3204 OperExprBase));
3205 ChainUsersVec.resize(NChains);
3206 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3207 << ") IV=" << *LastIncExpr << "\n");
3208 } else {
3209 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3210 << ") IV+" << *LastIncExpr << "\n");
3211 // Add this IV user to the end of the chain.
3212 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3213 }
3214 IVChain &Chain = IVChainVec[ChainIdx];
3215 
3216 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3217 // This chain's NearUsers become FarUsers.
// (A zero increment means the new link computes the same value, so the
// previous near users remain near.)
3218 if (!LastIncExpr->isZero()) {
3219 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3220 NearUsers.clear();
3221 }
3222 
3223 // All other uses of IVOperand become near uses of the chain.
3224 // We currently ignore intermediate values within SCEV expressions, assuming
3225 // they will eventually be used by the current chain, or can be computed
3226 // from one of the chain increments. To be more precise we could
3227 // transitively follow its user and only add leaf IV users to the set.
3228 for (User *U : IVOper->users()) {
3229 Instruction *OtherUse = dyn_cast<Instruction>(U);
3230 if (!OtherUse)
3231 continue;
3232 // Uses in the chain will no longer be uses if the chain is formed.
3233 // Include the head of the chain in this iteration (not Chain.begin()).
3234 IVChain::const_iterator IncIter = Chain.Incs.begin();
3235 IVChain::const_iterator IncEnd = Chain.Incs.end();
3236 for( ; IncIter != IncEnd; ++IncIter) {
3237 if (IncIter->UserInst == OtherUse)
3238 break;
3239 }
3240 if (IncIter != IncEnd)
3241 continue;
3242 
// Skip non-leaf IV users (their value is folded into a SCEV expression).
3243 if (SE.isSCEVable(OtherUse->getType())
3244 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3245 && IU.isIVUserOrOperand(OtherUse)) {
3246 continue;
3247 }
3248 NearUsers.insert(OtherUse);
3249 }
3250 
3251 // Since this user is part of the chain, it's no longer considered a use
3252 // of the chain.
3253 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3254 }
3255
3256/// Populate the vector of Chains.
3257///
3258/// This decreases ILP at the architecture level. Targets with ample registers,
3259/// multiple memory ports, and no register renaming probably don't want
3260/// this. However, such targets should probably disable LSR altogether.
3261///
3262/// The job of LSR is to make a reasonable choice of induction variables across
3263/// the loop. Subsequent passes can easily "unchain" computation exposing more
3264/// ILP *within the loop* if the target wants it.
3265///
3266/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3267/// will not reorder memory operations, it will recognize this as a chain, but
3268/// will generate redundant IV increments. Ideally this would be corrected later
3269/// by a smart scheduler:
3270/// = A[i]
3271/// = A[i+x]
3272/// A[i] =
3273/// A[i+x] =
3274///
3275/// TODO: Walk the entire domtree within this loop, not just the path to the
3276/// loop latch. This will discover chains on side paths, but requires
3277/// maintaining multiple copies of the Chains state.
3278 void LSRInstance::CollectChains() {
3279 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3280 SmallVector<ChainUsers, 8> ChainUsersVec;
3281 
// Build the dominator-tree path from the latch up to the header; reversed
// below, it yields a header-to-latch program-order walk.
3282 SmallVector<BasicBlock *,8> LatchPath;
3283 BasicBlock *LoopHeader = L->getHeader();
3284 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3285 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3286 LatchPath.push_back(Rung->getBlock());
3287 }
3288 LatchPath.push_back(LoopHeader);
3289 
3290 // Walk the instruction stream from the loop header to the loop latch.
3291 for (BasicBlock *BB : reverse(LatchPath)) {
3292 for (Instruction &I : *BB) {
3293 // Skip instructions that weren't seen by IVUsers analysis.
3294 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3295 continue;
3296 
3297 // Ignore users that are part of a SCEV expression. This way we only
3298 // consider leaf IV Users. This effectively rediscovers a portion of
3299 // IVUsers analysis but in program order this time.
3300 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3301 continue;
3302 
3303 // Remove this instruction from any NearUsers set it may be in.
3304 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3305 ChainIdx < NChains; ++ChainIdx) {
3306 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3307 }
3308 // Search for operands that can be chained.
// Each distinct IV operand of I is offered to ChainInstruction once.
3309 SmallPtrSet<Instruction*, 4> UniqueOperands;
3310 User::op_iterator IVOpEnd = I.op_end();
3311 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3312 while (IVOpIter != IVOpEnd) {
3313 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3314 if (UniqueOperands.insert(IVOpInst).second)
3315 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3316 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3317 }
3318 } // Continue walking down the instructions.
3319 } // Continue walking down the domtree.
3320 // Visit phi backedges to determine if the chain can generate the IV postinc.
3321 for (PHINode &PN : L->getHeader()->phis()) {
3322 if (!SE.isSCEVable(PN.getType()))
3323 continue;
3324 
3325 Instruction *IncV =
3326 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3327 if (IncV)
3328 ChainInstruction(&PN, IncV, ChainUsersVec);
3329 }
3330 // Remove any unprofitable chains.
// Compact IVChainVec in place: profitable chains are finalized and kept,
// the rest are dropped by the final resize.
3331 unsigned ChainIdx = 0;
3332 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3333 UsersIdx < NChains; ++UsersIdx) {
3334 if (!isProfitableChain(IVChainVec[UsersIdx],
3335 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3336 continue;
3337 // Preserve the chain at UsesIdx.
3338 if (ChainIdx != UsersIdx)
3339 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3340 FinalizeChain(IVChainVec[ChainIdx]);
3341 ++ChainIdx;
3342 }
3343 IVChainVec.resize(ChainIdx);
3344 }
3345
3346void LSRInstance::FinalizeChain(IVChain &Chain) {
3347 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3348 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3349
3350 for (const IVInc &Inc : Chain) {
3351 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3352 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3353 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3354 IVIncSet.insert(UseI);
3355 }
3356}
3357
3358/// Return true if the IVInc can be folded into an addressing mode.
3359static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3360 Value *Operand, const TargetTransformInfo &TTI) {
3361 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3362 Immediate IncOffset = Immediate::getZero();
3363 if (IncConst) {
3364 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3365 return false;
3366 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3367 } else {
3368 // Look for mul(vscale, constant), to detect a scalable offset.
3369 const APInt *C;
3370 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3371 C->getSignificantBits() > 64)
3372 return false;
3373 IncOffset = Immediate::getScalable(C->getSExtValue());
3374 }
3375
3376 if (!isAddressUse(TTI, UserInst, Operand))
3377 return false;
3378
3379 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3380 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3381 IncOffset, /*HasBaseReg=*/false))
3382 return false;
3383
3384 return true;
3385}
3386
3387/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3388/// user's operand from the previous IV user's operand.
3389 void LSRInstance::GenerateIVChain(const IVChain &Chain,
3390 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3391 // Find the new IVOperand for the head of the chain. It may have been replaced
3392 // by LSR.
3393 const IVInc &Head = Chain.Incs[0];
3394 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3395 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3396 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3397 IVOpEnd, L, SE);
3398 Value *IVSrc = nullptr;
3399 while (IVOpIter != IVOpEnd) {
3400 IVSrc = getWideOperand(*IVOpIter);
3401 
3402 // If this operand computes the expression that the chain needs, we may use
3403 // it. (Check this after setting IVSrc which is used below.)
3404 //
3405 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3406 // narrow for the chain, so we can no longer use it. We do allow using a
3407 // wider phi, assuming the LSR checked for free truncation. In that case we
3408 // should already have a truncate on this operand such that
3409 // getSCEV(IVSrc) == IncExpr.
3410 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3411 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3412 break;
3413 }
3414 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3415 }
3416 if (IVOpIter == IVOpEnd) {
3417 // Gracefully give up on this chain.
3418 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3419 return;
3420 }
3421 assert(IVSrc && "Failed to find IV chain source");
3422 
3423 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3424 Type *IVTy = IVSrc->getType();
3425 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
// Accum tracks the running sum of increments from the chain head;
// LeftOverExpr accumulates increments not yet materialized.
3426 const SCEV *LeftOverExpr = nullptr;
3427 const SCEV *Accum = SE.getZero(IntTy);
// NOTE(review): the declaration of `Bases` appears to be missing from this
// chunk — presumably a SmallVector of (accumulated SCEV, IV value) pairs.
// Verify against upstream.
3429 Bases.emplace_back(Accum, IVSrc);
3430 
3431 for (const IVInc &Inc : Chain) {
3432 Instruction *InsertPt = Inc.UserInst;
// Phi users must be materialized at the latch, after the increment.
3433 if (isa<PHINode>(InsertPt))
3434 InsertPt = L->getLoopLatch()->getTerminator();
3435 
3436 // IVOper will replace the current IV User's operand. IVSrc is the IV
3437 // value currently held in a register.
3438 Value *IVOper = IVSrc;
3439 if (!Inc.IncExpr->isZero()) {
3440 // IncExpr was the result of subtraction of two narrow values, so must
3441 // be signed.
3442 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3443 Accum = SE.getAddExpr(Accum, IncExpr);
3444 LeftOverExpr = LeftOverExpr ?
3445 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3446 }
3447 
3448 // Look through each base to see if any can produce a nice addressing mode.
3449 bool FoundBase = false;
3450 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3451 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3452 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3453 if (!Remainder->isZero()) {
3454 Rewriter.clearPostInc();
3455 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3456 const SCEV *IVOperExpr =
3457 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3458 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3459 } else {
3460 IVOper = MapIVOper;
3461 }
3462 
3463 FoundBase = true;
3464 break;
3465 }
3466 }
3467 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3468 // Expand the IV increment.
3469 Rewriter.clearPostInc();
3470 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3471 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3472 SE.getUnknown(IncV));
3473 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3474 
3475 // If an IV increment can't be folded, use it as the next IV value.
3476 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3477 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3478 Bases.emplace_back(Accum, IVOper);
3479 IVSrc = IVOper;
3480 LeftOverExpr = nullptr;
3481 }
3482 }
// Narrow the materialized value back to the user's operand type if needed.
3483 Type *OperTy = Inc.IVOperand->getType();
3484 if (IVTy != OperTy) {
3485 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3486 "cannot extend a chained IV");
3487 IRBuilder<> Builder(InsertPt);
3488 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3489 }
3490 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3491 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3492 DeadInsts.emplace_back(OperandIsInstr);
3493 }
3494 // If LSR created a new, wider phi, we may also replace its postinc. We only
3495 // do this if we also found a wide value for the head of the chain.
3496 if (isa<PHINode>(Chain.tailUserInst())) {
3497 for (PHINode &Phi : L->getHeader()->phis()) {
3498 if (Phi.getType() != IVSrc->getType())
3499 continue;
// NOTE(review): a line appears to be missing here — presumably
// `Instruction *PostIncV = dyn_cast<Instruction>(`. Verify against upstream.
3501 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3502 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3503 continue;
3504 Value *IVOper = IVSrc;
3505 Type *PostIncTy = PostIncV->getType();
3506 if (IVTy != PostIncTy) {
3507 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3508 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3509 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3510 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3511 }
3512 Phi.replaceUsesOfWith(PostIncV, IVOper);
3513 DeadInsts.emplace_back(PostIncV);
3514 }
3515 }
3516 }
3517
3518void LSRInstance::CollectFixupsAndInitialFormulae() {
3519 BranchInst *ExitBranch = nullptr;
3520 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3521
3522 // For calculating baseline cost
3523 SmallPtrSet<const SCEV *, 16> Regs;
3524 DenseSet<const SCEV *> VisitedRegs;
3525 DenseSet<size_t> VisitedLSRUse;
3526
3527 for (const IVStrideUse &U : IU) {
3528 Instruction *UserInst = U.getUser();
3529 // Skip IV users that are part of profitable IV Chains.
3530 User::op_iterator UseI =
3531 find(UserInst->operands(), U.getOperandValToReplace());
3532 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3533 if (IVIncSet.count(UseI)) {
3534 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3535 continue;
3536 }
3537
3538 LSRUse::KindType Kind = LSRUse::Basic;
3539 MemAccessTy AccessTy;
3540 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3541 Kind = LSRUse::Address;
3542 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3543 }
3544
3545 const SCEV *S = IU.getExpr(U);
3546 if (!S)
3547 continue;
3548 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3549
3550 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3551 // (N - i == 0), and this allows (N - i) to be the expression that we work
3552 // with rather than just N or i, so we can consider the register
3553 // requirements for both N and i at the same time. Limiting this code to
3554 // equality icmps is not a problem because all interesting loops use
3555 // equality icmps, thanks to IndVarSimplify.
3556 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3557 // If CI can be saved in some target, like replaced inside hardware loop
3558 // in PowerPC, no need to generate initial formulae for it.
3559 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3560 continue;
3561 if (CI->isEquality()) {
3562 // Swap the operands if needed to put the OperandValToReplace on the
3563 // left, for consistency.
3564 Value *NV = CI->getOperand(1);
3565 if (NV == U.getOperandValToReplace()) {
3566 CI->setOperand(1, CI->getOperand(0));
3567 CI->setOperand(0, NV);
3568 NV = CI->getOperand(1);
3569 Changed = true;
3570 }
3571
3572 // x == y --> x - y == 0
3573 const SCEV *N = SE.getSCEV(NV);
3574 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3575 (!NV->getType()->isPointerTy() ||
3576 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3577 // S is normalized, so normalize N before folding it into S
3578 // to keep the result normalized.
3579 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3580 if (!N)
3581 continue;
3582 Kind = LSRUse::ICmpZero;
3583 S = SE.getMinusSCEV(N, S);
3584 } else if (L->isLoopInvariant(NV) &&
3585 (!isa<Instruction>(NV) ||
3586 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3587 !NV->getType()->isPointerTy()) {
3588 // If we can't generally expand the expression (e.g. it contains
3589 // a divide), but it is already at a loop invariant point before the
3590 // loop, wrap it in an unknown (to prevent the expander from trying
3591 // to re-expand in a potentially unsafe way.) The restriction to
3592 // integer types is required because the unknown hides the base, and
3593 // SCEV can't compute the difference of two unknown pointers.
3594 N = SE.getUnknown(NV);
3595 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3596 if (!N)
3597 continue;
3598 Kind = LSRUse::ICmpZero;
3599 S = SE.getMinusSCEV(N, S);
3601 }
3602
3603 // -1 and the negations of all interesting strides (except the negation
3604 // of -1) are now also interesting.
3605 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3606 if (Factors[i] != -1)
3607 Factors.insert(-(uint64_t)Factors[i]);
3608 Factors.insert(-1);
3609 }
3610 }
3611
3612 // Get or create an LSRUse.
3613 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3614 size_t LUIdx = P.first;
3615 Immediate Offset = P.second;
3616 LSRUse &LU = Uses[LUIdx];
3617
3618 // Record the fixup.
3619 LSRFixup &LF = LU.getNewFixup();
3620 LF.UserInst = UserInst;
3621 LF.OperandValToReplace = U.getOperandValToReplace();
3622 LF.PostIncLoops = TmpPostIncLoops;
3623 LF.Offset = Offset;
3624 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3625 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3626
3627 // Create SCEV as Formula for calculating baseline cost
3628 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3629 Formula F;
3630 F.initialMatch(S, L, SE);
3631 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3632 HardwareLoopProfitable);
3633 VisitedLSRUse.insert(LUIdx);
3634 }
3635
3636 if (!LU.WidestFixupType ||
3637 SE.getTypeSizeInBits(LU.WidestFixupType) <
3638 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3639 LU.WidestFixupType = LF.OperandValToReplace->getType();
3640
3641 // If this is the first use of this LSRUse, give it a formula.
3642 if (LU.Formulae.empty()) {
3643 InsertInitialFormula(S, LU, LUIdx);
3644 CountRegisters(LU.Formulae.back(), LUIdx);
3645 }
3646 }
3647
3648 LLVM_DEBUG(print_fixups(dbgs()));
3649}
3650
3651/// Insert a formula for the given expression into the given use, separating out
3652/// loop-variant portions from loop-invariant and loop-computable portions.
3653void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3654 size_t LUIdx) {
3655 // Mark uses whose expressions cannot be expanded.
3656 if (!Rewriter.isSafeToExpand(S))
3657 LU.RigidFormula = true;
3658
3659 Formula F;
3660 F.initialMatch(S, L, SE);
3661 bool Inserted = InsertFormula(LU, LUIdx, F);
3662 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3663}
3664
3665/// Insert a simple single-register formula for the given expression into the
3666/// given use.
3667void
3668LSRInstance::InsertSupplementalFormula(const SCEV *S,
3669 LSRUse &LU, size_t LUIdx) {
3670 Formula F;
3671 F.BaseRegs.push_back(S);
3672 F.HasBaseReg = true;
3673 bool Inserted = InsertFormula(LU, LUIdx, F);
3674 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3675}
3676
3677/// Note which registers are used by the given formula, updating RegUses.
3678void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3679 if (F.ScaledReg)
3680 RegUses.countRegister(F.ScaledReg, LUIdx);
3681 for (const SCEV *BaseReg : F.BaseRegs)
3682 RegUses.countRegister(BaseReg, LUIdx);
3683}
3684
3685/// If the given formula has not yet been inserted, add it to the list, and
3686/// return true. Return false otherwise.
3687bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3688 // Do not insert formula that we will not be able to expand.
3689 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3690 "Formula is illegal");
3691
3692 if (!LU.InsertFormula(F, *L))
3693 return false;
3694
3695 CountRegisters(F, LUIdx);
3696 return true;
3697}
3698
3699/// Test whether this fixup will be executed each time the corresponding IV
3700/// increment instruction is executed.
3701bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3702 // If the fixup block dominates the IV increment block then there is no path
3703 // through the loop to the increment that doesn't pass through the fixup.
3704 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3705}
3706
3707/// Check for other uses of loop-invariant values which we're tracking. These
3708/// other uses will pin these values in registers, making them less profitable
3709/// for elimination.
3710/// TODO: This currently misses non-constant addrec step registers.
3711/// TODO: Should this give more weight to users inside the loop?
3712void
3713LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3714 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3715 SmallPtrSet<const SCEV *, 32> Visited;
3716
3717 // Don't collect outside uses if we are favoring postinc - the instructions in
3718 // the loop are more important than the ones outside of it.
3719 if (AMK == TTI::AMK_PostIndexed)
3720 return;
3721
3722 while (!Worklist.empty()) {
3723 const SCEV *S = Worklist.pop_back_val();
3724
3725 // Don't process the same SCEV twice
3726 if (!Visited.insert(S).second)
3727 continue;
3728
3729 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3730 append_range(Worklist, N->operands());
3731 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3732 Worklist.push_back(C->getOperand());
3733 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3734 Worklist.push_back(D->getLHS());
3735 Worklist.push_back(D->getRHS());
3736 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3737 const Value *V = US->getValue();
3738 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3739 // Look for instructions defined outside the loop.
3740 if (L->contains(Inst)) continue;
3741 } else if (isa<Constant>(V))
3742 // Constants can be re-materialized.
3743 continue;
3744 for (const Use &U : V->uses()) {
3745 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3746 // Ignore non-instructions.
3747 if (!UserInst)
3748 continue;
3749 // Don't bother if the instruction is an EHPad.
3750 if (UserInst->isEHPad())
3751 continue;
3752 // Ignore instructions in other functions (as can happen with
3753 // Constants).
3754 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3755 continue;
3756 // Ignore instructions not dominated by the loop.
3757 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3758 UserInst->getParent() :
3759 cast<PHINode>(UserInst)->getIncomingBlock(
3761 if (!DT.dominates(L->getHeader(), UseBB))
3762 continue;
3763 // Don't bother if the instruction is in a BB which ends in an EHPad.
3764 if (UseBB->getTerminator()->isEHPad())
3765 continue;
3766
3767 // Ignore cases in which the currently-examined value could come from
3768 // a basic block terminated with an EHPad. This checks all incoming
3769 // blocks of the phi node since it is possible that the same incoming
3770 // value comes from multiple basic blocks, only some of which may end
3771 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3772 // pass would try to insert instructions into an EHPad, hitting an
3773 // assertion.
3774 if (isa<PHINode>(UserInst)) {
3775 const auto *PhiNode = cast<PHINode>(UserInst);
3776 bool HasIncompatibleEHPTerminatedBlock = false;
3777 llvm::Value *ExpectedValue = U;
3778 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3779 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3780 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3781 HasIncompatibleEHPTerminatedBlock = true;
3782 break;
3783 }
3784 }
3785 }
3786 if (HasIncompatibleEHPTerminatedBlock) {
3787 continue;
3788 }
3789 }
3790
3791 // Don't bother rewriting PHIs in catchswitch blocks.
3792 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3793 continue;
3794 // Ignore uses which are part of other SCEV expressions, to avoid
3795 // analyzing them multiple times.
3796 if (SE.isSCEVable(UserInst->getType())) {
3797 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3798 // If the user is a no-op, look through to its uses.
3799 if (!isa<SCEVUnknown>(UserS))
3800 continue;
3801 if (UserS == US) {
3802 Worklist.push_back(
3803 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3804 continue;
3805 }
3806 }
3807 // Ignore icmp instructions which are already being analyzed.
3808 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3809 unsigned OtherIdx = !U.getOperandNo();
3810 Value *OtherOp = ICI->getOperand(OtherIdx);
3811 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3812 continue;
3813 }
3814
3815 // Do not consider uses inside lifetime intrinsics. These are not
3816 // actually materialized.
3817 if (UserInst->isLifetimeStartOrEnd())
3818 continue;
3819
3820 std::pair<size_t, Immediate> P =
3821 getUse(S, LSRUse::Basic, MemAccessTy());
3822 size_t LUIdx = P.first;
3823 Immediate Offset = P.second;
3824 LSRUse &LU = Uses[LUIdx];
3825 LSRFixup &LF = LU.getNewFixup();
3826 LF.UserInst = const_cast<Instruction *>(UserInst);
3827 LF.OperandValToReplace = U;
3828 LF.Offset = Offset;
3829 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3830 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3831 if (!LU.WidestFixupType ||
3832 SE.getTypeSizeInBits(LU.WidestFixupType) <
3833 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3834 LU.WidestFixupType = LF.OperandValToReplace->getType();
3835 InsertSupplementalFormula(US, LU, LUIdx);
3836 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3837 break;
3838 }
3839 }
3840 }
3841}
3842
3843/// Split S into subexpressions which can be pulled out into separate
3844/// registers. If C is non-null, multiply each subexpression by C.
3845///
3846/// Return remainder expression after factoring the subexpressions captured by
3847/// Ops. If Ops is complete, return NULL.
3848static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3850 const Loop *L,
3851 ScalarEvolution &SE,
3852 unsigned Depth = 0) {
3853 // Arbitrarily cap recursion to protect compile time.
3854 if (Depth >= 3)
3855 return S;
3856
3857 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3858 // Break out add operands.
3859 for (const SCEV *S : Add->operands()) {
3860 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3861 if (Remainder)
3862 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3863 }
3864 return nullptr;
3865 }
3866 const SCEV *Start, *Step;
3867 const SCEVConstant *Op0;
3868 const SCEV *Op1;
3869 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3870 // Split a non-zero base out of an addrec.
3871 if (Start->isZero())
3872 return S;
3873
3874 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3875 // Split the non-zero AddRec unless it is part of a nested recurrence that
3876 // does not pertain to this loop.
3877 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3878 !isa<SCEVAddRecExpr>(Remainder))) {
3879 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3880 Remainder = nullptr;
3881 }
3882 if (Remainder != Start) {
3883 if (!Remainder)
3884 Remainder = SE.getConstant(S->getType(), 0);
3885 return SE.getAddRecExpr(Remainder, Step,
3886 cast<SCEVAddRecExpr>(S)->getLoop(),
3887 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3889 }
3890 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3891 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3892 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3893 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3894 if (Remainder)
3895 Ops.push_back(SE.getMulExpr(C, Remainder));
3896 return nullptr;
3897 }
3898 return S;
3899}
3900
3901/// Return true if the SCEV represents a value that may end up as a
3902/// post-increment operation.
3904 LSRUse &LU, const SCEV *S, const Loop *L,
3905 ScalarEvolution &SE) {
3906 if (LU.Kind != LSRUse::Address ||
3907 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3908 return false;
3909 const SCEV *Start;
3910 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3911 return false;
3912 // Check if a post-indexed load/store can be used.
3913 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3914 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3915 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3916 return true;
3917 }
3918 return false;
3919}
3920
3921/// Helper function for LSRInstance::GenerateReassociations.
3922void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3923 const Formula &Base,
3924 unsigned Depth, size_t Idx,
3925 bool IsScaledReg) {
3926 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3927 // Don't generate reassociations for the base register of a value that
3928 // may generate a post-increment operator. The reason is that the
3929 // reassociations cause extra base+register formula to be created,
3930 // and possibly chosen, but the post-increment is more efficient.
3931 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3932 return;
3934 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3935 if (Remainder)
3936 AddOps.push_back(Remainder);
3937
3938 if (AddOps.size() == 1)
3939 return;
3940
3942 JE = AddOps.end();
3943 J != JE; ++J) {
3944 // Loop-variant "unknown" values are uninteresting; we won't be able to
3945 // do anything meaningful with them.
3946 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3947 continue;
3948
3949 // Don't pull a constant into a register if the constant could be folded
3950 // into an immediate field.
3951 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3952 LU.AccessTy, *J, Base.getNumRegs() > 1))
3953 continue;
3954
3955 // Collect all operands except *J.
3956 SmallVector<const SCEV *, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3957 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3958
3959 // Don't leave just a constant behind in a register if the constant could
3960 // be folded into an immediate field.
3961 if (InnerAddOps.size() == 1 &&
3962 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3963 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3964 continue;
3965
3966 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3967 if (InnerSum->isZero())
3968 continue;
3969 Formula F = Base;
3970
3971 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3972 continue;
3973
3974 // Add the remaining pieces of the add back into the new formula.
3975 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3976 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3977 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3978 InnerSumSC->getValue()->getZExtValue())) {
3979 F.UnfoldedOffset =
3980 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3981 InnerSumSC->getValue()->getZExtValue());
3982 if (IsScaledReg) {
3983 F.ScaledReg = nullptr;
3984 F.Scale = 0;
3985 } else
3986 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3987 } else if (IsScaledReg)
3988 F.ScaledReg = InnerSum;
3989 else
3990 F.BaseRegs[Idx] = InnerSum;
3991
3992 // Add J as its own register, or an unfolded immediate.
3993 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3994 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3995 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3996 SC->getValue()->getZExtValue()))
3997 F.UnfoldedOffset =
3998 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3999 SC->getValue()->getZExtValue());
4000 else
4001 F.BaseRegs.push_back(*J);
4002 // We may have changed the number of register in base regs, adjust the
4003 // formula accordingly.
4004 F.canonicalize(*L);
4005
4006 if (InsertFormula(LU, LUIdx, F))
4007 // If that formula hadn't been seen before, recurse to find more like
4008 // it.
4009 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
4010 // Because just Depth is not enough to bound compile time.
4011 // This means that every time AddOps.size() is greater 16^x we will add
4012 // x to Depth.
4013 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4014 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4015 }
4016}
4017
4018/// Split out subexpressions from adds and the bases of addrecs.
4019void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4020 Formula Base, unsigned Depth) {
4021 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4022 // Arbitrarily cap recursion to protect compile time.
4023 if (Depth >= 3)
4024 return;
4025
4026 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4027 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4028
4029 if (Base.Scale == 1)
4030 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4031 /* Idx */ -1, /* IsScaledReg */ true);
4032}
4033
4034/// Generate a formula consisting of all of the loop-dominating registers added
4035/// into a single register.
4036void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4037 Formula Base) {
4038 // This method is only interesting on a plurality of registers.
4039 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4040 (Base.UnfoldedOffset.isNonZero()) <=
4041 1)
4042 return;
4043
4044 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4045 // processing the formula.
4046 Base.unscale();
4048 Formula NewBase = Base;
4049 NewBase.BaseRegs.clear();
4050 Type *CombinedIntegerType = nullptr;
4051 for (const SCEV *BaseReg : Base.BaseRegs) {
4052 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4053 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4054 if (!CombinedIntegerType)
4055 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4056 Ops.push_back(BaseReg);
4057 }
4058 else
4059 NewBase.BaseRegs.push_back(BaseReg);
4060 }
4061
4062 // If no register is relevant, we're done.
4063 if (Ops.size() == 0)
4064 return;
4065
4066 // Utility function for generating the required variants of the combined
4067 // registers.
4068 auto GenerateFormula = [&](const SCEV *Sum) {
4069 Formula F = NewBase;
4070
4071 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4072 // opportunity to fold something. For now, just ignore such cases
4073 // rather than proceed with zero in a register.
4074 if (Sum->isZero())
4075 return;
4076
4077 F.BaseRegs.push_back(Sum);
4078 F.canonicalize(*L);
4079 (void)InsertFormula(LU, LUIdx, F);
4080 };
4081
4082 // If we collected at least two registers, generate a formula combining them.
4083 if (Ops.size() > 1) {
4084 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4085 GenerateFormula(SE.getAddExpr(OpsCopy));
4086 }
4087
4088 // If we have an unfolded offset, generate a formula combining it with the
4089 // registers collected.
4090 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4091 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4092 Ops.push_back(SE.getConstant(CombinedIntegerType,
4093 NewBase.UnfoldedOffset.getFixedValue(), true));
4094 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4095 GenerateFormula(SE.getAddExpr(Ops));
4096 }
4097}
4098
4099/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4100void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4101 const Formula &Base, size_t Idx,
4102 bool IsScaledReg) {
4103 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4104 GlobalValue *GV = ExtractSymbol(G, SE);
4105 if (G->isZero() || !GV)
4106 return;
4107 Formula F = Base;
4108 F.BaseGV = GV;
4109 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4110 return;
4111 if (IsScaledReg)
4112 F.ScaledReg = G;
4113 else
4114 F.BaseRegs[Idx] = G;
4115 (void)InsertFormula(LU, LUIdx, F);
4116}
4117
4118/// Generate reuse formulae using symbolic offsets.
4119void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4120 Formula Base) {
4121 // We can't add a symbolic offset if the address already contains one.
4122 if (Base.BaseGV) return;
4123
4124 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4125 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4126 if (Base.Scale == 1)
4127 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4128 /* IsScaledReg */ true);
4129}
4130
4131/// Helper function for LSRInstance::GenerateConstantOffsets.
4132void LSRInstance::GenerateConstantOffsetsImpl(
4133 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4134 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4135
4136 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4137 Formula F = Base;
4138 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4139 return;
4140 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4141
4142 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4143 // Add the offset to the base register.
4144 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4145 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4146 // If it cancelled out, drop the base register, otherwise update it.
4147 if (NewG->isZero()) {
4148 if (IsScaledReg) {
4149 F.Scale = 0;
4150 F.ScaledReg = nullptr;
4151 } else
4152 F.deleteBaseReg(F.BaseRegs[Idx]);
4153 F.canonicalize(*L);
4154 } else if (IsScaledReg)
4155 F.ScaledReg = NewG;
4156 else
4157 F.BaseRegs[Idx] = NewG;
4158
4159 (void)InsertFormula(LU, LUIdx, F);
4160 }
4161 };
4162
4163 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4164
4165 // With constant offsets and constant steps, we can generate pre-inc
4166 // accesses by having the offset equal the step. So, for access #0 with a
4167 // step of 8, we generate a G - 8 base which would require the first access
4168 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4169 // for itself and hopefully becomes the base for other accesses. This means
4170 // means that a single pre-indexed access can be generated to become the new
4171 // base pointer for each iteration of the loop, resulting in no extra add/sub
4172 // instructions for pointer updating.
4173 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4174 const APInt *StepInt;
4175 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4176 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4177 : StepInt->getZExtValue();
4178
4179 for (Immediate Offset : Worklist) {
4180 if (Offset.isFixed()) {
4181 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4182 GenerateOffset(G, Offset);
4183 }
4184 }
4185 }
4186 }
4187 for (Immediate Offset : Worklist)
4188 GenerateOffset(G, Offset);
4189
4190 Immediate Imm = ExtractImmediate(G, SE);
4191 if (G->isZero() || Imm.isZero() ||
4192 !Base.BaseOffset.isCompatibleImmediate(Imm))
4193 return;
4194 Formula F = Base;
4195 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4196 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4197 return;
4198 if (IsScaledReg) {
4199 F.ScaledReg = G;
4200 } else {
4201 F.BaseRegs[Idx] = G;
4202 // We may generate non canonical Formula if G is a recurrent expr reg
4203 // related with current loop while F.ScaledReg is not.
4204 F.canonicalize(*L);
4205 }
4206 (void)InsertFormula(LU, LUIdx, F);
4207}
4208
4209/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4210void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4211 Formula Base) {
4212 // TODO: For now, just add the min and max offset, because it usually isn't
4213 // worthwhile looking at everything inbetween.
4215 Worklist.push_back(LU.MinOffset);
4216 if (LU.MaxOffset != LU.MinOffset)
4217 Worklist.push_back(LU.MaxOffset);
4218
4219 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4220 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4221 if (Base.Scale == 1)
4222 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4223 /* IsScaledReg */ true);
4224}
4225
4226/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4227/// == y -> x*c == y*c.
4228void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4229 Formula Base) {
4230 if (LU.Kind != LSRUse::ICmpZero) return;
4231
4232 // Determine the integer type for the base formula.
4233 Type *IntTy = Base.getType();
4234 if (!IntTy) return;
4235 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4236
4237 // Don't do this if there is more than one offset.
4238 if (LU.MinOffset != LU.MaxOffset) return;
4239
4240 // Check if transformation is valid. It is illegal to multiply pointer.
4241 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4242 return;
4243 for (const SCEV *BaseReg : Base.BaseRegs)
4244 if (BaseReg->getType()->isPointerTy())
4245 return;
4246 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4247
4248 // Check each interesting stride.
4249 for (int64_t Factor : Factors) {
4250 // Check that Factor can be represented by IntTy
4251 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4252 continue;
4253 // Check that the multiplication doesn't overflow.
4254 if (Base.BaseOffset.isMin() && Factor == -1)
4255 continue;
4256 // Not supporting scalable immediates.
4257 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4258 continue;
4259 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4260 assert(Factor != 0 && "Zero factor not expected!");
4261 if (NewBaseOffset.getFixedValue() / Factor !=
4262 Base.BaseOffset.getFixedValue())
4263 continue;
4264 // If the offset will be truncated at this use, check that it is in bounds.
4265 if (!IntTy->isPointerTy() &&
4266 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4267 continue;
4268
4269 // Check that multiplying with the use offset doesn't overflow.
4270 Immediate Offset = LU.MinOffset;
4271 if (Offset.isMin() && Factor == -1)
4272 continue;
4273 Offset = Offset.mulUnsigned(Factor);
4274 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4275 continue;
4276 // If the offset will be truncated at this use, check that it is in bounds.
4277 if (!IntTy->isPointerTy() &&
4278 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4279 continue;
4280
4281 Formula F = Base;
4282 F.BaseOffset = NewBaseOffset;
4283
4284 // Check that this scale is legal.
4285 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4286 continue;
4287
4288 // Compensate for the use having MinOffset built into it.
4289 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4290
4291 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4292
4293 // Check that multiplying with each base register doesn't overflow.
4294 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4295 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4296 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4297 goto next;
4298 }
4299
4300 // Check that multiplying with the scaled register doesn't overflow.
4301 if (F.ScaledReg) {
4302 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4303 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4304 continue;
4305 }
4306
4307 // Check that multiplying with the unfolded offset doesn't overflow.
4308 if (F.UnfoldedOffset.isNonZero()) {
4309 if (F.UnfoldedOffset.isMin() && Factor == -1)
4310 continue;
4311 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4312 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4313 Base.UnfoldedOffset.getFixedValue())
4314 continue;
4315 // If the offset will be truncated, check that it is in bounds.
4317 IntTy, F.UnfoldedOffset.getFixedValue()))
4318 continue;
4319 }
4320
4321 // If we make it here and it's legal, add it.
4322 (void)InsertFormula(LU, LUIdx, F);
4323 next:;
4324 }
4325}
4326
/// Generate stride factor reuse formulae by making use of scaled-offset address
/// modes, for example.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
  // Determine the integer type for the base formula.
  Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  // Try to unscale the formula to generate a better scale.
  if (Base.Scale != 0 && !Base.unscale())
    return;

  assert(Base.Scale == 0 && "unscale did not did its job!");

  // Check each interesting stride.
  for (int64_t Factor : Factors) {
    Base.Scale = Factor;
    // One base reg will be promoted to ScaledReg below, so a base register
    // remains in the formula only if there was more than one to begin with.
    Base.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
                    Base)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
                     LU.AccessTy, Base) &&
          LU.AllFixupsOutsideLoop)
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
        Base.BaseOffset.isZero() && !Base.BaseGV)
      continue;
    // For each addrec base reg, if its loop is current loop, apply the scale.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
        const SCEV *FactorS = SE.getConstant(IntTy, Factor);
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
          if (!Quotient->isZero()) {
            // TODO: This could be optimized to avoid all the copying.
            Formula F = Base;
            // The quotient becomes the scaled register; the original addrec
            // base reg it came from is removed from the base register list.
            F.ScaledReg = Quotient;
            F.deleteBaseReg(F.BaseRegs[i]);
            // The canonical representation of 1*reg is reg, which is already in
            // Base. In that case, do not try to insert the formula, it will be
            // rejected anyway.
            if (F.Scale == 1 && (F.BaseRegs.empty() ||
                (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
              continue;
            // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
            // non canonical Formula with ScaledReg's loop not being L.
            if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
              F.canonicalize(*L);
            (void)InsertFormula(LU, LUIdx, F);
          }
      }
    }
  }
}
4394
4395/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4396/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4397/// perform the extension/truncate and normalize again, as the normalized form
4398/// can result in folds that are not valid in the post-inc use contexts. The
4399/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4400static const SCEV *
4402 const SCEV *Expr, Type *ToTy,
4403 ScalarEvolution &SE) {
4404 const SCEV *Result = nullptr;
4405 for (auto &L : Loops) {
4406 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4407 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4408 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4409 if (!New || (Result && New != Result))
4410 return nullptr;
4411 Result = New;
4412 }
4413
4414 assert(Result && "failed to create expression");
4415 return Result;
4416}
4417
4418/// Generate reuse formulae from different IV types.
4419void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4420 // Don't bother truncating symbolic values.
4421 if (Base.BaseGV) return;
4422
4423 // Determine the integer type for the base formula.
4424 Type *DstTy = Base.getType();
4425 if (!DstTy) return;
4426 if (DstTy->isPointerTy())
4427 return;
4428
4429 // It is invalid to extend a pointer type so exit early if ScaledReg or
4430 // any of the BaseRegs are pointers.
4431 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4432 return;
4433 if (any_of(Base.BaseRegs,
4434 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4435 return;
4436
4438 for (auto &LF : LU.Fixups)
4439 Loops.push_back(LF.PostIncLoops);
4440
4441 for (Type *SrcTy : Types) {
4442 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4443 Formula F = Base;
4444
4445 // Sometimes SCEV is able to prove zero during ext transform. It may
4446 // happen if SCEV did not do all possible transforms while creating the
4447 // initial node (maybe due to depth limitations), but it can do them while
4448 // taking ext.
4449 if (F.ScaledReg) {
4450 const SCEV *NewScaledReg =
4451 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4452 if (!NewScaledReg || NewScaledReg->isZero())
4453 continue;
4454 F.ScaledReg = NewScaledReg;
4455 }
4456 bool HasZeroBaseReg = false;
4457 for (const SCEV *&BaseReg : F.BaseRegs) {
4458 const SCEV *NewBaseReg =
4459 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4460 if (!NewBaseReg || NewBaseReg->isZero()) {
4461 HasZeroBaseReg = true;
4462 break;
4463 }
4464 BaseReg = NewBaseReg;
4465 }
4466 if (HasZeroBaseReg)
4467 continue;
4468
4469 // TODO: This assumes we've done basic processing on all uses and
4470 // have an idea what the register usage is.
4471 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4472 continue;
4473
4474 F.canonicalize(*L);
4475 (void)InsertFormula(LU, LUIdx, F);
4476 }
4477 }
4478}
4479
namespace {

/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
/// modifications so that the search phase doesn't have to worry about the data
/// structures moving underneath it.
struct WorkItem {
  // Index of the LSRUse whose formulae will receive the new offset.
  size_t LUIdx;
  // Constant offset to apply when rewriting formulae that reference OrigReg.
  Immediate Imm;
  // The register whose formulae are rewritten in terms of (OrigReg - Imm)
  // plus an immediate; see GenerateCrossUseConstantOffsets.
  const SCEV *OrigReg;

  WorkItem(size_t LI, Immediate I, const SCEV *R)
      : LUIdx(LI), Imm(I), OrigReg(R) {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

} // end anonymous namespace
4498
4499#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4500void WorkItem::print(raw_ostream &OS) const {
4501 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4502 << " , add offset " << Imm;
4503}
4504
4505LLVM_DUMP_METHOD void WorkItem::dump() const {
4506 print(errs()); errs() << '\n';
4507}
4508#endif
4509
4510/// Look for registers which are a constant distance apart and try to form reuse
4511/// opportunities between them.
4512void LSRInstance::GenerateCrossUseConstantOffsets() {
4513 // Group the registers by their value without any added constant offset.
4514 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4515
4516 DenseMap<const SCEV *, ImmMapTy> Map;
4517 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4519 for (const SCEV *Use : RegUses) {
4520 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4521 Immediate Imm = ExtractImmediate(Reg, SE);
4522 auto Pair = Map.try_emplace(Reg);
4523 if (Pair.second)
4524 Sequence.push_back(Reg);
4525 Pair.first->second.insert(std::make_pair(Imm, Use));
4526 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4527 }
4528
4529 // Now examine each set of registers with the same base value. Build up
4530 // a list of work to do and do the work in a separate step so that we're
4531 // not adding formulae and register counts while we're searching.
4532 SmallVector<WorkItem, 32> WorkItems;
4533 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4534 UniqueItems;
4535 for (const SCEV *Reg : Sequence) {
4536 const ImmMapTy &Imms = Map.find(Reg)->second;
4537
4538 // It's not worthwhile looking for reuse if there's only one offset.
4539 if (Imms.size() == 1)
4540 continue;
4541
4542 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4543 for (const auto &Entry
4544 : Imms) dbgs()
4545 << ' ' << Entry.first;
4546 dbgs() << '\n');
4547
4548 // Examine each offset.
4549 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4550 J != JE; ++J) {
4551 const SCEV *OrigReg = J->second;
4552
4553 Immediate JImm = J->first;
4554 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4555
4556 if (!isa<SCEVConstant>(OrigReg) &&
4557 UsedByIndicesMap[Reg].count() == 1) {
4558 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4559 << '\n');
4560 continue;
4561 }
4562
4563 // Conservatively examine offsets between this orig reg a few selected
4564 // other orig regs.
4565 Immediate First = Imms.begin()->first;
4566 Immediate Last = std::prev(Imms.end())->first;
4567 if (!First.isCompatibleImmediate(Last)) {
4568 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4569 << "\n");
4570 continue;
4571 }
4572 // Only scalable if both terms are scalable, or if one is scalable and
4573 // the other is 0.
4574 bool Scalable = First.isScalable() || Last.isScalable();
4575 int64_t FI = First.getKnownMinValue();
4576 int64_t LI = Last.getKnownMinValue();
4577 // Compute (First + Last) / 2 without overflow using the fact that
4578 // First + Last = 2 * (First + Last) + (First ^ Last).
4579 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4580 // If the result is negative and FI is odd and LI even (or vice versa),
4581 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4582 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4583 ImmMapTy::const_iterator OtherImms[] = {
4584 Imms.begin(), std::prev(Imms.end()),
4585 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4586 for (const auto &M : OtherImms) {
4587 if (M == J || M == JE) continue;
4588 if (!JImm.isCompatibleImmediate(M->first))
4589 continue;
4590
4591 // Compute the difference between the two.
4592 Immediate Imm = JImm.subUnsigned(M->first);
4593 for (unsigned LUIdx : UsedByIndices.set_bits())
4594 // Make a memo of this use, offset, and register tuple.
4595 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4596 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4597 }
4598 }
4599 }
4600
4601 Map.clear();
4602 Sequence.clear();
4603 UsedByIndicesMap.clear();
4604 UniqueItems.clear();
4605
4606 // Now iterate through the worklist and add new formulae.
4607 for (const WorkItem &WI : WorkItems) {
4608 size_t LUIdx = WI.LUIdx;
4609 LSRUse &LU = Uses[LUIdx];
4610 Immediate Imm = WI.Imm;
4611 const SCEV *OrigReg = WI.OrigReg;
4612
4613 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4614 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4615 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4616
4617 // TODO: Use a more targeted data structure.
4618 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4619 Formula F = LU.Formulae[L];
4620 // FIXME: The code for the scaled and unscaled registers looks
4621 // very similar but slightly different. Investigate if they
4622 // could be merged. That way, we would not have to unscale the
4623 // Formula.
4624 F.unscale();
4625 // Use the immediate in the scaled register.
4626 if (F.ScaledReg == OrigReg) {
4627 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4628 continue;
4629 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4630 // Don't create 50 + reg(-50).
4631 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4632 if (F.referencesReg(S))
4633 continue;
4634 Formula NewF = F;
4635 NewF.BaseOffset = Offset;
4636 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4637 NewF))
4638 continue;
4639 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4640
4641 // If the new scale is a constant in a register, and adding the constant
4642 // value to the immediate would produce a value closer to zero than the
4643 // immediate itself, then the formula isn't worthwhile.
4644 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4645 // FIXME: Do we need to do something for scalable immediates here?
4646 // A scalable SCEV won't be constant, but we might still have
4647 // something in the offset? Bail out for now to be safe.
4648 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4649 continue;
4650 if (C->getValue()->isNegative() !=
4651 (NewF.BaseOffset.isLessThanZero()) &&
4652 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4653 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4654 continue;
4655 }
4656
4657 // OK, looks good.
4658 NewF.canonicalize(*this->L);
4659 (void)InsertFormula(LU, LUIdx, NewF);
4660 } else {
4661 // Use the immediate in a base register.
4662 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4663 const SCEV *BaseReg = F.BaseRegs[N];
4664 if (BaseReg != OrigReg)
4665 continue;
4666 Formula NewF = F;
4667 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4668 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4669 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4670 continue;
4671 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4672 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4673 LU.Kind, LU.AccessTy, NewF)) {
4674 if (AMK == TTI::AMK_PostIndexed &&
4675 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4676 continue;
4677 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4678 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4679 continue;
4680 NewF = F;
4681 NewF.UnfoldedOffset = NewUnfoldedOffset;
4682 }
4683 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4684
4685 // If the new formula has a constant in a register, and adding the
4686 // constant value to the immediate would produce a value closer to
4687 // zero than the immediate itself, then the formula isn't worthwhile.
4688 for (const SCEV *NewReg : NewF.BaseRegs)
4689 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4690 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4691 goto skip_formula;
4692 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4693 .abs()
4694 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4695 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4696 .countr_zero() >=
4698 NewF.BaseOffset.getFixedValue()))
4699 goto skip_formula;
4700 }
4701
4702 // Ok, looks good.
4703 NewF.canonicalize(*this->L);
4704 (void)InsertFormula(LU, LUIdx, NewF);
4705 break;
4706 skip_formula:;
4707 }
4708 }
4709 }
4710 }
4711}
4712
/// Generate formulae for each use.
void
LSRInstance::GenerateAllReuseFormulae() {
  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
  // queries are more precise.
  // Note: each inner loop caches the formula count in 'f', so formulae
  // inserted by a generator are not re-fed to that same generator.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
  }
  // Offset- and scale-based generators run in a second pass over the
  // (reassociated and combined) formula set.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateScales(LU, LUIdx, LU.Formulae[i]);
  }
  // Truncates get their own pass over the now-larger formula set.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
  }

  GenerateCrossUseConstantOffsets();

  LLVM_DEBUG(dbgs() << "\n"
                       "After generating reuse formulae:\n";
             print_uses(dbgs()));
}
4748
4749/// If there are multiple formulae with the same set of registers used
4750/// by other uses, pick the best one and delete the others.
4751void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4752 DenseSet<const SCEV *> VisitedRegs;
4753 SmallPtrSet<const SCEV *, 16> Regs;
4754 SmallPtrSet<const SCEV *, 16> LoserRegs;
4755#ifndef NDEBUG
4756 bool ChangedFormulae = false;
4757#endif
4758
4759 // Collect the best formula for each unique set of shared registers. This
4760 // is reset for each use.
4761 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4762
4763 BestFormulaeTy BestFormulae;
4764
4765 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4766 LSRUse &LU = Uses[LUIdx];
4767 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4768 dbgs() << '\n');
4769
4770 bool Any = false;
4771 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4772 FIdx != NumForms; ++FIdx) {
4773 Formula &F = LU.Formulae[FIdx];
4774
4775 // Some formulas are instant losers. For example, they may depend on
4776 // nonexistent AddRecs from other loops. These need to be filtered
4777 // immediately, otherwise heuristics could choose them over others leading
4778 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4779 // avoids the need to recompute this information across formulae using the
4780 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4781 // the corresponding bad register from the Regs set.
4782 Cost CostF(L, SE, TTI, AMK);
4783 Regs.clear();
4784 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4785 &LoserRegs);
4786 if (CostF.isLoser()) {
4787 // During initial formula generation, undesirable formulae are generated
4788 // by uses within other loops that have some non-trivial address mode or
4789 // use the postinc form of the IV. LSR needs to provide these formulae
4790 // as the basis of rediscovering the desired formula that uses an AddRec
4791 // corresponding to the existing phi. Once all formulae have been
4792 // generated, these initial losers may be pruned.
4793 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4794 dbgs() << "\n");
4795 }
4796 else {
4798 for (const SCEV *Reg : F.BaseRegs) {
4799 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4800 Key.push_back(Reg);
4801 }
4802 if (F.ScaledReg &&
4803 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4804 Key.push_back(F.ScaledReg);
4805 // Unstable sort by host order ok, because this is only used for
4806 // uniquifying.
4807 llvm::sort(Key);
4808
4809 std::pair<BestFormulaeTy::const_iterator, bool> P =
4810 BestFormulae.insert(std::make_pair(Key, FIdx));
4811 if (P.second)
4812 continue;
4813
4814 Formula &Best = LU.Formulae[P.first->second];
4815
4816 Cost CostBest(L, SE, TTI, AMK);
4817 Regs.clear();
4818 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4819 HardwareLoopProfitable);
4820 if (CostF.isLess(CostBest))
4821 std::swap(F, Best);
4822 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4823 dbgs() << "\n"
4824 " in favor of formula ";
4825 Best.print(dbgs()); dbgs() << '\n');
4826 }
4827#ifndef NDEBUG
4828 ChangedFormulae = true;
4829#endif
4830 LU.DeleteFormula(F);
4831 --FIdx;
4832 --NumForms;
4833 Any = true;
4834 }
4835
4836 // Now that we've filtered out some formulae, recompute the Regs set.
4837 if (Any)
4838 LU.RecomputeRegs(LUIdx, RegUses);
4839
4840 // Reset this to prepare for the next use.
4841 BestFormulae.clear();
4842 }
4843
4844 LLVM_DEBUG(if (ChangedFormulae) {
4845 dbgs() << "\n"
4846 "After filtering out undesirable candidates:\n";
4847 print_uses(dbgs());
4848 });
4849}
4850
4851/// Estimate the worst-case number of solutions the solver might have to
4852/// consider. It almost never considers this many solutions because it prune the
4853/// search space, but the pruning isn't always sufficient.
4854size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4855 size_t Power = 1;
4856 for (const LSRUse &LU : Uses) {
4857 size_t FSize = LU.Formulae.size();
4858 if (FSize >= ComplexityLimit) {
4859 Power = ComplexityLimit;
4860 break;
4861 }
4862 Power *= FSize;
4863 if (Power >= ComplexityLimit)
4864 break;
4865 }
4866 return Power;
4867}
4868
4869/// When one formula uses a superset of the registers of another formula, it
4870/// won't help reduce register pressure (though it may not necessarily hurt
4871/// register pressure); remove it to simplify the system.
4872void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4873 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4874 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4875
4876 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4877 "which use a superset of registers used by other "
4878 "formulae.\n");
4879
4880 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4881 LSRUse &LU = Uses[LUIdx];
4882 bool Any = false;
4883 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4884 Formula &F = LU.Formulae[i];
4885 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4886 continue;
4887 // Look for a formula with a constant or GV in a register. If the use
4888 // also has a formula with that same value in an immediate field,
4889 // delete the one that uses a register.
4891 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4892 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4893 Formula NewF = F;
4894 //FIXME: Formulas should store bitwidth to do wrapping properly.
4895 // See PR41034.
4896 NewF.BaseOffset =
4897 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4898 (uint64_t)C->getValue()->getSExtValue());
4899 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4900 (I - F.BaseRegs.begin()));
4901 if (LU.HasFormulaWithSameRegs(NewF)) {
4902 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4903 dbgs() << '\n');
4904 LU.DeleteFormula(F);
4905 --i;
4906 --e;
4907 Any = true;
4908 break;
4909 }
4910 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4911 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4912 if (!F.BaseGV) {
4913 Formula NewF = F;
4914 NewF.BaseGV = GV;
4915 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4916 (I - F.BaseRegs.begin()));
4917 if (LU.HasFormulaWithSameRegs(NewF)) {
4918 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4919 dbgs() << '\n');
4920 LU.DeleteFormula(F);
4921 --i;
4922 --e;
4923 Any = true;
4924 break;
4925 }
4926 }
4927 }
4928 }
4929 }
4930 if (Any)
4931 LU.RecomputeRegs(LUIdx, RegUses);
4932 }
4933
4934 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4935 }
4936}
4937
/// When there are many registers for expressions like A, A+1, A+2, etc.,
/// allocate a single register for them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by assuming that uses separated "
                "by a constant offset will use the same registers.\n");

  // This is especially useful for unrolled loops.

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (const Formula &F : LU.Formulae) {
      // Only formulae that are a constant offset from another use (and are
      // unscaled or trivially scaled) are candidates for merging.
      if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
        continue;

      // Find another use whose formula differs from this one only by the
      // constant offset.
      LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
      if (!LUThatHas)
        continue;

      if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
                              LU.Kind, LU.AccessTy))
        continue;

      LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');

      LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
      LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;

      // Transfer the fixups of LU to LUThatHas.
      for (LSRFixup &Fixup : LU.Fixups) {
        Fixup.Offset += F.BaseOffset;
        LUThatHas->pushFixup(Fixup);
        LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
      }

      // Delete formulae from the new use which are no longer legal.
      bool Any = false;
      for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
        // Note: this F shadows the outer loop's F.
        Formula &F = LUThatHas->Formulae[i];
        if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
                        LUThatHas->Kind, LUThatHas->AccessTy, F)) {
          LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
          LUThatHas->DeleteFormula(F);
          --i;
          --e;
          Any = true;
        }
      }

      if (Any)
        LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);

      // Delete the old use.
      DeleteUse(LU, LUIdx);
      --LUIdx;
      --NumUses;
      // LU (and its Formulae) is no longer valid after DeleteUse, so stop
      // iterating over it.
      break;
    }
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5004
5005/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5006/// we've done more filtering, as it may be able to find more formulae to
5007/// eliminate.
5008void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5009 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5010 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5011
5012 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5013 "undesirable dedicated registers.\n");
5014
5015 FilterOutUndesirableDedicatedRegisters();
5016
5017 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5018 }
5019}
5020
/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
/// Pick the best one and delete the others.
/// This narrowing heuristic is to keep as many formulae with different
/// Scale and ScaledReg pair as possible while narrowing the search space.
/// The benefit is that it is more likely to find out a better solution
/// from a formulae set with more Scale and ScaledReg variations than
/// a formulae set with the same Scale and ScaledReg. The picking winner
/// reg heuristic will often keep the formulae with the same Scale and
/// ScaledReg and filter others, and we want to avoid that if possible.
void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(
      dbgs() << "The search space is too complex.\n"
                "Narrowing the search space by choosing the best Formula "
                "from the Formulae with the same Scale and ScaledReg.\n");

  // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
  using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;

  BestFormulaeTy BestFormulae;
#ifndef NDEBUG
  bool ChangedFormulae = false;
#endif
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
               dbgs() << '\n');

    // Return true if Formula FA is better than Formula FB.
    auto IsBetterThan = [&](Formula &FA, Formula &FB) {
      // First we will try to choose the Formula with fewer new registers.
      // For a register used by current Formula, the more the register is
      // shared among LSRUses, the less we increase the register number
      // counter of the formula.
      size_t FARegNum = 0;
      for (const SCEV *Reg : FA.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FARegNum += (NumUses - UsedByIndices.count() + 1);
      }
      size_t FBRegNum = 0;
      for (const SCEV *Reg : FB.BaseRegs) {
        const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
        FBRegNum += (NumUses - UsedByIndices.count() + 1);
      }
      if (FARegNum != FBRegNum)
        return FARegNum < FBRegNum;

      // If the new register numbers are the same, choose the Formula with
      // less Cost.
      Cost CostFA(L, SE, TTI, AMK);
      Cost CostFB(L, SE, TTI, AMK);
      Regs.clear();
      CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      Regs.clear();
      CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
      return CostFA.isLess(CostFB);
    };

    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      // Only formulae that actually have a scaled register participate.
      if (!F.ScaledReg)
        continue;
      auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
      // First formula seen for this (ScaledReg, Scale) pair: keep it.
      if (P.second)
        continue;

      // Keep the better of the two at the recorded index; the loser (now in
      // F after the possible swap) is deleted below.
      Formula &Best = LU.Formulae[P.first->second];
      if (IsBetterThan(F, Best))
        std::swap(F, Best);
      LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
                 dbgs() << "\n"
                           " in favor of formula ";
                 Best.print(dbgs()); dbgs() << '\n');
#ifndef NDEBUG
      ChangedFormulae = true;
#endif
      LU.DeleteFormula(F);
      --FIdx;
      --NumForms;
      Any = true;
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Reset this to prepare for the next use.
    BestFormulae.clear();
  }

  LLVM_DEBUG(if (ChangedFormulae) {
    dbgs() << "\n"
              "After filtering out undesirable candidates:\n";
    print_uses(dbgs());
  });
}
5122
/// If we are over the complexity limit, filter out any post-inc preferring
/// variables to only post-inc values.
void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
  // Only applies when the target prefers post-indexed addressing.
  if (AMK != TTI::AMK_PostIndexed)
    return;
  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
    return;

  LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
                       "Narrowing the search space by choosing the lowest "
                       "register Formula for PostInc Uses.\n");

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];

    // Only address uses for which the target has a legal post-inc load or
    // store form are filtered.
    if (LU.Kind != LSRUse::Address)
      continue;
    if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
        !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
      continue;

    // Find the minimum register count over all formulae for this use.
    size_t MinRegs = std::numeric_limits<size_t>::max();
    for (const Formula &F : LU.Formulae)
      MinRegs = std::min(F.getNumRegs(), MinRegs);

    // Delete every formula that needs more than the minimum number of
    // registers; indices are adjusted because DeleteFormula shrinks the list.
    bool Any = false;
    for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
         ++FIdx) {
      Formula &F = LU.Formulae[FIdx];
      if (F.getNumRegs() > MinRegs) {
        LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
                   dbgs() << "\n");
        LU.DeleteFormula(F);
        --FIdx;
        --NumForms;
        Any = true;
      }
    }
    if (Any)
      LU.RecomputeRegs(LUIdx, RegUses);

    // Stop once the search space is small enough again.
    if (EstimateSearchSpaceComplexity() < ComplexityLimit)
      break;
  }

  LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
}
5170
5171/// The function delete formulas with high registers number expectation.
5172/// Assuming we don't know the value of each formula (already delete
5173/// all inefficient), generate probability of not selecting for each
5174/// register.
5175/// For example,
5176/// Use1:
5177/// reg(a) + reg({0,+,1})
5178/// reg(a) + reg({-1,+,1}) + 1
5179/// reg({a,+,1})
5180/// Use2:
5181/// reg(b) + reg({0,+,1})
5182/// reg(b) + reg({-1,+,1}) + 1
5183/// reg({b,+,1})
5184/// Use3:
5185/// reg(c) + reg(b) + reg({0,+,1})
5186/// reg(c) + reg({b,+,1})
5187///
5188/// Probability of not selecting
5189/// Use1 Use2 Use3
5190/// reg(a) (1/3) * 1 * 1
5191/// reg(b) 1 * (1/3) * (1/2)
5192/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5193/// reg({-1,+,1}) (2/3) * (2/3) * 1
5194/// reg({a,+,1}) (2/3) * 1 * 1
5195/// reg({b,+,1}) 1 * (2/3) * (2/3)
5196/// reg(c) 1 * 1 * 0
5197///
5198/// Now count registers number mathematical expectation for each formula:
5199/// Note that for each use we exclude probability if not selecting for the use.
5200/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
/// probability 1/3 of not selecting for Use1).
5202/// Use1:
5203/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5204/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5205/// reg({a,+,1}) 1
5206/// Use2:
5207/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5208/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5209/// reg({b,+,1}) 2/3
5210/// Use3:
5211/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5212/// reg(c) + reg({b,+,1}) 1 + 2/3
5213void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5214 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5215 return;
5216 // Ok, we have too many of formulae on our hands to conveniently handle.
5217 // Use a rough heuristic to thin out the list.
5218
5219 // Set of Regs wich will be 100% used in final solution.
5220 // Used in each formula of a solution (in example above this is reg(c)).
5221 // We can skip them in calculations.
5222 SmallPtrSet<const SCEV *, 4> UniqRegs;
5223 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5224
5225 // Map each register to probability of not selecting
5226 DenseMap <const SCEV *, float> RegNumMap;
5227 for (const SCEV *Reg : RegUses) {
5228 if (UniqRegs.count(Reg))
5229 continue;
5230 float PNotSel = 1;
5231 for (const LSRUse &LU : Uses) {
5232 if (!LU.Regs.count(Reg))
5233 continue;
5234 float P = LU.getNotSelectedProbability(Reg);
5235 if (P != 0.0)
5236 PNotSel *= P;
5237 else
5238 UniqRegs.insert(Reg);
5239 }
5240 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5241 }
5242
5243 LLVM_DEBUG(
5244 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5245
5246 // Delete formulas where registers number expectation is high.
5247 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5248 LSRUse &LU = Uses[LUIdx];
5249 // If nothing to delete - continue.
5250 if (LU.Formulae.size() < 2)
5251 continue;
5252 // This is temporary solution to test performance. Float should be
5253 // replaced with round independent type (based on integers) to avoid
5254 // different results for different target builds.
5255 float FMinRegNum = LU.Formulae[0].getNumRegs();
5256 float FMinARegNum = LU.Formulae[0].getNumRegs();
5257 size_t MinIdx = 0;
5258 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5259 Formula &F = LU.Formulae[i];
5260 float FRegNum = 0;
5261 float FARegNum = 0;
5262 for (const SCEV *BaseReg : F.BaseRegs) {
5263 if (UniqRegs.count(BaseReg))
5264 continue;
5265 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5266 if (isa<SCEVAddRecExpr>(BaseReg))
5267 FARegNum +=
5268 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5269 }
5270 if (const SCEV *ScaledReg = F.ScaledReg) {
5271 if (!UniqRegs.count(ScaledReg)) {
5272 FRegNum +=
5273 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5274 if (isa<SCEVAddRecExpr>(ScaledReg))
5275 FARegNum +=
5276 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5277 }
5278 }
5279 if (FMinRegNum > FRegNum ||
5280 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5281 FMinRegNum = FRegNum;
5282 FMinARegNum = FARegNum;
5283 MinIdx = i;
5284 }
5285 }
5286 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5287 dbgs() << " with min reg num " << FMinRegNum << '\n');
5288 if (MinIdx != 0)
5289 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5290 while (LU.Formulae.size() != 1) {
5291 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5292 dbgs() << '\n');
5293 LU.Formulae.pop_back();
5294 }
5295 LU.RecomputeRegs(LUIdx, RegUses);
5296 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5297 Formula &F = LU.Formulae[0];
5298 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5299 // When we choose the formula, the regs become unique.
5300 UniqRegs.insert_range(F.BaseRegs);
5301 if (F.ScaledReg)
5302 UniqRegs.insert(F.ScaledReg);
5303 }
5304 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5305}
5306
// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
// whether the addressing offset +C would be legal where the negative offset -C
// is not.
// NOTE(review): the function signature line and part of the type/loop guard
// are elided in this excerpt; the visible code assumes Best and Reg are add
// recurrences of the same loop — confirm against the full source.
                                       ScalarEvolution &SE, const SCEV *Best,
                                       const SCEV *Reg,
                                       MemAccessTy AccessType) {
  if (Best->getType() != Reg->getType() ||
      cast<SCEVAddRecExpr>(Best)->getLoop() !=
          cast<SCEVAddRecExpr>(Reg)->getLoop()))
    return false;
  // No known constant difference between the two — nothing to compare.
  std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
  if (!Diff)
    return false;

  // Best is "simpler" only when +C folds into the addressing mode while the
  // mirrored -C offset does not.
  return TTI.isLegalAddressingMode(
             AccessType.MemTy, /*BaseGV=*/nullptr,
             /*BaseOffset=*/Diff->getSExtValue(),
             /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
         !TTI.isLegalAddressingMode(
             AccessType.MemTy, /*BaseGV=*/nullptr,
             /*BaseOffset=*/-Diff->getSExtValue(),
             /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
}
5332
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
  // With all other options exhausted, loop until the system is simple
  // enough to handle.
  SmallPtrSet<const SCEV *, 4> Taken;
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many of formulae on our hands to conveniently handle.
    // Use a rough heuristic to thin out the list.
    LLVM_DEBUG(dbgs() << "The search space is too complex.\n");

    // Pick the register which is used by the most LSRUses, which is likely
    // to be a good reuse register candidate.
    const SCEV *Best = nullptr;
    unsigned BestNum = 0;
    for (const SCEV *Reg : RegUses) {
      // Registers chosen on a previous iteration are no longer candidates.
      if (Taken.count(Reg))
        continue;
      if (!Best) {
        Best = Reg;
        BestNum = RegUses.getUsedByIndices(Reg).count();
      } else {
        unsigned Count = RegUses.getUsedByIndices(Reg).count();
        if (Count > BestNum) {
          Best = Reg;
          BestNum = Count;
        }

        // If the scores are the same, but the Reg is simpler for the target
        // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
        // handle +C but not -C), opt for the simpler formula.
        // NOTE(review): the call to the simplicity predicate is elided in
        // this excerpt — confirm against the full source.
        if (Count == BestNum) {
          int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
          if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
                                         Uses[LUIdx].AccessTy)) {
            Best = Reg;
            BestNum = Count;
          }
        }
      }
    }
    assert(Best && "Failed to find best LSRUse candidate");

    LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                      << " will yield profitable reuse.\n");
    Taken.insert(Best);

    // In any use with formulae which references this register, delete formulae
    // which don't reference it.
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      if (!LU.Regs.count(Best)) continue;

      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (!F.referencesReg(Best)) {
          LLVM_DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          LU.DeleteFormula(F);
          // DeleteFormula swaps in the last formula, so shrink the bound and
          // revisit the current index.
          --e;
          --i;
          Any = true;
          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
          continue;
        }
      }

      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }

    LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
  }
}
5409
/// If there are an extraordinary number of formulae to choose from, use some
/// rough heuristics to prune down the number of formulae. This keeps the main
/// solver from taking an extraordinary amount of time in some worst-case
/// scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  // Run the conservative, behavior-safe pruning passes first; finish with
  // one of the two aggressive register-guessing heuristics, selected by the
  // LSRExpNarrow flag.
  NarrowSearchSpaceByDetectingSupersets();
  NarrowSearchSpaceByCollapsingUnrolledCode();
  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
    NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
  NarrowSearchSpaceByFilterPostInc();
  if (LSRExpNarrow)
    NarrowSearchSpaceByDeletingCostlyFormulas();
  else
    NarrowSearchSpaceByPickingWinnerRegs();
}
5426
/// This is the recursive solver: pick a formula for the use at depth
/// Workspace.size(), recurse into the remaining uses, and record the best
/// complete assignment seen so far in Solution/SolutionCost, pruning any
/// partial assignment whose running cost is already worse.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost, and then compare. compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  // Workspace.size() is the recursion depth: the next use to decide on.
  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (const SCEV *S : CurRegs)
    if (LU.Regs.count(S))
      ReqRegs.insert(S);

  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost(L, SE, TTI, AMK);
  for (const Formula &F : LU.Formulae) {
    // Ignore formulae which may not be ideal in terms of register reuse of
    // ReqRegs.  The formula should use all required registers before
    // introducing new ones.
    // This can sometimes (notably when trying to favour postinc) lead to
    // sub-optimial decisions. There it is best left to the cost modelling to
    // get correct.
    if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
      int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
      for (const SCEV *Reg : ReqRegs) {
        if ((F.ScaledReg && F.ScaledReg == Reg) ||
            is_contained(F.BaseRegs, Reg)) {
          --NumReqRegsToFind;
          if (NumReqRegsToFind == 0)
            break;
        }
      }
      if (NumReqRegsToFind != 0) {
        // If none of the formulae satisfied the required registers, then we could
        // clear ReqRegs and try again. Currently, we simply give up in this case.
        continue;
      }
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
    if (NewCost.isLess(SolutionCost)) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        // More uses remain: recurse with the extended partial solution.
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        // NOTE(review): recording single-register first-level formulas in
        // VisitedRegs presumably feeds RateFormula's duplicate-register
        // heuristics on later siblings — confirm against Cost::RateFormula.
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        // All uses decided and this assignment beats the best so far.
        LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
                   dbgs() << ".\nRegs:\n";
                   for (const SCEV *S : NewRegs) dbgs()
                      << "- " << *S << "\n";
                   dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  }
}
5507
/// Choose one formula from each use. Return the results in the given Solution
/// vector. Leaves Solution empty when no satisfactory (or profitable)
/// assignment exists.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  // Start from a "lost" cost so the first complete candidate always wins.
  Cost SolutionCost(L, SE, TTI, AMK);
  SolutionCost.Lose();
  Cost CurCost(L, SE, TTI, AMK);
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  // SolveRecurse does all the work.
  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);
  if (Solution.empty()) {
    LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
    return;
  }

  // Ok, we've now made all our decisions.
  LLVM_DEBUG(dbgs() << "\n"
                       "The chosen solution requires ";
             SolutionCost.print(dbgs()); dbgs() << ":\n";
             for (size_t i = 0, e = Uses.size(); i != e; ++i) {
               dbgs() << "  ";
               Uses[i].print(dbgs());
               dbgs() << "\n"
                         "    ";
               Solution[i]->print(dbgs());
               dbgs() << '\n';
             });

  assert(Solution.size() == Uses.size() && "Malformed solution!");

  // Decide whether an unprofitable solution may be dropped: an explicit
  // command-line setting wins; otherwise defer to the target default.
  // NOTE(review): the switch header and the BOU_UNSET return are elided in
  // this excerpt — confirm against the full source.
  const bool EnableDropUnprofitableSolution = [&] {
    case cl::BOU_TRUE:
      return true;
    case cl::BOU_FALSE:
      return false;
    case cl::BOU_UNSET:
    }
    llvm_unreachable("Unhandled cl::boolOrDefault enum");
  }();

  // If the unoptimized baseline is cheaper than what we picked, optionally
  // discard the LSR solution entirely.
  if (BaselineCost.isLess(SolutionCost)) {
    if (!EnableDropUnprofitableSolution)
      LLVM_DEBUG(
          dbgs() << "Baseline is more profitable than chosen solution, "
                    "add option 'lsr-drop-solution' to drop LSR solution.\n");
    else {
      LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
                           "solution, dropping LSR solution.\n";);
      Solution.clear();
    }
  }
}
5566
/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
/// as we can go while still being dominated by the input positions. This helps
/// canonicalize the insert position, which encourages sharing.
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
                                 const SmallVectorImpl<Instruction *> &Inputs)
    const {
  Instruction *Tentative = &*IP;
  while (true) {
    bool AllDominate = true;
    Instruction *BetterPos = nullptr;
    // Don't bother attempting to insert before a catchswitch, their basic block
    // cannot have other non-PHI instructions.
    if (isa<CatchSwitchInst>(Tentative))
      return IP;

    // Verify every input dominates the tentative position; meanwhile look for
    // a spot inside the same block, just past the latest dominating input.
    for (Instruction *Inst : Inputs) {
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      // Attempt to find an insert position in the middle of the block,
      // instead of at the end, so that it can be used for other expansions.
      if (Tentative->getParent() == Inst->getParent() &&
          (!BetterPos || !DT.dominates(Inst, BetterPos)))
        BetterPos = &*std::next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos->getIterator();
    else
      IP = Tentative->getIterator();

    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;

    // Walk up the immediate-dominator chain to pick the next candidate block.
    BasicBlock *IDom;
    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
      if (!Rung) return IP;
      Rung = Rung->getIDom();
      if (!Rung) return IP;
      IDom = Rung->getBlock();

      // Don't climb into a loop though.
      const Loop *IDomLoop = LI.getLoopFor(IDom);
      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
      if (IDomDepth <= IPLoopDepth &&
          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
        break;
    }

    Tentative = IDom->getTerminator();
  }

  return IP;
}
5624
5625/// Determine an input position which will be dominated by the operands and
5626/// which will dominate the result.
5627BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5628 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5629 // Collect some instructions which must be dominated by the
5630 // expanding replacement. These must be dominated by any operands that
5631 // will be required in the expansion.
5632 SmallVector<Instruction *, 4> Inputs;
5633 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5634 Inputs.push_back(I);
5635 if (LU.Kind == LSRUse::ICmpZero)
5636 if (Instruction *I =
5637 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5638 Inputs.push_back(I);
5639 if (LF.PostIncLoops.count(L)) {
5640 if (LF.isUseFullyOutsideLoop(L))
5641 Inputs.push_back(L->getLoopLatch()->getTerminator());
5642 else
5643 Inputs.push_back(IVIncInsertPos);
5644 }
5645 // The expansion must also be dominated by the increment positions of any
5646 // loops it for which it is using post-inc mode.
5647 for (const Loop *PIL : LF.PostIncLoops) {
5648 if (PIL == L) continue;
5649
5650 // Be dominated by the loop exit.
5651 SmallVector<BasicBlock *, 4> ExitingBlocks;
5652 PIL->getExitingBlocks(ExitingBlocks);
5653 if (!ExitingBlocks.empty()) {
5654 BasicBlock *BB = ExitingBlocks[0];
5655 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5656 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5657 Inputs.push_back(BB->getTerminator());
5658 }
5659 }
5660
5661 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5662 "Insertion point must be a normal instruction");
5663
5664 // Then, climb up the immediate dominator tree as far as we can go while
5665 // still being dominated by the input positions.
5666 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5667
5668 // Don't insert instructions before PHI nodes.
5669 while (isa<PHINode>(IP)) ++IP;
5670
5671 // Ignore landingpad instructions.
5672 while (IP->isEHPad()) ++IP;
5673
5674 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5675 // IP consistent across expansions and allows the previously inserted
5676 // instructions to be reused by subsequent expansion.
5677 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5678 ++IP;
5679
5680 return IP;
5681}
5682
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding"). Returns the fully expanded value; instructions made
/// dead by the expansion are queued on DeadInsts.
Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
                           const Formula &F, BasicBlock::iterator IP,
                           SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
  // A rigid formula stands for the original value itself; nothing to expand.
  if (LU.RigidFormula)
    return LF.OperandValToReplace;

  // Determine an input position which will be dominated by the operands and
  // which will dominate the result.
  IP = AdjustInsertPositionForExpand(IP, LF, LU);
  Rewriter.setInsertPoint(&*IP);

  // Inform the Rewriter if we have a post-increment use, so that it can
  // perform an advantageous expansion.
  Rewriter.setPostInc(LF.PostIncLoops);

  // This is the type that the user actually needs.
  Type *OpTy = LF.OperandValToReplace->getType();
  // This will be the type that we'll initially expand to.
  Type *Ty = F.getType();
  if (!Ty)
    // No type known; just expand directly to the ultimate type.
    Ty = OpTy;
  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
    // Expand directly to the ultimate type if it's the right size.
    Ty = OpTy;
  // This is the type to do integer arithmetic in.
  Type *IntTy = SE.getEffectiveSCEVType(Ty);

  // Build up a list of operands to add together to form the full base.
  // NOTE(review): the declaration of the Ops vector is elided in this
  // excerpt — confirm against the full source.

  // Expand the BaseRegs portion.
  for (const SCEV *Reg : F.BaseRegs) {
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = nullptr;
  if (F.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);

    if (LU.Kind == LSRUse::ICmpZero) {
      // Expand ScaleReg as if it was part of the base regs.
      if (F.Scale == 1)
        Ops.push_back(
            SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
      else {
        // An interesting way of "folding" with an icmp is to use a negated
        // scale, which we'll implement by inserting it into the other operand
        // of the icmp.
        assert(F.Scale == -1 &&
               "The only scale supported by ICmpZero uses is -1!");
        ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
      }
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.

      // Flush the operand list to suppress SCEVExpander hoisting address modes.
      // Unless the addressing mode will not be folded.
      if (!Ops.empty() && LU.Kind == LSRUse::Address &&
          isAMCompletelyFolded(TTI, LU, F)) {
        Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
        Ops.clear();
        Ops.push_back(SE.getUnknown(FullV));
      }
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
      if (F.Scale != 1)
        ScaledS =
            SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
      Ops.push_back(ScaledS);
    }
  }

  // Expand the GV portion.
  if (F.BaseGV) {
    // Flush the operand list to suppress SCEVExpander hoisting.
    if (!Ops.empty()) {
      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
      Ops.clear();
      Ops.push_back(SE.getUnknown(FullV));
    }
    Ops.push_back(SE.getUnknown(F.BaseGV));
  }

  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
  // unfolded offsets. LSR assumes they both live next to their uses.
  if (!Ops.empty()) {
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
  // out at this point, or should we generate a SCEV adding together mixed
  // offsets?
  assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
         "Expanding mismatched offsets\n");
  // Expand the immediate portion.
  Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
  if (Offset.isNonZero()) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
      if (!ICmpScaledV)
        ICmpScaledV =
            ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
      else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
    }
  }

  // Expand the unfolded offset portion.
  Immediate UnfoldedOffset = F.UnfoldedOffset;
  if (UnfoldedOffset.isNonZero()) {
    // Just add the immediate values.
    Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getConstant(IntTy, 0) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty);

  // We're done expanding now, so reset the rewriter.
  Rewriter.clearPostInc();

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  // NOTE(review): a few lines constructing the cast / constant are elided in
  // this excerpt — confirm against the full source.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
      DeadInsts.emplace_back(OperandIsInstr);
    assert(!F.BaseGV && "ICmp does not support folding a global value and "
                        "a scale at the same time!");
    if (F.Scale == -1) {
      if (ICmpScaledV->getType() != OpTy) {
            CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
            ICmpScaledV, OpTy, "tmp", CI->getIterator());
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      // A scale of 1 means that the scale has been expanded as part of the
      // base regs.
      assert((F.Scale == 0 || F.Scale == 1) &&
             "ICmp does not support folding a global value and "
             "a scale at the same time!");
                                           -(uint64_t)Offset.getFixedValue());
      if (C->getType() != OpTy) {
            CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
            CI->getDataLayout());
        assert(C && "Cast of ConstantInt should have folded");
      }

      CI->setOperand(1, C);
    }
  }

  return FullV;
}
5865
/// Helper for Rewrite. PHI nodes are special because the use of their operands
/// effectively happens in their predecessor blocks, so the expression may need
/// to be expanded in multiple places.
void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
                                const LSRFixup &LF, const Formula &F,
                                SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
  // Cache one expansion per predecessor block so multiple incoming edges from
  // the same block share a single expanded value.
  DenseMap<BasicBlock *, Value *> Inserted;

  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
      bool needUpdateFixups = false;
      BasicBlock *BB = PN->getIncomingBlock(i);

      // If this is a critical edge, split the edge so that we do not insert
      // the code on all predecessor/successor paths.  We do this unless this
      // is the canonical backedge for this loop, which complicates post-inc
      // users.
      // NOTE(review): part of this condition (terminator-kind checks) is
      // elided in this excerpt — confirm against the full source.
      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
        BasicBlock *Parent = PN->getParent();
        Loop *PNLoop = LI.getLoopFor(Parent);
        if (!PNLoop || Parent != PNLoop->getHeader()) {
          // Split the critical edge.
          BasicBlock *NewBB = nullptr;
          if (!Parent->isLandingPad()) {
            NewBB =
                SplitCriticalEdge(BB, Parent,
                                  CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
                                      .setMergeIdenticalEdges()
                                      .setKeepOneInputPHIs());
          } else {
            DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
            SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
            NewBB = NewBBs[0];
          }
          // If NewBB==NULL, then SplitCriticalEdge refused to split because all
          // phi predecessors are identical. The simple thing to do is skip
          // splitting in this case rather than complicate the API.
          if (NewBB) {
            // If PN is outside of the loop and BB is in the loop, we want to
            // move the block to be immediately before the PHI block, not
            // immediately after BB.
            if (L->contains(BB) && !L->contains(PN))
              NewBB->moveBefore(PN->getParent());

            // Splitting the edge can reduce the number of PHI entries we have.
            e = PN->getNumIncomingValues();
            BB = NewBB;
            i = PN->getBasicBlockIndex(BB);

            needUpdateFixups = true;
          }
        }
      }

      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
          Inserted.try_emplace(BB);
      if (!Pair.second)
        // Already expanded for this predecessor: reuse the cached value.
        PN->setIncomingValue(i, Pair.first->second);
      else {
        // Expand at the end of the predecessor block.
        Value *FullV =
            Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);

        // If this is reuse-by-noop-cast, insert the noop cast.
        Type *OpTy = LF.OperandValToReplace->getType();
        if (FullV->getType() != OpTy)
          FullV = CastInst::Create(
              CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
              LF.OperandValToReplace->getType(), "tmp",
              BB->getTerminator()->getIterator());

        // If the incoming block for this value is not in the loop, it means the
        // current PHI is not in a loop exit, so we must create a LCSSA PHI for
        // the inserted value.
        if (auto *I = dyn_cast<Instruction>(FullV))
          if (L->contains(I) && !L->contains(BB))
            InsertedNonLCSSAInsts.insert(I);

        PN->setIncomingValue(i, FullV);
        Pair.first->second = FullV;
      }

      // If LSR splits critical edge and phi node has other pending
      // fixup operands, we need to update those pending fixups. Otherwise
      // formulae will not be implemented completely and some instructions
      // will not be eliminated.
      if (needUpdateFixups) {
        for (LSRUse &LU : Uses)
          for (LSRFixup &Fixup : LU.Fixups)
            // If fixup is supposed to rewrite some operand in the phi
            // that was just updated, it may be already moved to
            // another phi node. Such fixup requires update.
            if (Fixup.UserInst == PN) {
              // Check if the operand we try to replace still exists in the
              // original phi.
              bool foundInOriginalPHI = false;
              for (const auto &val : PN->incoming_values())
                if (val == Fixup.OperandValToReplace) {
                  foundInOriginalPHI = true;
                  break;
                }

              // If fixup operand found in original PHI - nothing to do.
              if (foundInOriginalPHI)
                continue;

              // Otherwise it might be moved to another PHI and requires update.
              // If fixup operand not found in any of the incoming blocks that
              // means we have already rewritten it - nothing to do.
              for (const auto &Block : PN->blocks())
                for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
                     ++I) {
                  PHINode *NewPN = cast<PHINode>(I);
                  for (const auto &val : NewPN->incoming_values())
                    if (val == Fixup.OperandValToReplace)
                      Fixup.UserInst = NewPN;
                }
            }
      }
    }
}
5989
5990/// Emit instructions for the leading candidate expression for this LSRUse (this
5991/// is called "expanding"), and update the UserInst to reference the newly
5992/// expanded value.
5993void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5994 const Formula &F,
5995 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
5996 // First, find an insertion point that dominates UserInst. For PHI nodes,
5997 // find the nearest block which dominates all the relevant uses.
5998 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
5999 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6000 } else {
6001 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6002
6003 // If this is reuse-by-noop-cast, insert the noop cast.
6004 Type *OpTy = LF.OperandValToReplace->getType();
6005 if (FullV->getType() != OpTy) {
6006 Instruction *Cast =
6007 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6008 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6009 FullV = Cast;
6010 }
6011
6012 // Update the user. ICmpZero is handled specially here (for now) because
6013 // Expand may have updated one of the operands of the icmp already, and
6014 // its new value may happen to be equal to LF.OperandValToReplace, in
6015 // which case doing replaceUsesOfWith leads to replacing both operands
6016 // with the same value. TODO: Reorganize this.
6017 if (LU.Kind == LSRUse::ICmpZero)
6018 LF.UserInst->setOperand(0, FullV);
6019 else
6020 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6021 }
6022
6023 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6024 DeadInsts.emplace_back(OperandIsInstr);
6025}
6026
6027// Trying to hoist the IVInc to loop header if all IVInc users are in
6028// the loop header. It will help backend to generate post index load/store
6029// when the latch block is different from loop header block.
6030static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6031 const LSRUse &LU, Instruction *IVIncInsertPos,
6032 Loop *L) {
6033 if (LU.Kind != LSRUse::Address)
6034 return false;
6035
6036 // For now this code do the conservative optimization, only work for
6037 // the header block. Later we can hoist the IVInc to the block post
6038 // dominate all users.
6039 BasicBlock *LHeader = L->getHeader();
6040 if (IVIncInsertPos->getParent() == LHeader)
6041 return false;
6042
6043 if (!Fixup.OperandValToReplace ||
6044 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6045 Instruction *UI = cast<Instruction>(U);
6046 return UI->getParent() != LHeader;
6047 }))
6048 return false;
6049
6050 Instruction *I = Fixup.UserInst;
6051 Type *Ty = I->getType();
6052 return (isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6053 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty));
6054}
6055
6056/// Rewrite all the fixup locations with new values, following the chosen
6057/// solution.
6058void LSRInstance::ImplementSolution(
6059 const SmallVectorImpl<const Formula *> &Solution) {
6060 // Keep track of instructions we may have made dead, so that
6061 // we can remove them after we are done working.
  // NOTE(review): the declaration of the dead-instruction list (used below as
  // `DeadInsts`) appears to be missing from this rendering — confirm against
  // the original source.
6063
6064 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6065 for (const IVChain &Chain : IVChainVec) {
6066 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6067 Rewriter.setChainedPhi(PN);
6068 }
6069
6070 // Expand the new value definitions and update the users.
6071 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6072 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
  // Prefer the header terminator as the IV increment insertion point when
  // the increment can legally be hoisted; this enables post-indexed
  // load/store formation in the backend.
6073 Instruction *InsertPos =
6074 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6075 ? L->getHeader()->getTerminator()
6076 : IVIncInsertPos;
6077 Rewriter.setIVIncInsertPos(L, InsertPos);
6078 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6079 Changed = true;
6080 }
6081
  // Any instructions the expander inserted that break LCSSA form are fixed
  // up here in one batch.
6082 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6083 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6084
6085 for (const IVChain &Chain : IVChainVec) {
6086 GenerateIVChain(Chain, DeadInsts);
6087 Changed = true;
6088 }
6089
  // Record the IVs the rewriter created (still-live ones only) so later
  // phases can consult them.
6090 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6091 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6092 ScalarEvolutionIVs.push_back(IV);
6093
6094 // Clean up after ourselves. This must be done before deleting any
6095 // instructions.
6096 Rewriter.clear();
6097
  // NOTE(review): the start of this call (presumably the dead-instruction
  // cleanup taking `&TLI` and `MSSAU`) is missing from this rendering —
  // confirm against the original source.
6099 &TLI, MSSAU);
6100
6101 // In our cost analysis above, we assume that each addrec consumes exactly
6102 // one register, and arrange to have increments inserted just before the
6103 // latch to maximimize the chance this is true. However, if we reused
6104 // existing IVs, we now need to move the increments to match our
6105 // expectations. Otherwise, our cost modeling results in us having a
6106 // chosen a non-optimal result for the actual schedule. (And yes, this
6107 // scheduling decision does impact later codegen.)
6108 for (PHINode &PN : L->getHeader()->phis()) {
6109 BinaryOperator *BO = nullptr;
6110 Value *Start = nullptr, *Step = nullptr;
6111 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6112 continue;
6113
6114 switch (BO->getOpcode()) {
6115 case Instruction::Sub:
6116 if (BO->getOperand(0) != &PN)
6117 // sub is non-commutative - match handling elsewhere in LSR
6118 continue;
6119 break;
6120 case Instruction::Add:
6121 break;
6122 default:
6123 continue;
6124 };
6125
6126 if (!isa<Constant>(Step))
6127 // If not a constant step, might increase register pressure
6128 // (We assume constants have been canonicalized to RHS)
6129 continue;
6130
6131 if (BO->getParent() == IVIncInsertPos->getParent())
6132 // Only bother moving across blocks. Isel can handle block local case.
6133 continue;
6134
6135 // Can we legally schedule inc at the desired point?
6136 if (!llvm::all_of(BO->uses(),
6137 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6138 continue;
6139 BO->moveBefore(IVIncInsertPos->getIterator());
6140 Changed = true;
6141 }
6142
6143
6144}
6145
/// Construct an LSRInstance and immediately run the full LSR pipeline on
/// loop \p L: gather IV users, generate and filter candidate formulae, solve
/// for the lowest-cost solution, and rewrite the loop if one is found.
/// Bails out early (leaving `Changed` untouched) whenever the loop is not in
/// LoopSimplify form, has no/too many IV users, or is not innermost.
6146LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6147 DominatorTree &DT, LoopInfo &LI,
6148 const TargetTransformInfo &TTI, AssumptionCache &AC,
6149 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6150 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6151 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
  // NOTE(review): the true-branch of this conditional (selecting the
  // command-line override) is missing from this rendering — confirm against
  // the original source.
6153 : TTI.getPreferredAddressingMode(L, &SE)),
6154 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6155 BaselineCost(L, SE, TTI, AMK) {
6156 // If LoopSimplify form is not available, stay out of trouble.
6157 if (!L->isLoopSimplifyForm())
6158 return;
6159
6160 // If there's no interesting work to be done, bail early.
6161 if (IU.empty()) return;
6162
6163 // If there's too much analysis to be done, bail early. We won't be able to
6164 // model the problem anyway.
6165 unsigned NumUsers = 0;
6166 for (const IVStrideUse &U : IU) {
6167 if (++NumUsers > MaxIVUsers) {
6168 (void)U;
6169 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6170 << "\n");
6171 return;
6172 }
6173 // Bail out if we have a PHI on an EHPad that gets a value from a
6174 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6175 // no good place to stick any instructions.
6176 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6177 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6178 if (isa<FuncletPadInst>(FirstNonPHI) ||
6179 isa<CatchSwitchInst>(FirstNonPHI))
6180 for (BasicBlock *PredBB : PN->blocks())
6181 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6182 return;
6183 }
6184 }
6185
6186 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6187 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6188 dbgs() << ":\n");
6189
6190 // Check if we expect this loop to use a hardware loop instruction, which will
6191 // be used when calculating the costs of formulas.
6192 HardwareLoopInfo HWLoopInfo(L);
6193 HardwareLoopProfitable =
6194 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6195
6196 // Configure SCEVExpander already now, so the correct mode is used for
6197 // isSafeToExpand() checks.
6198#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6199 Rewriter.setDebugType(DEBUG_TYPE);
6200#endif
6201 Rewriter.disableCanonicalMode();
6202 Rewriter.enableLSRMode();
6203
6204 // First, perform some low-level loop optimizations.
6205 OptimizeShadowIV();
6206 OptimizeLoopTermCond();
6207
6208 // If loop preparation eliminates all interesting IV users, bail.
6209 if (IU.empty()) return;
6210
6211 // Skip nested loops until we can model them better with formulae.
6212 if (!L->isInnermost()) {
6213 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6214 return;
6215 }
6216
6217 // Start collecting data and preparing for the solver.
6218 // If number of registers is not the major cost, we cannot benefit from the
6219 // current profitable chain optimization which is based on number of
6220 // registers.
6221 // FIXME: add profitable chain optimization for other kinds major cost, for
6222 // example number of instructions.
6223 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6224 CollectChains();
6225 CollectInterestingTypesAndFactors();
6226 CollectFixupsAndInitialFormulae();
6227 CollectLoopInvariantFixupsAndFormulae();
6228
6229 if (Uses.empty())
6230 return;
6231
6232 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6233 print_uses(dbgs()));
6234 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6235 BaselineCost.print(dbgs()); dbgs() << "\n");
6236
6237 // Now use the reuse data to generate a bunch of interesting ways
6238 // to formulate the values needed for the uses.
6239 GenerateAllReuseFormulae();
6240
6241 FilterOutUndesirableDedicatedRegisters();
6242 NarrowSearchSpaceUsingHeuristics();
6243
  // NOTE(review): the declaration of `Solution` (one chosen Formula per use)
  // appears to be missing from this rendering — confirm against the original
  // source.
6245 Solve(Solution);
6246
6247 // Release memory that is no longer needed.
6248 Factors.clear();
6249 Types.clear();
6250 RegUses.clear();
6251
6252 if (Solution.empty())
6253 return;
6254
6255#ifndef NDEBUG
6256 // Formulae should be legal.
6257 for (const LSRUse &LU : Uses) {
6258 for (const Formula &F : LU.Formulae)
6259 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6260 F) && "Illegal formula generated!");
6261 };
6262#endif
6263
6264 // Now that we've decided what we want, make it so.
6265 ImplementSolution(Solution);
6266}
6267
6268#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6269void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6270 if (Factors.empty() && Types.empty()) return;
6271
6272 OS << "LSR has identified the following interesting factors and types: ";
6273 bool First = true;
6274
6275 for (int64_t Factor : Factors) {
6276 if (!First) OS << ", ";
6277 First = false;
6278 OS << '*' << Factor;
6279 }
6280
6281 for (Type *Ty : Types) {
6282 if (!First) OS << ", ";
6283 First = false;
6284 OS << '(' << *Ty << ')';
6285 }
6286 OS << '\n';
6287}
6288
6289void LSRInstance::print_fixups(raw_ostream &OS) const {
6290 OS << "LSR is examining the following fixup sites:\n";
6291 for (const LSRUse &LU : Uses)
6292 for (const LSRFixup &LF : LU.Fixups) {
6293 dbgs() << " ";
6294 LF.print(OS);
6295 OS << '\n';
6296 }
6297}
6298
6299void LSRInstance::print_uses(raw_ostream &OS) const {
6300 OS << "LSR is examining the following uses:\n";
6301 for (const LSRUse &LU : Uses) {
6302 dbgs() << " ";
6303 LU.print(OS);
6304 OS << '\n';
6305 for (const Formula &F : LU.Formulae) {
6306 OS << " ";
6307 F.print(OS);
6308 OS << '\n';
6309 }
6310 }
6311}
6312
/// Print the complete LSR state (factors/types, fixup sites, and uses with
/// their formulae) to \p OS, in that order.
6313void LSRInstance::print(raw_ostream &OS) const {
6314 print_factors_and_types(OS);
6315 print_fixups(OS);
6316 print_uses(OS);
6317}
6318
6319LLVM_DUMP_METHOD void LSRInstance::dump() const {
6320 print(errs()); errs() << '\n';
6321}
6322#endif
6323
6324namespace {
6325
/// Legacy pass-manager wrapper that runs LSR on each loop.
6326class LoopStrengthReduce : public LoopPass {
6327public:
6328 static char ID; // Pass ID, replacement for typeid
6329
6330 LoopStrengthReduce();
6331
6332private:
  // LoopPass hooks: run LSR on one loop / declare analysis dependencies.
6333 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6334 void getAnalysisUsage(AnalysisUsage &AU) const override;
6335};
6336
6337} // end anonymous namespace
6338
6339LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
  // NOTE(review): the constructor body line (presumably the legacy pass
  // initialization call) is missing from this rendering — confirm against the
  // original source.
6341}
6342
/// Declare the analyses LSR requires and those it preserves for the legacy
/// pass manager.
6343void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6344 // We split critical edges, so we change the CFG. However, we do update
6345 // many analyses if they are around.
  // NOTE(review): one or more AU lines appear to be missing from this
  // rendering (between the entries below) — confirm against the original
  // source.
6347
6348 AU.addRequired<LoopInfoWrapperPass>();
6349 AU.addPreserved<LoopInfoWrapperPass>();
6351 AU.addRequired<DominatorTreeWrapperPass>();
6352 AU.addPreserved<DominatorTreeWrapperPass>();
6353 AU.addRequired<ScalarEvolutionWrapperPass>();
6354 AU.addPreserved<ScalarEvolutionWrapperPass>();
6355 AU.addRequired<AssumptionCacheTracker>();
6356 AU.addRequired<TargetLibraryInfoWrapperPass>();
6357 // Requiring LoopSimplify a second time here prevents IVUsers from running
6358 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6360 AU.addRequired<IVUsersWrapperPass>();
6361 AU.addPreserved<IVUsersWrapperPass>();
6362 AU.addRequired<TargetTransformInfoWrapperPass>();
6363 AU.addPreserved<MemorySSAWrapperPass>();
6364}
6365
6366namespace {
6367
6368/// Enables more convenient iteration over a DWARF expression vector.
  // NOTE(review): the function's return-type/declaration line is missing from
  // this rendering (the body returns an iterator-range of
  // DIExpression::expr_op_iterator) — confirm against the original source.
6370ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6371 llvm::DIExpression::expr_op_iterator Begin =
6372 llvm::DIExpression::expr_op_iterator(Expr.begin());
6373 llvm::DIExpression::expr_op_iterator End =
6374 llvm::DIExpression::expr_op_iterator(Expr.end());
6375 return {Begin, End};
6376}
6377
/// Incrementally translates a SCEV expression into an equivalent DWARF
/// expression (a vector of DWARF ops plus the location operands they
/// reference), used to salvage dbg.value records whose operands LSR has
/// rewritten or deleted.
6378struct SCEVDbgValueBuilder {
6379 SCEVDbgValueBuilder() = default;
6380 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6381
  // Copy another builder's expression and location operands into this one.
6382 void clone(const SCEVDbgValueBuilder &Base) {
6383 LocationOps = Base.LocationOps;
6384 Expr = Base.Expr;
6385 }
6386
6387 void clear() {
6388 LocationOps.clear();
6389 Expr.clear();
6390 }
6391
6392 /// The DIExpression as we translate the SCEV.
  // NOTE(review): the declaration of the `Expr` member (used throughout this
  // struct) is missing from this rendering — confirm against the original
  // source.
6394 /// The location ops of the DIExpression.
6395 SmallVector<Value *, 2> LocationOps;
6396
6397 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6398 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6399
6400 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6401 /// in the set of values referenced by the expression.
6402 void pushLocation(llvm::Value *V) {
  // NOTE(review): the line pushing DW_OP_LLVM_arg itself appears to be
  // missing from this rendering — confirm against the original source.
  // Reuse an existing location-op index when V was already referenced.
6404 auto *It = llvm::find(LocationOps, V);
6405 unsigned ArgIndex = 0;
6406 if (It != LocationOps.end()) {
6407 ArgIndex = std::distance(LocationOps.begin(), It);
6408 } else {
6409 ArgIndex = LocationOps.size();
6410 LocationOps.push_back(V);
6411 }
6412 Expr.push_back(ArgIndex);
6413 }
6414
6415 void pushValue(const SCEVUnknown *U) {
6416 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6417 pushLocation(V);
6418 }
6419
  // Push a signed constant; fails for values wider than 64 bits.
6420 bool pushConst(const SCEVConstant *C) {
6421 if (C->getAPInt().getSignificantBits() > 64)
6422 return false;
6423 Expr.push_back(llvm::dwarf::DW_OP_consts);
6424 Expr.push_back(C->getAPInt().getSExtValue());
6425 return true;
6426 }
6427
6428 // Iterating the expression as DWARF ops is convenient when updating
6429 // DWARF_OP_LLVM_args.
  // NOTE(review): the declaration line of `expr_ops()` (returning the
  // iterator range built by ToDwarfOpIter) is missing from this rendering —
  // confirm against the original source.
6431 return ToDwarfOpIter(Expr);
6432 }
6433
6434 /// Several SCEV types are sequences of the same arithmetic operator applied
6435 /// to constants and values that may be extended or truncated.
6436 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6437 uint64_t DwarfOp) {
6438 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6439 "Expected arithmetic SCEV type");
6440 bool Success = true;
6441 unsigned EmitOperator = 0;
  // Emit operands in postfix order: operand, operand, op, operand, op, ...
6442 for (const auto &Op : CommExpr->operands()) {
6443 Success &= pushSCEV(Op);
6444
6445 if (EmitOperator >= 1)
6446 pushOperator(DwarfOp);
6447 ++EmitOperator;
6448 }
6449 return Success;
6450 }
6451
6452 // TODO: Identify and omit noop casts.
6453 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6454 const llvm::SCEV *Inner = C->getOperand(0);
6455 const llvm::Type *Type = C->getType();
6456 uint64_t ToWidth = Type->getIntegerBitWidth();
6457 bool Success = pushSCEV(Inner);
6458 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6459 IsSigned ? llvm::dwarf::DW_ATE_signed
6460 : llvm::dwarf::DW_ATE_unsigned};
6461 for (const auto &Op : CastOps)
6462 pushOperator(Op);
6463 return Success;
6464 }
6465
6466 // TODO: MinMax - although these haven't been encountered in the test suite.
  // Recursively translate a SCEV into DWARF ops; returns false for any SCEV
  // kind that cannot be represented (e.g. nested add-recurrences).
6467 bool pushSCEV(const llvm::SCEV *S) {
6468 bool Success = true;
6469 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6470 Success &= pushConst(StartInt);
6471
6472 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6473 if (!U->getValue())
6474 return false;
6475 pushLocation(U->getValue());
6476
6477 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6478 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6479
6480 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6481 Success &= pushSCEV(UDiv->getLHS());
6482 Success &= pushSCEV(UDiv->getRHS());
6483 pushOperator(llvm::dwarf::DW_OP_div);
6484
6485 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6486 // Assert if a new and unknown SCEVCastEXpr type is encountered.
  // NOTE(review): the assert's condition lines are missing from this
  // rendering — confirm against the original source.
6489 "Unexpected cast type in SCEV.");
6490 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6491
6492 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6493 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6494
6495 } else if (isa<SCEVAddRecExpr>(S)) {
6496 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6497 // unsupported.
6498 return false;
6499
6500 } else {
6501 return false;
6502 }
6503 return Success;
6504 }
6505
6506 /// Return true if the combination of arithmetic operator and underlying
6507 /// SCEV constant value is an identity function.
6508 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6509 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6510 if (C->getAPInt().getSignificantBits() > 64)
6511 return false;
6512 int64_t I = C->getAPInt().getSExtValue();
6513 switch (Op) {
6514 case llvm::dwarf::DW_OP_plus:
6515 case llvm::dwarf::DW_OP_minus:
6516 return I == 0;
6517 case llvm::dwarf::DW_OP_mul:
6518 case llvm::dwarf::DW_OP_div:
6519 return I == 1;
6520 }
6521 }
6522 return false;
6523 }
6524
6525 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6526 /// builder's expression stack. The stack should already contain an
6527 /// expression for the iteration count, so that it can be multiplied by
6528 /// the stride and added to the start.
6529 /// Components of the expression are omitted if they are an identity function.
6530 /// Chain (non-affine) SCEVs are not supported.
6531 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6532 assert(SAR.isAffine() && "Expected affine SCEV");
6533 const SCEV *Start = SAR.getStart();
6534 const SCEV *Stride = SAR.getStepRecurrence(SE);
6535
6536 // Skip pushing arithmetic noops.
6537 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6538 if (!pushSCEV(Stride))
6539 return false;
6540 pushOperator(llvm::dwarf::DW_OP_mul);
6541 }
6542 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6543 if (!pushSCEV(Start))
6544 return false;
6545 pushOperator(llvm::dwarf::DW_OP_plus);
6546 }
6547 return true;
6548 }
6549
6550 /// Create an expression that is an offset from a value (usually the IV).
6551 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6552 pushLocation(OffsetValue);
  // NOTE(review): the line(s) emitting the offset DWARF ops appear to be
  // missing from this rendering — confirm against the original source.
6554 LLVM_DEBUG(
6555 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6556 << std::to_string(Offset) << "\n");
6557 }
6558
6559 /// Combine a translation of the SCEV and the IV to create an expression that
6560 /// recovers a location's value.
6561 /// returns true if an expression was created.
6562 bool createIterCountExpr(const SCEV *S,
6563 const SCEVDbgValueBuilder &IterationCount,
6564 ScalarEvolution &SE) {
6565 // SCEVs for SSA values are most frquently of the form
6566 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6567 // This is because %a is a PHI node that is not the IV. However, these
6568 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6569 // so its not expected this point will be reached.
6570 if (!isa<SCEVAddRecExpr>(S))
6571 return false;
6572
6573 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6574 << '\n');
6575
6576 const auto *Rec = cast<SCEVAddRecExpr>(S);
6577 if (!Rec->isAffine())
6578 return false;
6579
  // NOTE(review): the guard condition preceding this early return is missing
  // from this rendering — confirm against the original source.
6581 return false;
6582
6583 // Initialise a new builder with the iteration count expression. In
6584 // combination with the value's SCEV this enables recovery.
6585 clone(IterationCount);
6586 if (!SCEVToValueExpr(*Rec, SE))
6587 return false;
6588
6589 return true;
6590 }
6591
6592 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6593 /// builder's expression stack. The stack should already contain an
6594 /// expression for the iteration count, so that it can be multiplied by
6595 /// the stride and added to the start.
6596 /// Components of the expression are omitted if they are an identity function.
6597 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6598 ScalarEvolution &SE) {
6599 assert(SAR.isAffine() && "Expected affine SCEV");
6600 const SCEV *Start = SAR.getStart();
6601 const SCEV *Stride = SAR.getStepRecurrence(SE);
6602
6603 // Skip pushing arithmetic noops.
6604 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6605 if (!pushSCEV(Start))
6606 return false;
6607 pushOperator(llvm::dwarf::DW_OP_minus);
6608 }
6609 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6610 if (!pushSCEV(Stride))
6611 return false;
6612 pushOperator(llvm::dwarf::DW_OP_div);
6613 }
6614 return true;
6615 }
6616
6617 // Append the current expression and locations to a location list and an
6618 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6619 // the locations already present in the destination list.
6620 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6621 SmallVectorImpl<Value *> &DestLocations) {
6622 assert(!DestLocations.empty() &&
6623 "Expected the locations vector to contain the IV");
6624 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6625 // modified to account for the locations already in the destination vector.
6626 // All builders contain the IV as the first location op.
6627 assert(!LocationOps.empty() &&
6628 "Expected the location ops to contain the IV.");
6629 // DestIndexMap[n] contains the index in DestLocations for the nth
6630 // location in this SCEVDbgValueBuilder.
6631 SmallVector<uint64_t, 2> DestIndexMap;
6632 for (const auto &Op : LocationOps) {
6633 auto It = find(DestLocations, Op);
6634 if (It != DestLocations.end()) {
6635 // Location already exists in DestLocations, reuse existing ArgIndex.
6636 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6637 continue;
6638 }
6639 // Location is not in DestLocations, add it.
6640 DestIndexMap.push_back(DestLocations.size());
6641 DestLocations.push_back(Op);
6642 }
6643
6644 for (const auto &Op : expr_ops()) {
6645 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6646 Op.appendToVector(DestExpr);
6647 continue;
6648 }
6649
  // NOTE(review): the line pushing DW_OP_LLVM_arg onto DestExpr appears to
  // be missing from this rendering — confirm against the original source.
6651 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6652 // DestIndexMap[n] contains its new index in DestLocations.
6653 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6654 DestExpr.push_back(NewIndex);
6655 }
6656 }
6657};
6658
6659/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6660/// and DIExpression.
6661struct DVIRecoveryRec {
6662 DVIRecoveryRec(DbgVariableRecord *DVR)
6663 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6664
  // The debug record being salvaged, its pre-LSR expression, and whether its
  // locations were originally wrapped in a DIArgList.
6665 DbgVariableRecord *DbgRef;
6666 DIExpression *Expr;
6667 bool HadLocationArgList;
6668 SmallVector<WeakVH, 2> LocationOps;
  // NOTE(review): the declarations of the cached `SCEVs` and per-location
  // `RecoveryExprs` members (used by clear() below and by SalvageDVI) are
  // missing from this rendering — confirm against the original source.
6671
  // Release all per-location recovery builders.
6672 void clear() {
6673 for (auto &RE : RecoveryExprs)
6674 RE.reset();
6675 RecoveryExprs.clear();
6676 }
6677
6678 ~DVIRecoveryRec() { clear(); }
6679};
6680} // namespace
6681
6682/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6683/// This helps in determining if a DIArglist is necessary or can be omitted from
6684/// the dbg.value.
  // NOTE(review): the function's signature line is missing from this
  // rendering (the body iterates an `Expr` op vector via ToDwarfOpIter) —
  // confirm against the original source.
6686 auto expr_ops = ToDwarfOpIter(Expr);
6687 unsigned Count = 0;
6688 for (auto Op : expr_ops)
6689 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6690 Count++;
6691 return Count;
6692}
6693
6694/// Overwrites DVI with the location and Ops as the DIExpression. This will
6695/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6696/// because a DIArglist is not created for the first argument of the dbg.value.
6697template <typename T>
6698static void updateDVIWithLocation(T &DbgVal, Value *Location,
6700 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6701 "contain any DW_OP_llvm_arg operands.");
6702 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6703 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6704 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6705}
6706
6707/// Overwrite DVI with locations placed into a DIArglist.
6708template <typename T>
6709static void updateDVIWithLocations(T &DbgVal,
6710 SmallVectorImpl<Value *> &Locations,
  // NOTE(review): the signature's final parameter line (the `Ops` expression
  // vector) and the `MetadataLocs` declaration below appear to be missing
  // from this rendering — confirm against the original source.
6712 assert(numLLVMArgOps(Ops) != 0 &&
6713 "Expected expression that references DIArglist locations using "
6714 "DW_OP_llvm_arg operands.");
  // Wrap every location value as metadata, then install them as a DIArgList.
6716 for (Value *V : Locations)
6717 MetadataLocs.push_back(ValueAsMetadata::get(V));
6718 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6719 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6720 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6721}
6722
6723/// Write the new expression and new location ops for the dbg.value. If possible
6724/// reduce the size of the dbg.value by omitting DIArglist. This
6725/// can be omitted if:
6726/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6727/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6728static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6729 SmallVectorImpl<Value *> &NewLocationOps,
  // NOTE(review): the signature's final parameter line (the `NewExpr`
  // expression vector) is missing from this rendering — confirm against the
  // original source.
6731 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6732 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6733 if (NumLLVMArgs == 0) {
6734 // Location assumed to be on the stack.
6735 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6736 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6737 // There is only a single DW_OP_llvm_arg at the start of the expression,
6738 // so it can be omitted along with DIArglist.
6739 assert(NewExpr[1] == 0 &&
6740 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
  // NOTE(review): the `ShortenedOps` declaration (dropping the leading
  // DW_OP_LLVM_arg pair from NewExpr) is missing from this rendering —
  // confirm against the original source.
6742 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6743 } else {
6744 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6745 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6746 }
6747
6748 // If the DIExpression was previously empty then add the stack terminator.
6749 // Non-empty expressions have only had elements inserted into them and so
6750 // the terminator should already be present e.g. stack_value or fragment.
6751 DIExpression *SalvageExpr = DbgVal->getExpression();
6752 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6753 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6754 DbgVal->setExpression(SalvageExpr);
6755 }
6756}
6757
6758/// Cached location ops may be erased during LSR, in which case a poison is
6759/// required when restoring from the cache. The type of that location is no
6760/// longer available, so just use int8. The poison will be replaced by one or
6761/// more locations later when a SCEVDbgValueBuilder selects alternative
6762/// locations to use for the salvage.
  // NOTE(review): the function's signature line is missing from this
  // rendering (the body takes a WeakVH `VH` and an LLVMContext `C`) —
  // confirm against the original source.
6764 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6765}
6766
6767/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6768static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6769 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6770 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6771 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6772 assert(DVIRec.Expr && "Expected an expression");
6773 DbgVal->setExpression(DVIRec.Expr);
6774
6775 // Even a single location-op may be inside a DIArgList and referenced with
6776 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6777 if (!DVIRec.HadLocationArgList) {
6778 assert(DVIRec.LocationOps.size() == 1 &&
6779 "Unexpected number of location ops.");
6780 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6781 // this case was not present before, so force the location back to a
6782 // single uncontained Value.
6783 Value *CachedValue =
6784 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6785 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6786 } else {
  // NOTE(review): the `MetadataLocs` declaration is missing from this
  // rendering — confirm against the original source.
6788 for (WeakVH VH : DVIRec.LocationOps) {
6789 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6790 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6791 }
6792 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6793 DbgVal->setRawLocation(
6794 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6795 }
6796 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6797}
6798
/// Attempt to salvage one dbg.value record after LSR, by rebuilding its
/// location list and DIExpression from the cached pre-LSR SCEVs and the
/// post-LSR induction variable. Returns true on success.
  // NOTE(review): the first line of this function's signature (return type
  // and leading parameters) is missing from this rendering — confirm against
  // the original source.
6800 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6801 const SCEV *SCEVInductionVar,
6802 SCEVDbgValueBuilder IterCountExpr) {
6803
  // Only records whose locations were killed by LSR need salvaging.
6804 if (!DVIRec.DbgRef->isKillLocation())
6805 return false;
6806
6807 // LSR may have caused several changes to the dbg.value in the failed salvage
6808 // attempt. So restore the DIExpression, the location ops and also the
6809 // location ops format, which is always DIArglist for multiple ops, but only
6810 // sometimes for a single op.
  // NOTE(review): the call restoring the pre-transform state appears to be
  // missing from this rendering — confirm against the original source.
6812
6813 // LocationOpIndexMap[i] will store the post-LSR location index of
6814 // the non-optimised out location at pre-LSR index i.
6815 SmallVector<int64_t, 2> LocationOpIndexMap;
6816 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6817 SmallVector<Value *, 2> NewLocationOps;
6818 NewLocationOps.push_back(LSRInductionVar);
6819
6820 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6821 WeakVH VH = DVIRec.LocationOps[i];
6822 // Place the locations not optimised out in the list first, avoiding
6823 // inserts later. The map is used to update the DIExpression's
6824 // DW_OP_LLVM_arg arguments as the expression is updated.
6825 if (VH && !isa<UndefValue>(VH)) {
6826 NewLocationOps.push_back(VH);
6827 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6828 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6829 << " now at index " << LocationOpIndexMap[i] << "\n");
6830 continue;
6831 }
6832
6833 // It's possible that a value referred to in the SCEV may have been
6834 // optimised out by LSR.
6835 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6836 SE.containsUndefs(DVIRec.SCEVs[i])) {
6837 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6838 << " refers to a location that is now undef or erased. "
6839 "Salvage abandoned.\n");
6840 return false;
6841 }
6842
6843 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6844 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6845
6846 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6847 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6848
6849 // Create an offset-based salvage expression if possible, as it requires
6850 // less DWARF ops than an iteration count-based expression.
6851 if (std::optional<APInt> Offset =
6852 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6853 if (Offset->getSignificantBits() <= 64)
6854 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6855 else
6856 return false;
6857 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6858 SE))
6859 return false;
6860 }
6861
6862 // Merge the DbgValueBuilder generated expressions and the original
6863 // DIExpression, place the result into an new vector.
  // NOTE(review): the `NewExpr` declaration is missing from this rendering —
  // confirm against the original source.
6865 if (DVIRec.Expr->getNumElements() == 0) {
6866 assert(DVIRec.RecoveryExprs.size() == 1 &&
6867 "Expected only a single recovery expression for an empty "
6868 "DIExpression.");
6869 assert(DVIRec.RecoveryExprs[0] &&
6870 "Expected a SCEVDbgSalvageBuilder for location 0");
6871 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6872 B->appendToVectors(NewExpr, NewLocationOps);
6873 }
6874 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6875 // Most Ops needn't be updated.
6876 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6877 Op.appendToVector(NewExpr);
6878 continue;
6879 }
6880
6881 uint64_t LocationArgIndex = Op.getArg(0);
6882 SCEVDbgValueBuilder *DbgBuilder =
6883 DVIRec.RecoveryExprs[LocationArgIndex].get();
6884 // The location doesn't have s SCEVDbgValueBuilder, so LSR did not
6885 // optimise it away. So just translate the argument to the updated
6886 // location index.
6887 if (!DbgBuilder) {
6888 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6889 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6890 "Expected a positive index for the location-op position.");
6891 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6892 continue;
6893 }
6894 // The location has a recovery expression.
6895 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6896 }
6897
6898 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
6899 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
6900 return true;
6901}
6902
6903/// Obtain an expression for the iteration count, then attempt to salvage the
6904/// dbg.value intrinsics.
  // NOTE(review): the function's signature line is missing from this
  // rendering — confirm against the original source.
6906 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6907 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6908 if (DVIToUpdate.empty())
6909 return;
6910
6911 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6912 assert(SCEVInductionVar &&
6913 "Anticipated a SCEV for the post-LSR induction variable");
6914
  // Salvaging is only attempted for a simple affine IV; anything else is
  // left untouched.
6915 if (const SCEVAddRecExpr *IVAddRec =
6916 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6917 if (!IVAddRec->isAffine())
6918 return;
6919
6920 // Prevent translation using excessive resources.
6921 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6922 return;
6923
6924 // The iteration count is required to recover location values.
6925 SCEVDbgValueBuilder IterCountExpr;
6926 IterCountExpr.pushLocation(LSRInductionVar);
6927 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6928 return;
6929
6930 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6931 << '\n');
6932
6933 for (auto &DVIRec : DVIToUpdate) {
6934 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6935 IterCountExpr);
6936 }
6937 }
6938}
6939
6940/// Identify and cache salvageable DVI locations and expressions along with the
6941/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6942/// caching and salvaging.
6944 Loop *L, ScalarEvolution &SE,
6945 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
6946 for (const auto &B : L->getBlocks()) {
6947 for (auto &I : *B) {
6948 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
6949 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
6950 continue;
6951
6952 // Ensure that if any location op is undef that the dbg.vlue is not
6953 // cached.
6954 if (DbgVal.isKillLocation())
6955 continue;
6956
6957 // Check that the location op SCEVs are suitable for translation to
6958 // DIExpression.
6959 const auto &HasTranslatableLocationOps =
6960 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
6961 for (const auto LocOp : DbgValToTranslate.location_ops()) {
6962 if (!LocOp)
6963 return false;
6964
6965 if (!SE.isSCEVable(LocOp->getType()))
6966 return false;
6967
6968 const SCEV *S = SE.getSCEV(LocOp);
6969 if (SE.containsUndefs(S))
6970 return false;
6971 }
6972 return true;
6973 };
6974
6975 if (!HasTranslatableLocationOps(DbgVal))
6976 continue;
6977
6978 std::unique_ptr<DVIRecoveryRec> NewRec =
6979 std::make_unique<DVIRecoveryRec>(&DbgVal);
6980 // Each location Op may need a SCEVDbgValueBuilder in order to recover
6981 // it. Pre-allocating a vector will enable quick lookups of the builder
6982 // later during the salvage.
6983 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
6984 for (const auto LocOp : DbgVal.location_ops()) {
6985 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
6986 NewRec->LocationOps.push_back(LocOp);
6987 NewRec->HadLocationArgList = DbgVal.hasArgList();
6988 }
6989 SalvageableDVISCEVs.push_back(std::move(NewRec));
6990 }
6991 }
6992 }
6993}
6994
6995/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
6996/// any PHI from the loop header is usable, but may have less chance of
6997/// surviving subsequent transforms.
6999 const LSRInstance &LSR) {
7000
7001 auto IsSuitableIV = [&](PHINode *P) {
7002 if (!SE.isSCEVable(P->getType()))
7003 return false;
7004 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7005 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7006 return false;
7007 };
7008
7009 // For now, just pick the first IV that was generated and inserted by
7010 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7011 // by subsequent transforms.
7012 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7013 if (!IV)
7014 continue;
7015
7016 // There should only be PHI node IVs.
7017 PHINode *P = cast<PHINode>(&*IV);
7018
7019 if (IsSuitableIV(P))
7020 return P;
7021 }
7022
7023 for (PHINode &P : L.getHeader()->phis()) {
7024 if (IsSuitableIV(&P))
7025 return &P;
7026 }
7027 return nullptr;
7028}
7029
7031 DominatorTree &DT, LoopInfo &LI,
7032 const TargetTransformInfo &TTI,
7034 MemorySSA *MSSA) {
7035
7036 // Debug preservation - before we start removing anything identify which DVI
7037 // meet the salvageable criteria and store their DIExpression and SCEVs.
7038 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7039 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7040
7041 bool Changed = false;
7042 std::unique_ptr<MemorySSAUpdater> MSSAU;
7043 if (MSSA)
7044 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7045
7046 // Run the main LSR transformation.
7047 const LSRInstance &Reducer =
7048 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7049 Changed |= Reducer.getChanged();
7050
7051 // Remove any extra phis created by processing inner loops.
7052 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7053 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7055 const DataLayout &DL = L->getHeader()->getDataLayout();
7056 SCEVExpander Rewriter(SE, DL, "lsr", false);
7057#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7058 Rewriter.setDebugType(DEBUG_TYPE);
7059#endif
7060 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7061 Rewriter.clear();
7062 if (numFolded) {
7063 Changed = true;
7065 MSSAU.get());
7066 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7067 }
7068 }
7069 // LSR may at times remove all uses of an induction variable from a loop.
7070 // The only remaining use is the PHI in the exit block.
7071 // When this is the case, if the exit value of the IV can be calculated using
7072 // SCEV, we can replace the exit block PHI with the final value of the IV and
7073 // skip the updates in each loop iteration.
7074 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7076 const DataLayout &DL = L->getHeader()->getDataLayout();
7077 SCEVExpander Rewriter(SE, DL, "lsr", true);
7078 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7079 UnusedIndVarInLoop, DeadInsts);
7080 Rewriter.clear();
7081 if (Rewrites) {
7082 Changed = true;
7084 MSSAU.get());
7085 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7086 }
7087 }
7088
7089 if (SalvageableDVIRecords.empty())
7090 return Changed;
7091
7092 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7093 // expressions composed using the derived iteration count.
7094 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7095 for (const auto &L : LI) {
7096 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7097 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7098 else {
7099 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7100 "could not be identified.\n");
7101 }
7102 }
7103
7104 for (auto &Rec : SalvageableDVIRecords)
7105 Rec->clear();
7106 SalvageableDVIRecords.clear();
7107 return Changed;
7108}
7109
7110bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7111 if (skipLoop(L))
7112 return false;
7113
7114 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7115 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7116 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7117 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7118 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7119 *L->getHeader()->getParent());
7120 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7121 *L->getHeader()->getParent());
7122 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7123 *L->getHeader()->getParent());
7124 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7125 MemorySSA *MSSA = nullptr;
7126 if (MSSAAnalysis)
7127 MSSA = &MSSAAnalysis->getMSSA();
7128 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7129}
7130
7133 LPMUpdater &) {
7134 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7135 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7136 return PreservedAnalyses::all();
7137
7138 auto PA = getLoopPassPreservedAnalyses();
7139 if (AR.MSSA)
7140 PA.preserve<MemorySSAAnalysis>();
7141 return PA;
7142}
7143
7144char LoopStrengthReduce::ID = 0;
7145
7146INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7147 "Loop Strength Reduction", false, false)
7153INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7154INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7155 "Loop Strength Reduction", false, false)
7156
7157Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1540
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:329
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1644
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1531
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1736
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1562
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:284
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:528
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
bool isUnconditional() const
Value * getCondition() const
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:169
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:248
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:233
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:322
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:142
iterator end()
Definition IVUsers.h:144
iterator begin()
Definition IVUsers.h:143
bool empty() const
Definition IVUsers.h:147
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:596
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:936
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number x.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< const SCEV * > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
LLVM_ABI ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:102
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:111
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:105
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:150
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:42
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:236
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
Use * op_iterator
Definition User.h:279
op_range operands()
Definition User.h:292
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:21
op_iterator op_begin()
Definition User.h:284
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
op_iterator op_end()
Definition User.h:286
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:503
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099
iterator_range< use_iterator > uses()
Definition Value.h:380
A Value handle that may be null.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:180
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
class_match< const SCEVVScale > m_SCEVVScale()
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
class_match< const SCEVConstant > m_SCEVConstant()
SCEVAffineAddRec_match< Op0_t, Op1_t, class_match< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
bind_ty< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
class_match< const Loop > m_Loop()
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
class_match< const SCEV > m_SCEV()
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:85
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1724
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2113
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
LLVM_ABI char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:95
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1622
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:1954
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:548
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1897
@ UnusedIndVarInLoop
Definition LoopUtils.h:520
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.