LLVM 23.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
134/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
135/// bail out. This threshold is far beyond the number of users that LSR can
136/// conceivably solve, so it should not affect generated code, but catches the
137/// worst cases before LSR burns too much compile time and stack space.
138static const unsigned MaxIVUsers = 200;
139
140/// Limit the size of expression that SCEV-based salvaging will attempt to
141/// translate into a DIExpression.
142/// Choose a maximum size such that debuginfo is not excessively increased and
143/// the salvaging is not too expensive for the compiler.
144static const unsigned MaxSCEVSalvageExpressionSize = 64;
145
146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to solutions cost comparison.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow complex lsr solution
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
244 void print(raw_ostream &OS) const;
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
389 RegUsesTy RegUsesMap;
391
392public:
393 void countRegister(const SCEV *Reg, size_t LUIdx);
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
401 void clear();
402
405
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing
474/// satisfying a use. It may include broken-out immediates and scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty. The
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing recurrent expr related with currect loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which added near the use. This requires a
509 /// temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch.
543static void DoInitialMatch(const SCEV *S, Loop *L,
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
601 DoInitialMatch(S, L, Good, Bad, SE);
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
614 canonicalize(*L);
615}
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
633 if (Scale != 1)
634 return true;
635
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
643 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
644 // loop, we want to swap the reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
656void Formula::canonicalize(const Loop &L) {
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
736void Formula::deleteBaseReg(const SCEV *&S) {
737 if (&S != &BaseRegs.back())
738 std::swap(S, BaseRegs.back());
739 BaseRegs.pop_back();
740}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
761void Formula::print(raw_ostream &OS) const {
762 ListSeparator Plus(" + ");
763 if (BaseGV) {
764 OS << Plus;
765 BaseGV->printAsOperand(OS, /*PrintType=*/false);
766 }
767 if (BaseOffset.isNonZero())
768 OS << Plus << BaseOffset;
769
770 for (const SCEV *BaseReg : BaseRegs)
771 OS << Plus << "reg(" << *BaseReg << ')';
772
773 if (HasBaseReg && BaseRegs.empty())
774 OS << Plus << "**error: HasBaseReg**";
775 else if (!HasBaseReg && !BaseRegs.empty())
776 OS << Plus << "**error: !HasBaseReg**";
777
778 if (Scale != 0) {
779 OS << Plus << Scale << "*reg(";
780 if (ScaledReg)
781 OS << *ScaledReg;
782 else
783 OS << "<unknown>";
784 OS << ')';
785 }
786 if (UnfoldedOffset.isNonZero())
787 OS << Plus << "imm(" << UnfoldedOffset << ')';
788}
789
790LLVM_DUMP_METHOD void Formula::dump() const {
791 print(errs()); errs() << '\n';
792}
793#endif
794
795/// Return true if the given addrec can be sign-extended without changing its
796/// value.
798 Type *WideTy =
800 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
801}
802
803/// Return true if the given add can be sign-extended without changing its
804/// value.
805static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
808 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
809}
810
811/// Return true if the given mul can be sign-extended without changing its
812/// value.
813static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
814 Type *WideTy =
816 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
817 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
818}
819
820/// Return an expression for LHS /s RHS, if it can be determined and if the
821/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
822/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
823/// the multiplication may overflow, which is useful when the result will be
824/// used in a context where the most significant bits are ignored.
825static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
826 ScalarEvolution &SE,
827 bool IgnoreSignificantBits = false) {
828 // Handle the trivial case, which works for any SCEV type.
829 if (LHS == RHS)
830 return SE.getConstant(LHS->getType(), 1);
831
832 // Handle a few RHS special cases.
834 if (RC) {
835 const APInt &RA = RC->getAPInt();
836 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
837 // some folding.
838 if (RA.isAllOnes()) {
839 if (LHS->getType()->isPointerTy())
840 return nullptr;
841 return SE.getMulExpr(LHS, RC);
842 }
843 // Handle x /s 1 as x.
844 if (RA == 1)
845 return LHS;
846 }
847
848 // Check for a division of a constant by a constant.
850 if (!RC)
851 return nullptr;
852 const APInt &LA = C->getAPInt();
853 const APInt &RA = RC->getAPInt();
854 if (LA.srem(RA) != 0)
855 return nullptr;
856 return SE.getConstant(LA.sdiv(RA));
857 }
858
859 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
861 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
862 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
863 IgnoreSignificantBits);
864 if (!Step) return nullptr;
865 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
866 IgnoreSignificantBits);
867 if (!Start) return nullptr;
868 // FlagNW is independent of the start value, step direction, and is
869 // preserved with smaller magnitude steps.
870 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
871 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
872 }
873 return nullptr;
874 }
875
876 // Distribute the sdiv over add operands, if the add doesn't overflow.
878 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
880 for (const SCEV *S : Add->operands()) {
881 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
882 if (!Op) return nullptr;
883 Ops.push_back(Op);
884 }
885 return SE.getAddExpr(Ops);
886 }
887 return nullptr;
888 }
889
890 // Check for a multiply operand that we can pull RHS out of.
892 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
893 // Handle special case C1*X*Y /s C2*X*Y.
894 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
895 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
896 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
897 const SCEVConstant *RC =
898 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
899 if (LC && RC) {
901 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
902 if (LOps == ROps)
903 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
904 }
905 }
906 }
907
909 bool Found = false;
910 for (const SCEV *S : Mul->operands()) {
911 if (!Found)
912 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
913 IgnoreSignificantBits)) {
914 S = Q;
915 Found = true;
916 }
917 Ops.push_back(S);
918 }
919 return Found ? SE.getMulExpr(Ops) : nullptr;
920 }
921 return nullptr;
922 }
923
924 // Otherwise we don't know.
925 return nullptr;
926}
927
928/// Extracts an immediate operand from \p Ops and replaces the operand with
929/// zero. If \p PreferScalable is true and \p Ops contains both a scalable and
930/// non-scalable offsets, the scalable offset will be extracted.
932 ScalarEvolution &SE,
933 bool PreferScalable) {
934 const APInt *C;
935 SCEVUse *Op = nullptr;
936 Immediate Result = Immediate::getZero();
937
938 // Ops are sorted by their SCEVType (the order of SCEVTypes enum). So, for an
939 // AddExpr the possible order of operands is:
940 // Constant < VScale < Truncate < ZeroExtend < SignExtend < MulExpr < ...
941
942 // This means fixed-size immediates will always appear on the LHS:
943 SCEVUse &S = Ops.front();
944 if (match(S, m_scev_APInt(C)) && !C->isZero() &&
945 C->getSignificantBits() <= 64) {
946 Op = &S;
947 Result = Immediate::getFixed(C->getSExtValue());
948 }
949
950 // But scalable immediates, which are MulExpr(Vscale, Constant), can appear
951 // later in the operand list:
952 if (EnableVScaleImmediates && (Result.isZero() || PreferScalable)) {
953 for (SCEVUse &S : Ops) {
954 // We know anything past scMulExpr will not be a vscale immediate.
955 if (S->getSCEVType() > scMulExpr)
956 break;
958 Op = &S;
959 Result = Immediate::getScalable(C->getSExtValue());
960 break;
961 }
962 }
963 }
964
965 if (Result.isNonZero()) {
966 SCEVUse &S = *Op;
967 S = SE.getConstant(S->getType(), 0);
968 }
969
970 return Result;
971}
972
973/// If S involves the addition of a constant integer value, return that integer
974/// value, and mutate S to point to a new SCEV with that value excluded.
975static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE,
976 bool PreferScalable = false) {
977 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
978 SmallVector<SCEVUse, 8> NewOps(Add->operands());
979 Immediate Result = ExtractImmediateOperand(NewOps, SE, PreferScalable);
980 if (Result.isZero())
981 Result = ExtractImmediate(NewOps.front(), SE, PreferScalable);
982 if (Result.isNonZero())
983 S = SE.getAddExpr(NewOps);
984 return Result;
985 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
986 SmallVector<SCEVUse, 8> NewOps(AR->operands());
987 Immediate Result = ExtractImmediate(NewOps.front(), SE, PreferScalable);
988 if (Result.isNonZero())
989 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
990 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
992 return Result;
993 }
994 return ExtractImmediateOperand({S}, SE, PreferScalable);
995}
996
997/// If S involves the addition of a GlobalValue address, return that symbol, and
998/// mutate S to point to a new SCEV with that value excluded.
1000 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
1001 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
1002 S = SE.getConstant(GV->getType(), 0);
1003 return GV;
1004 }
1005 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1006 SmallVector<SCEVUse, 8> NewOps(Add->operands());
1007 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
1008 if (Result)
1009 S = SE.getAddExpr(NewOps);
1010 return Result;
1011 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1012 SmallVector<SCEVUse, 8> NewOps(AR->operands());
1013 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
1014 if (Result)
1015 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
1016 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
1018 return Result;
1019 }
1020 return nullptr;
1021}
1022
1023/// Returns true if the specified instruction is using the specified value as an
1024/// address.
1026 Instruction *Inst, Value *OperandVal) {
1027 bool isAddress = isa<LoadInst>(Inst);
1028 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1029 if (SI->getPointerOperand() == OperandVal)
1030 isAddress = true;
1031 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1032 // Addressing modes can also be folded into prefetches and a variety
1033 // of intrinsics.
1034 switch (II->getIntrinsicID()) {
1035 case Intrinsic::memset:
1036 case Intrinsic::prefetch:
1037 case Intrinsic::masked_load:
1038 if (II->getArgOperand(0) == OperandVal)
1039 isAddress = true;
1040 break;
1041 case Intrinsic::masked_store:
1042 if (II->getArgOperand(1) == OperandVal)
1043 isAddress = true;
1044 break;
1045 case Intrinsic::memmove:
1046 case Intrinsic::memcpy:
1047 if (II->getArgOperand(0) == OperandVal ||
1048 II->getArgOperand(1) == OperandVal)
1049 isAddress = true;
1050 break;
1051 default: {
1052 MemIntrinsicInfo IntrInfo;
1053 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1054 if (IntrInfo.PtrVal == OperandVal)
1055 isAddress = true;
1056 }
1057 }
1058 }
1059 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1060 if (RMW->getPointerOperand() == OperandVal)
1061 isAddress = true;
1062 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1063 if (CmpX->getPointerOperand() == OperandVal)
1064 isAddress = true;
1065 }
1066 return isAddress;
1067}
1068
1069/// Return the type of the memory being accessed.
1070static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1071 Instruction *Inst, Value *OperandVal) {
1072 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1073
1074 // First get the type of memory being accessed.
1075 if (Type *Ty = Inst->getAccessType())
1076 AccessTy.MemTy = Ty;
1077
1078 // Then get the pointer address space.
1079 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1080 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1081 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1082 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1083 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1084 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1085 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1086 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1087 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1088 switch (II->getIntrinsicID()) {
1089 case Intrinsic::prefetch:
1090 case Intrinsic::memset:
1091 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1092 AccessTy.MemTy = OperandVal->getType();
1093 break;
1094 case Intrinsic::memmove:
1095 case Intrinsic::memcpy:
1096 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1097 AccessTy.MemTy = OperandVal->getType();
1098 break;
1099 case Intrinsic::masked_load:
1100 AccessTy.AddrSpace =
1101 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1102 break;
1103 case Intrinsic::masked_store:
1104 AccessTy.AddrSpace =
1105 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1106 break;
1107 default: {
1108 MemIntrinsicInfo IntrInfo;
1109 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1110 AccessTy.AddrSpace
1111 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1112 }
1113
1114 break;
1115 }
1116 }
1117 }
1118
1119 return AccessTy;
1120}
1121
1122/// Return true if this AddRec is already a phi in its loop.
1123static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1124 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1125 if (SE.isSCEVable(PN.getType()) &&
1126 (SE.getEffectiveSCEVType(PN.getType()) ==
1127 SE.getEffectiveSCEVType(AR->getType())) &&
1128 SE.getSCEV(&PN) == AR)
1129 return true;
1130 }
1131 return false;
1132}
1133
1134/// Check if expanding this expression is likely to incur significant cost. This
1135/// is tricky because SCEV doesn't track which expressions are actually computed
1136/// by the current IR.
1137///
1138/// We currently allow expansion of IV increments that involve adds,
1139/// multiplication by constants, and AddRecs from existing phis.
1140///
1141/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1142/// obvious multiple of the UDivExpr.
1143static bool isHighCostExpansion(const SCEV *S,
1145 ScalarEvolution &SE) {
1146 // Zero/One operand expressions
1147 switch (S->getSCEVType()) {
1148 case scUnknown:
1149 case scConstant:
1150 case scVScale:
1151 return false;
1152 case scTruncate:
1153 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1154 Processed, SE);
1155 case scZeroExtend:
1156 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1157 Processed, SE);
1158 case scSignExtend:
1159 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1160 Processed, SE);
1161 default:
1162 break;
1163 }
1164
1165 if (!Processed.insert(S).second)
1166 return false;
1167
1168 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1169 for (const SCEV *S : Add->operands()) {
1170 if (isHighCostExpansion(S, Processed, SE))
1171 return true;
1172 }
1173 return false;
1174 }
1175
1176 const SCEV *Op0, *Op1;
1177 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1178 // Multiplication by a constant is ok
1179 if (isa<SCEVConstant>(Op0))
1180 return isHighCostExpansion(Op1, Processed, SE);
1181
1182 // If we have the value of one operand, check if an existing
1183 // multiplication already generates this expression.
1184 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1185 Value *UVal = U->getValue();
1186 for (User *UR : UVal->users()) {
1187 // If U is a constant, it may be used by a ConstantExpr.
1189 if (UI && UI->getOpcode() == Instruction::Mul &&
1190 SE.isSCEVable(UI->getType())) {
1191 return SE.getSCEV(UI) == S;
1192 }
1193 }
1194 }
1195 }
1196
1197 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1198 if (isExistingPhi(AR, SE))
1199 return false;
1200 }
1201
1202 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1203 return true;
1204}
1205
1206namespace {
1207
1208class LSRUse;
1209
1210} // end anonymous namespace
1211
1212/// Check if the addressing mode defined by \p F is completely
1213/// folded in \p LU at isel time.
1214/// This includes address-mode folding and special icmp tricks.
1215/// This function returns true if \p LU can accommodate what \p F
1216/// defines and up to 1 base + 1 scaled + offset.
1217/// In other words, if \p F has several base registers, this function may
1218/// still return true. Therefore, users still need to account for
1219/// additional base registers and/or unfolded offsets to derive an
1220/// accurate cost model.
1221static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1222 const LSRUse &LU, const Formula &F);
1223
1224// Get the cost of the scaling factor used in F for LU.
1225static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1226 const LSRUse &LU, const Formula &F,
1227 const Loop &L);
1228
1229namespace {
1230
1231/// This class is used to measure and compare candidate formulae.
1232class Cost {
1233 const Loop *L = nullptr;
1234 ScalarEvolution *SE = nullptr;
1235 const TargetTransformInfo *TTI = nullptr;
1236 TargetTransformInfo::LSRCost C;
1238
1239public:
1240 Cost() = delete;
1241 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1243 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1244 C.Insns = 0;
1245 C.NumRegs = 0;
1246 C.AddRecCost = 0;
1247 C.NumIVMuls = 0;
1248 C.NumBaseAdds = 0;
1249 C.ImmCost = 0;
1250 C.SetupCost = 0;
1251 C.ScaleCost = 0;
1252 }
1253
1254 bool isLess(const Cost &Other) const;
1255
1256 void Lose();
1257
1258#ifndef NDEBUG
1259 // Once any of the metrics loses, they must all remain losers.
1260 bool isValid() {
1261 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1262 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1263 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1264 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1265 }
1266#endif
1267
1268 bool isLoser() {
1269 assert(isValid() && "invalid cost");
1270 return C.NumRegs == ~0u;
1271 }
1272
1273 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1274 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1275 bool HardwareLoopProfitable,
1276 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1277
1278 void print(raw_ostream &OS) const;
1279 void dump() const;
1280
1281private:
1282 void RateRegister(const Formula &F, const SCEV *Reg,
1283 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1284 bool HardwareLoopProfitable);
1285 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1286 SmallPtrSetImpl<const SCEV *> &Regs,
1287 const LSRUse &LU, bool HardwareLoopProfitable,
1288 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1289};
1290
1291/// An operand value in an instruction which is to be replaced with some
1292/// equivalent, possibly strength-reduced, replacement.
1293struct LSRFixup {
1294 /// The instruction which will be updated.
1295 Instruction *UserInst = nullptr;
1296
1297 /// The operand of the instruction which will be replaced. The operand may be
1298 /// used more than once; every instance will be replaced.
1299 Value *OperandValToReplace = nullptr;
1300
1301 /// If this user is to use the post-incremented value of an induction
1302 /// variable, this set is non-empty and holds the loops associated with the
1303 /// induction variable.
1304 PostIncLoopSet PostIncLoops;
1305
1306 /// A constant offset to be added to the LSRUse expression. This allows
1307 /// multiple fixups to share the same LSRUse with different offsets, for
1308 /// example in an unrolled loop.
1309 Immediate Offset = Immediate::getZero();
1310
1311 LSRFixup() = default;
1312
1313 bool isUseFullyOutsideLoop(const Loop *L) const;
1314
1315 void print(raw_ostream &OS) const;
1316 void dump() const;
1317};
1318
1319/// This class holds the state that LSR keeps for each use in IVUsers, as well
1320/// as uses invented by LSR itself. It includes information about what kinds of
1321/// things can be folded into the user, information about the user itself, and
1322/// information about how the use may be satisfied. TODO: Represent multiple
1323/// users of the same expression in common?
1324class LSRUse {
1325 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1326
1327public:
1328 /// An enum for a kind of use, indicating what types of scaled and immediate
1329 /// operands it might support.
1330 enum KindType {
1331 Basic, ///< A normal use, with no folding.
1332 Special, ///< A special case of basic, allowing -1 scales.
1333 Address, ///< An address use; folding according to TargetLowering
1334 ICmpZero ///< An equality icmp with both operands folded into one.
1335 // TODO: Add a generic icmp too?
1336 };
1337
1338 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1339
1340 KindType Kind;
1341 MemAccessTy AccessTy;
1342
1343 /// The list of operands which are to be replaced.
1345
1346 /// Keep track of the min and max offsets of the fixups.
1347 Immediate MinOffset = Immediate::getFixedMax();
1348 Immediate MaxOffset = Immediate::getFixedMin();
1349
1350 /// This records whether all of the fixups using this LSRUse are outside of
1351 /// the loop, in which case some special-case heuristics may be used.
1352 bool AllFixupsOutsideLoop = true;
1353
1354 /// This records whether all of the fixups using this LSRUse are unconditional
1355 /// within the loop, meaning they will be executed on every path to the loop
1356 /// latch. This includes fixups before early exits.
1357 bool AllFixupsUnconditional = true;
1358
1359 /// RigidFormula is set to true to guarantee that this use will be associated
1360 /// with a single formula--the one that initially matched. Some SCEV
1361 /// expressions cannot be expanded. This allows LSR to consider the registers
1362 /// used by those expressions without the need to expand them later after
1363 /// changing the formula.
1364 bool RigidFormula = false;
1365
1366 /// A list of ways to build a value that can satisfy this user. After the
1367 /// list is populated, one of these is selected heuristically and used to
1368 /// formulate a replacement for OperandValToReplace in UserInst.
1369 SmallVector<Formula, 12> Formulae;
1370
1371 /// The set of register candidates used by all formulae in this LSRUse.
1372 SmallPtrSet<const SCEV *, 4> Regs;
1373
1374 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1375
1376 LSRFixup &getNewFixup() {
1377 Fixups.push_back(LSRFixup());
1378 return Fixups.back();
1379 }
1380
1381 void pushFixup(LSRFixup &f) {
1382 Fixups.push_back(f);
1383 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1384 MaxOffset = f.Offset;
1385 if (Immediate::isKnownLT(f.Offset, MinOffset))
1386 MinOffset = f.Offset;
1387 }
1388
1389 bool HasFormulaWithSameRegs(const Formula &F) const;
1390 float getNotSelectedProbability(const SCEV *Reg) const;
1391 bool InsertFormula(const Formula &F, const Loop &L);
1392 void DeleteFormula(Formula &F);
1393 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1394
1395 void print(raw_ostream &OS) const;
1396 void dump() const;
1397};
1398
1399} // end anonymous namespace
1400
1401static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1402 LSRUse::KindType Kind, MemAccessTy AccessTy,
1403 GlobalValue *BaseGV, Immediate BaseOffset,
1404 bool HasBaseReg, int64_t Scale,
1405 Instruction *Fixup = nullptr);
1406
1407static unsigned getSetupCost(const SCEV *Reg, unsigned Depth,
1408 const TargetTransformInfo &TTI) {
1409 if (isa<SCEVUnknown>(Reg))
1410 return 1;
1411 if (const auto *C = dyn_cast<SCEVConstant>(Reg)) {
1412 if (TTI.getIntImmCost(C->getAPInt(), C->getType(),
1415 return 0;
1416 return 1;
1417 }
1418 if (Depth == 0)
1419 return 0;
1420 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1421 return getSetupCost(S->getStart(), Depth - 1, TTI);
1422 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1423 return getSetupCost(S->getOperand(), Depth - 1, TTI);
1424 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1425 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1426 [&](unsigned i, const SCEV *Reg) {
1427 return i + getSetupCost(Reg, Depth - 1, TTI);
1428 });
1429 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1430 return getSetupCost(S->getLHS(), Depth - 1, TTI) +
1431 getSetupCost(S->getRHS(), Depth - 1, TTI);
1432 return 0;
1433}
1434
1435/// Tally up interesting quantities from the given register.
1436void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1437 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1438 bool HardwareLoopProfitable) {
1439 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1440 // If this is an addrec for another loop, it should be an invariant
1441 // with respect to L since L is the innermost loop (at least
1442 // for now LSR only handles innermost loops).
1443 if (AR->getLoop() != L) {
1444 // If the AddRec exists, consider it's register free and leave it alone.
1445 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1446 return;
1447
1448 // It is bad to allow LSR for current loop to add induction variables
1449 // for its sibling loops.
1450 if (!AR->getLoop()->contains(L)) {
1451 Lose();
1452 return;
1453 }
1454
1455 // Otherwise, it will be an invariant with respect to Loop L.
1456 ++C.NumRegs;
1457 return;
1458 }
1459
1460 unsigned LoopCost = 1;
1461 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1462 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1463 const SCEV *Start;
1464 const APInt *Step;
1465 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1466 // If the step size matches the base offset, we could use pre-indexed
1467 // addressing.
1468 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1469 F.BaseOffset.isFixed() &&
1470 *Step == F.BaseOffset.getFixedValue();
1471 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1472 !isa<SCEVConstant>(Start) &&
1473 SE->isLoopInvariant(Start, L);
1474 // We can only pre or post index when the load/store is unconditional.
1475 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1476 LoopCost = 0;
1477 }
1478 }
1479
1480 // If the loop counts down to zero and we'll be using a hardware loop then
1481 // the addrec will be combined into the hardware loop instruction.
1482 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1483 HardwareLoopProfitable)
1484 LoopCost = 0;
1485 C.AddRecCost += LoopCost;
1486
1487 // Add the step value register, if it needs one.
1488 // TODO: The non-affine case isn't precisely modeled here.
1489 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1490 if (!Regs.count(AR->getOperand(1))) {
1491 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1492 if (isLoser())
1493 return;
1494 }
1495 }
1496 }
1497 ++C.NumRegs;
1498
1499 // Rough heuristic; favor registers which don't require extra setup
1500 // instructions in the preheader.
1501 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit, *TTI);
1502 // Ensure we don't, even with the recusion limit, produce invalid costs.
1503 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1504
1505 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1507}
1508
1509/// Record this register in the set. If we haven't seen it before, rate
1510/// it. Optional LoserRegs provides a way to declare any formula that refers to
1511/// one of those regs an instant loser.
1512void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1513 SmallPtrSetImpl<const SCEV *> &Regs,
1514 const LSRUse &LU, bool HardwareLoopProfitable,
1515 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1516 if (LoserRegs && LoserRegs->count(Reg)) {
1517 Lose();
1518 return;
1519 }
1520 if (Regs.insert(Reg).second) {
1521 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1522 if (LoserRegs && isLoser())
1523 LoserRegs->insert(Reg);
1524 }
1525}
1526
1527void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1528 const DenseSet<const SCEV *> &VisitedRegs,
1529 const LSRUse &LU, bool HardwareLoopProfitable,
1530 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1531 if (isLoser())
1532 return;
1533 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1534 // Tally up the registers.
1535 unsigned PrevAddRecCost = C.AddRecCost;
1536 unsigned PrevNumRegs = C.NumRegs;
1537 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1538 if (const SCEV *ScaledReg = F.ScaledReg) {
1539 if (VisitedRegs.count(ScaledReg)) {
1540 Lose();
1541 return;
1542 }
1543 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1544 LoserRegs);
1545 if (isLoser())
1546 return;
1547 }
1548 for (const SCEV *BaseReg : F.BaseRegs) {
1549 if (VisitedRegs.count(BaseReg)) {
1550 Lose();
1551 return;
1552 }
1553 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1554 LoserRegs);
1555 if (isLoser())
1556 return;
1557 }
1558
1559 // Determine how many (unfolded) adds we'll need inside the loop.
1560 size_t NumBaseParts = F.getNumRegs();
1561 if (NumBaseParts > 1)
1562 // Do not count the base and a possible second register if the target
1563 // allows to fold 2 registers.
1564 C.NumBaseAdds +=
1565 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1566 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1567
1568 // Accumulate non-free scaling amounts.
1569 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1570
1571 // Tally up the non-zero immediates.
1572 for (const LSRFixup &Fixup : LU.Fixups) {
1573 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1574 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1575 if (F.BaseGV)
1576 C.ImmCost += 64; // Handle symbolic values conservatively.
1577 // TODO: This should probably be the pointer size.
1578 else if (Offset.isNonZero())
1579 C.ImmCost +=
1580 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1581
1582 // Check with target if this offset with this instruction is
1583 // specifically not supported.
1584 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1585 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1586 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1587 C.NumBaseAdds++;
1588 } else {
1589 // Incompatible immediate type, increase cost to avoid using
1590 C.ImmCost += 2048;
1591 }
1592 }
1593
1594 // If we don't count instruction cost exit here.
1595 if (!InsnsCost) {
1596 assert(isValid() && "invalid cost");
1597 return;
1598 }
1599
1600 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1601 // additional instruction (at least fill).
1602 // TODO: Need distinguish register class?
1603 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1604 TTI->getRegisterClassForType(false, F.getType())) - 1;
1605 if (C.NumRegs > TTIRegNum) {
1606 // Cost already exceeded TTIRegNum, then only newly added register can add
1607 // new instructions.
1608 if (PrevNumRegs > TTIRegNum)
1609 C.Insns += (C.NumRegs - PrevNumRegs);
1610 else
1611 C.Insns += (C.NumRegs - TTIRegNum);
1612 }
1613
1614 // If ICmpZero formula ends with not 0, it could not be replaced by
1615 // just add or sub. We'll need to compare final result of AddRec.
1616 // That means we'll need an additional instruction. But if the target can
1617 // macro-fuse a compare with a branch, don't count this extra instruction.
1618 // For -10 + {0, +, 1}:
1619 // i = i + 1;
1620 // cmp i, 10
1621 //
1622 // For {-10, +, 1}:
1623 // i = i + 1;
1624 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1625 !TTI->canMacroFuseCmp())
1626 C.Insns++;
1627 // Each new AddRec adds 1 instruction to calculation.
1628 C.Insns += (C.AddRecCost - PrevAddRecCost);
1629
1630 // BaseAdds adds instructions for unfolded registers.
1631 if (LU.Kind != LSRUse::ICmpZero)
1632 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1633 assert(isValid() && "invalid cost");
1634}
1635
1636/// Set this cost to a losing value.
1637void Cost::Lose() {
1638 C.Insns = std::numeric_limits<unsigned>::max();
1639 C.NumRegs = std::numeric_limits<unsigned>::max();
1640 C.AddRecCost = std::numeric_limits<unsigned>::max();
1641 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1642 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1643 C.ImmCost = std::numeric_limits<unsigned>::max();
1644 C.SetupCost = std::numeric_limits<unsigned>::max();
1645 C.ScaleCost = std::numeric_limits<unsigned>::max();
1646}
1647
1648/// Choose the lower cost.
1649bool Cost::isLess(const Cost &Other) const {
1650 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1651 C.Insns != Other.C.Insns)
1652 return C.Insns < Other.C.Insns;
1653 return TTI->isLSRCostLess(C, Other.C);
1654}
1655
1656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1657void Cost::print(raw_ostream &OS) const {
1658 if (InsnsCost)
1659 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1660 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1661 if (C.AddRecCost != 0)
1662 OS << ", with addrec cost " << C.AddRecCost;
1663 if (C.NumIVMuls != 0)
1664 OS << ", plus " << C.NumIVMuls << " IV mul"
1665 << (C.NumIVMuls == 1 ? "" : "s");
1666 if (C.NumBaseAdds != 0)
1667 OS << ", plus " << C.NumBaseAdds << " base add"
1668 << (C.NumBaseAdds == 1 ? "" : "s");
1669 if (C.ScaleCost != 0)
1670 OS << ", plus " << C.ScaleCost << " scale cost";
1671 if (C.ImmCost != 0)
1672 OS << ", plus " << C.ImmCost << " imm cost";
1673 if (C.SetupCost != 0)
1674 OS << ", plus " << C.SetupCost << " setup cost";
1675}
1676
1677LLVM_DUMP_METHOD void Cost::dump() const {
1678 print(errs()); errs() << '\n';
1679}
1680#endif
1681
1682/// Test whether this fixup always uses its value outside of the given loop.
1683bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1684 // PHI nodes use their value in their incoming blocks.
1685 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1686 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1687 if (PN->getIncomingValue(i) == OperandValToReplace &&
1688 L->contains(PN->getIncomingBlock(i)))
1689 return false;
1690 return true;
1691 }
1692
1693 return !L->contains(UserInst);
1694}
1695
1696#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1697void LSRFixup::print(raw_ostream &OS) const {
1698 OS << "UserInst=";
1699 // Store is common and interesting enough to be worth special-casing.
1700 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1701 OS << "store ";
1702 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1703 } else if (UserInst->getType()->isVoidTy())
1704 OS << UserInst->getOpcodeName();
1705 else
1706 UserInst->printAsOperand(OS, /*PrintType=*/false);
1707
1708 OS << ", OperandValToReplace=";
1709 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1710
1711 for (const Loop *PIL : PostIncLoops) {
1712 OS << ", PostIncLoop=";
1713 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1714 }
1715
1716 if (Offset.isNonZero())
1717 OS << ", Offset=" << Offset;
1718}
1719
1720LLVM_DUMP_METHOD void LSRFixup::dump() const {
1721 print(errs()); errs() << '\n';
1722}
1723#endif
1724
1725/// Test whether this use as a formula which has the same registers as the given
1726/// formula.
1727bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1729 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1730 // Unstable sort by host order ok, because this is only used for uniquifying.
1731 llvm::sort(Key);
1732 return Uniquifier.count(Key);
1733}
1734
1735/// The function returns a probability of selecting formula without Reg.
1736float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1737 unsigned FNum = 0;
1738 for (const Formula &F : Formulae)
1739 if (F.referencesReg(Reg))
1740 FNum++;
1741 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1742}
1743
1744/// If the given formula has not yet been inserted, add it to the list, and
1745/// return true. Return false otherwise. The formula must be in canonical form.
1746bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1747 assert(F.isCanonical(L) && "Invalid canonical representation");
1748
1749 if (!Formulae.empty() && RigidFormula)
1750 return false;
1751
1753 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1754 // Unstable sort by host order ok, because this is only used for uniquifying.
1755 llvm::sort(Key);
1756
1757 if (!Uniquifier.insert(Key).second)
1758 return false;
1759
1760 // Using a register to hold the value of 0 is not profitable.
1761 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1762 "Zero allocated in a scaled register!");
1763#ifndef NDEBUG
1764 for (const SCEV *BaseReg : F.BaseRegs)
1765 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1766#endif
1767
1768 // Add the formula to the list.
1769 Formulae.push_back(F);
1770
1771 // Record registers now being used by this use.
1772 Regs.insert_range(F.BaseRegs);
1773 if (F.ScaledReg)
1774 Regs.insert(F.ScaledReg);
1775
1776 return true;
1777}
1778
1779/// Remove the given formula from this use's list.
1780void LSRUse::DeleteFormula(Formula &F) {
1781 if (&F != &Formulae.back())
1782 std::swap(F, Formulae.back());
1783 Formulae.pop_back();
1784}
1785
1786/// Recompute the Regs field, and update RegUses.
1787void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1788 // Now that we've filtered out some formulae, recompute the Regs set.
1789 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1790 Regs.clear();
1791 for (const Formula &F : Formulae) {
1792 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1793 Regs.insert_range(F.BaseRegs);
1794 }
1795
1796 // Update the RegTracker.
1797 for (const SCEV *S : OldRegs)
1798 if (!Regs.count(S))
1799 RegUses.dropRegister(S, LUIdx);
1800}
1801
1802#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1803void LSRUse::print(raw_ostream &OS) const {
1804 OS << "LSR Use: Kind=";
1805 switch (Kind) {
1806 case Basic: OS << "Basic"; break;
1807 case Special: OS << "Special"; break;
1808 case ICmpZero: OS << "ICmpZero"; break;
1809 case Address:
1810 OS << "Address of ";
1811 if (AccessTy.MemTy->isPointerTy())
1812 OS << "pointer"; // the full pointer type could be really verbose
1813 else {
1814 OS << *AccessTy.MemTy;
1815 }
1816
1817 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1818 }
1819
1820 OS << ", Offsets={";
1821 bool NeedComma = false;
1822 for (const LSRFixup &Fixup : Fixups) {
1823 if (NeedComma) OS << ',';
1824 OS << Fixup.Offset;
1825 NeedComma = true;
1826 }
1827 OS << '}';
1828
1829 if (AllFixupsOutsideLoop)
1830 OS << ", all-fixups-outside-loop";
1831
1832 if (AllFixupsUnconditional)
1833 OS << ", all-fixups-unconditional";
1834}
1835
1836LLVM_DUMP_METHOD void LSRUse::dump() const {
1837 print(errs()); errs() << '\n';
1838}
1839#endif
1840
// NOTE(review): the line that opens this definition (listing line 1841) is
// missing from this extract. Judging by the call sites further down, it
// presumably reads `static bool isAMCompletelyFolded(const TargetTransformInfo
// &TTI,` -- confirm against upstream before relying on it.
//
// Decides whether a use of the given Kind can absorb the components BaseGV,
// BaseOffset, an optional base register (HasBaseReg) and a scaled register
// (Scale). Each LSRUse kind applies its own rule in the switch below.
1842 LSRUse::KindType Kind, MemAccessTy AccessTy,
1843 GlobalValue *BaseGV, Immediate BaseOffset,
1844 bool HasBaseReg, int64_t Scale,
1845 Instruction *Fixup /* = nullptr */) {
1846 switch (Kind) {
1847 case LSRUse::Address: {
// The target hook takes the fixed and the scalable offset separately; the
// slot that does not match the immediate's kind is passed as zero.
1848 int64_t FixedOffset =
1849 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1850 int64_t ScalableOffset =
1851 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1852 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1853 HasBaseReg, Scale, AccessTy.AddrSpace,
1854 Fixup, ScalableOffset);
1855 }
1856 case LSRUse::ICmpZero:
1857 // There's not even a target hook for querying whether it would be legal to
1858 // fold a GV into an ICmp.
1859 if (BaseGV)
1860 return false;
1861
1862 // ICmp only has two operands; don't allow more than two non-trivial parts.
1863 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1864 return false;
1865
1866 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1867 // putting the scaled register in the other operand of the icmp.
1868 if (Scale != 0 && Scale != -1)
1869 return false;
1870
1871 // If we have low-level target information, ask the target if it can fold an
1872 // integer immediate on an icmp.
1873 if (BaseOffset.isNonZero()) {
1874 // We don't have an interface to query whether the target supports
1875 // icmpzero against scalable quantities yet.
1876 if (BaseOffset.isScalable())
1877 return false;
1878
1879 // We have one of:
1880 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1881 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1882 // BaseOffset (negated in the first form) is the ICmp immediate.
1883 if (Scale == 0)
1884 // The cast does the right thing with
1885 // std::numeric_limits<int64_t>::min().
1886 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1887 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1888 }
1889
1890 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1891 return true;
1892
1893 case LSRUse::Basic:
1894 // Only handle single-register values.
1895 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1896
1897 case LSRUse::Special:
1898 // Special case Basic to handle -1 scales.
1899 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1900 }
1901
// Every enumerator handled above returns, so control only gets here for a
// Kind value outside the enumeration.
1902 llvm_unreachable("Invalid LSRUse Kind!");
1903}
1904
// NOTE(review): the line that opens this definition (listing line 1905) is
// missing from this extract; presumably `static bool
// isAMCompletelyFolded(const TargetTransformInfo &TTI,` -- confirm against
// upstream.
//
// Range form of the check: the addressing mode must fold with BaseOffset
// combined with both MinOffset and MaxOffset. Returns false if the immediates
// mix fixed and scalable kinds or if either sum overflows int64_t.
1906 Immediate MinOffset, Immediate MaxOffset,
1907 LSRUse::KindType Kind, MemAccessTy AccessTy,
1908 GlobalValue *BaseGV, Immediate BaseOffset,
1909 bool HasBaseReg, int64_t Scale) {
// A non-zero BaseOffset can only be added to offsets of the same kind
// (fixed vs. scalable).
1910 if (BaseOffset.isNonZero() &&
1911 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1912 BaseOffset.isScalable() != MaxOffset.isScalable()))
1913 return false;
1914 // Check for overflow.
1915 int64_t Base = BaseOffset.getKnownMinValue();
1916 int64_t Min = MinOffset.getKnownMinValue();
1917 int64_t Max = MaxOffset.getKnownMinValue();
// The additions are done in uint64_t so wraparound is well defined. A sum
// that did not move in the direction given by the addend's sign has wrapped.
1918 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1919 return false;
1920 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1921 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1922 return false;
1923 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1924
// MinOffset and MaxOffset now hold the combined offsets; both extremes must
// be accepted by the per-kind check.
1925 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1926 HasBaseReg, Scale) &&
1927 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1928 HasBaseReg, Scale);
1929}
1930
// NOTE(review): the line that opens this definition (listing line 1931) is
// missing from this extract; presumably `static bool
// isAMCompletelyFolded(const TargetTransformInfo &TTI,` -- confirm against
// upstream.
//
// Formula form of the range check: forwards F's BaseGV, BaseOffset,
// HasBaseReg and Scale fields to the field-wise overload. L is only used by
// the canonical-form assertion.
1932 Immediate MinOffset, Immediate MaxOffset,
1933 LSRUse::KindType Kind, MemAccessTy AccessTy,
1934 const Formula &F, const Loop &L) {
1935 // For the purpose of isAMCompletelyFolded either having a canonical formula
1936 // or a scale not equal to zero is correct.
1937 // Problems may arise from non canonical formulae having a scale == 0.
1938 // Strictly speaking it would be best to just rely on canonical formulae.
1939 // However, when we generate the scaled formulae, we first check that the
1940 // scaling factor is profitable before computing the actual ScaledReg for
1941 // compile time sake.
1942 assert((F.isCanonical(L) || F.Scale != 0));
1943 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1944 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1945}
1946
1947/// Test whether we know how to expand the current formula.
1948static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1949 Immediate MaxOffset, LSRUse::KindType Kind,
1950 MemAccessTy AccessTy, GlobalValue *BaseGV,
1951 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1952 // We know how to expand completely foldable formulae.
1953 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1954 BaseOffset, HasBaseReg, Scale) ||
1955 // Or formulae that use a base register produced by a sum of base
1956 // registers.
1957 (Scale == 1 &&
1958 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1959 BaseGV, BaseOffset, true, 0));
1960}
1961
1962static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1963 Immediate MaxOffset, LSRUse::KindType Kind,
1964 MemAccessTy AccessTy, const Formula &F) {
1965 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1966 F.BaseOffset, F.HasBaseReg, F.Scale);
1967}
1968
// NOTE(review): the line that opens this definition (listing line 1969) is
// missing from this extract, so the function's name and return type are not
// visible here. The body uses a `TTI` object, presumably the first parameter
// -- confirm against upstream.
//
// Asks the target whether Offset is a legal add immediate, using the
// scalable-immediate hook for scalable offsets and the plain hook otherwise.
1970 Immediate Offset) {
1971 if (Offset.isScalable())
1972 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1973
1974 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1975}
1976
// NOTE(review): the line that opens this definition (listing line 1977) is
// missing from this extract; presumably `static bool
// isAMCompletelyFolded(const TargetTransformInfo &TTI,` -- confirm against
// upstream.
//
// Use-level form of the check for formula F on use LU. When the target asks
// for per-instruction queries, every fixup of an Address use is tested
// individually; otherwise the use's MinOffset/MaxOffset range is tested once.
1978 const LSRUse &LU, const Formula &F) {
1979 // Target may want to look at the user instructions.
1980 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
// Each fixup contributes its own offset and its user instruction; a single
// rejected fixup rejects the whole formula.
1981 for (const LSRFixup &Fixup : LU.Fixups)
1982 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1983 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1984 F.Scale, Fixup.UserInst))
1985 return false;
1986 return true;
1987 }
1988
1989 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1990 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1991 F.Scale);
1992}
1993
// NOTE(review): the line that opens this definition (listing line 1994) is
// missing from this extract, so the function's name and return type are not
// visible here. The Address case returns an InstructionCost; the body uses a
// `TTI` object, presumably the first parameter -- confirm against upstream.
//
// Computes the cost of the scaled register in formula F for use LU. A formula
// without a scale costs 0.
1995 const LSRUse &LU, const Formula &F,
1996 const Loop &L) {
1997 if (!F.Scale)
1998 return 0;
1999
2000 // If the use is not completely folded in that instruction, we will have to
2001 // pay an extra cost only for scale != 1.
2002 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
2003 LU.AccessTy, F, L))
2004 return F.Scale != 1;
2005
2006 switch (LU.Kind) {
2007 case LSRUse::Address: {
2008 // Check the scaling factor cost with both the min and max offsets.
// Only the pair matching the kind of F.BaseOffset (scalable or fixed) is
// filled in; the other pair stays zero.
2009 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
2010 if (F.BaseOffset.isScalable()) {
2011 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
2012 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
2013 } else {
2014 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
2015 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
2016 }
2017 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
2018 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
2019 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
2020 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
2021 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
2022 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
2023
2024 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
2025 "Legal addressing mode has an illegal cost!");
// Report the more expensive of the two range extremes.
2026 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
2027 }
2028 case LSRUse::ICmpZero:
2029 case LSRUse::Basic:
2030 case LSRUse::Special:
2031 // The use is completely folded, i.e., everything is folded into the
2032 // instruction.
2033 return 0;
2034 }
2035
2036 llvm_unreachable("Invalid LSRUse Kind!");
2037}
2038
// NOTE(review): the line that opens this definition (listing line 2039) is
// missing from this extract, so the function's name and return type are not
// visible here. The body uses a `TTI` object, presumably the first parameter
// -- confirm against upstream.
//
// Checks whether BaseGV + BaseOffset can be folded into a use of the given
// Kind by probing the per-kind isAMCompletelyFolded check with a
// conservatively chosen base register and scale.
2040 LSRUse::KindType Kind, MemAccessTy AccessTy,
2041 GlobalValue *BaseGV, Immediate BaseOffset,
2042 bool HasBaseReg) {
2043 // Fast-path: zero is always foldable.
2044 if (BaseOffset.isZero() && !BaseGV)
2045 return true;
2046
2047 // Conservatively, create an address with an immediate and a
2048 // base and a scale.
// ICmpZero uses are probed with a scale of -1, every other kind with 1.
2049 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2050
2051 // Canonicalize a scale of 1 to a base register if the formula doesn't
2052 // already have a base register.
2053 if (!HasBaseReg && Scale == 1) {
2054 Scale = 0;
2055 HasBaseReg = true;
2056 }
2057
2058 // FIXME: Try with + without a scale? Maybe based on TTI?
2059 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2060 // default for many architectures, not just AArch64 SVE. More investigation
2061 // needed later to determine if this should be used more widely than just
2062 // on scalable types.
// DropScaledForVScale is defined outside this extract -- presumably a
// command-line option; confirm. When set, scalable memory types are probed
// without the scaled register.
2063 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2064 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2065 Scale = 0;
2066
2067 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2068 HasBaseReg, Scale);
2069}
2070
// NOTE(review): the line that opens this definition (listing line 2071) is
// missing from this extract, so the function's name and return type are not
// visible here. The body uses a `TTI` object, presumably the first parameter
// -- confirm against upstream.
//
// SCEV form of the foldability check: S is foldable only if, after
// ExtractImmediate and ExtractSymbol have taken their parts out of a copy of
// it, what remains is zero and the extracted parts pass the range check.
2072 ScalarEvolution &SE, Immediate MinOffset,
2073 Immediate MaxOffset, LSRUse::KindType Kind,
2074 MemAccessTy AccessTy, const SCEV *S,
2075 bool HasBaseReg) {
2076 // Fast-path: zero is always foldable.
2077 if (S->isZero()) return true;
2078
2079 // Conservatively, create an address with an immediate and a
2080 // base and a scale.
// Work on a copy; the two Extract* helpers receive SCopy itself and the
// result is inspected afterwards, so S is left untouched.
2081 SCEVUse SCopy = S;
2082 Immediate BaseOffset = ExtractImmediate(SCopy, SE);
2083 GlobalValue *BaseGV = ExtractSymbol(SCopy, SE);
2084
2085 // If there's anything else involved, it's not foldable.
2086 if (!SCopy->isZero())
2087 return false;
2088
2089 // Fast-path: zero is always foldable.
2090 if (BaseOffset.isZero() && !BaseGV)
2091 return true;
2092
// Scalable immediates are rejected outright on this path.
2093 if (BaseOffset.isScalable())
2094 return false;
2095
2096 // Conservatively, create an address with an immediate and a
2097 // base and a scale.
2098 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2099
2100 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2101 BaseOffset, HasBaseReg, Scale);
2102}
2103
2104namespace {
2105
2106/// An individual increment in a Chain of IV increments. Relate an IV user to
2107/// an expression that computes the IV it uses from the IV used by the previous
2108/// link in the Chain.
2109///
2110/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2111/// original IVOperand. The head of the chain's IVOperand is only valid during
2112/// chain collection, before LSR replaces IV users. During chain generation,
2113/// IncExpr can be used to find the new IVOperand that computes the same
2114/// expression.
2115struct IVInc {
2116 Instruction *UserInst;
2117 Value* IVOperand;
2118 const SCEV *IncExpr;
2119
2120 IVInc(Instruction *U, Value *O, const SCEV *E)
2121 : UserInst(U), IVOperand(O), IncExpr(E) {}
2122};
2123
2124// The list of IV increments in program order. We typically add the head of a
2125// chain without finding subsequent links.
2126struct IVChain {
// NOTE(review): listing line 2127 is missing from this extract. The `Incs`
// container that every member below uses is presumably declared there --
// confirm against upstream.
2128 const SCEV *ExprBase = nullptr;
2129
2130 IVChain() = default;
// Start a chain whose only element is Head, recording Base as ExprBase.
2131 IVChain(const IVInc &Head, const SCEV *Base)
2132 : Incs(1, Head), ExprBase(Base) {}
2133
2134 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2135
2136 // Return the first increment in the chain.
// Element 0 is the chain head placed there by the constructor above, so
// iteration over the increments starts one past it.
2137 const_iterator begin() const {
2138 assert(!Incs.empty());
2139 return std::next(Incs.begin());
2140 }
2141 const_iterator end() const {
2142 return Incs.end();
2143 }
2144
2145 // Returns true if this chain contains any increments.
// A chain holding only its head (size 1) has no increments yet.
2146 bool hasIncs() const { return Incs.size() >= 2; }
2147
2148 // Add an IVInc to the end of this chain.
2149 void add(const IVInc &X) { Incs.push_back(X); }
2150
2151 // Returns the last UserInst in the chain.
2152 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2153
2154 // Returns true if IncExpr can be profitably added to this chain.
// Declared here only; the definition is outside this extract.
2155 bool isProfitableIncrement(const SCEV *OperExpr,
2156 const SCEV *IncExpr,
2157 ScalarEvolution&);
2158};
2159
2160/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2161/// between FarUsers that definitely cross IV increments and NearUsers that may
2162/// be used between IV increments.
2163struct ChainUsers {
2164 SmallPtrSet<Instruction*, 4> FarUsers;
2165 SmallPtrSet<Instruction*, 4> NearUsers;
2166};
2167
2168/// This class holds state for the main loop strength reduction logic.
// NOTE(review): this declaration comes from a listing in which several lines
// were lost (listing lines 2179, 2206, 2217, 2311 and 2319). Each gap is
// marked below; confirm the missing text against upstream.
2169class LSRInstance {
// Analyses and context objects this instance works with, and the loop (L)
// being processed.
2170 IVUsers &IU;
2171 ScalarEvolution &SE;
2172 DominatorTree &DT;
2173 LoopInfo &LI;
2174 AssumptionCache &AC;
2175 TargetLibraryInfo &TLI;
2176 const TargetTransformInfo &TTI;
2177 Loop *const L;
2178 MemorySSAUpdater *MSSAU;
// NOTE(review): listing line 2179 is missing here -- presumably one more data
// member; confirm against upstream.
// Rewriter is mutable so that const member functions can use it.
2180 mutable SCEVExpander Rewriter;
2181 bool Changed = false;
2182 bool HardwareLoopProfitable = false;
2183
2184 /// This is the insert position that the current loop's induction variable
2185 /// increment should be placed. In simple loops, this is the latch block's
2186 /// terminator. But in more complicated cases, this is a position which will
2187 /// dominate all the in-loop post-increment users.
2188 Instruction *IVIncInsertPos = nullptr;
2189
2190 /// Interesting factors between use strides.
2191 ///
2192 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2193 /// default, a SmallDenseSet, because we need to use the full range of
2194 /// int64_ts, and there's currently no good way of doing that with
2195 /// SmallDenseSet.
2196 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2197
2198 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2199 /// the solution is not profitable.
2200 Cost BaselineCost;
2201
2202 /// Interesting use types, to facilitate truncation reuse.
2203 SmallSetVector<Type *, 4> Types;
2204
2205 /// The list of interesting uses.
// NOTE(review): listing line 2206 is missing here -- the member that the doc
// comment above describes is not visible; confirm against upstream.
2207
2208 /// Track which uses use which register candidates.
2209 RegUseTracker RegUses;
2210
2211 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2212 // have more than a few IV increment chains in a loop. Missing a Chain falls
2213 // back to normal LSR behavior for those uses.
2214 static const unsigned MaxChains = 8;
2215
2216 /// IV users can form a chain of IV increments.
// NOTE(review): listing line 2217 is missing here -- the member that the doc
// comment above describes is not visible; confirm against upstream.
2218
2219 /// IV users that belong to profitable IVChains.
2220 SmallPtrSet<Use*, MaxChains> IVIncSet;
2221
2222 /// Induction variables that were generated and inserted by the SCEV Expander.
2223 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2224
2225 // Inserting instructions in the loop and using them as PHI's input could
2226 // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
2227 // corresponding incoming block is not loop exiting). So collect all such
2228 // instructions to form LCSSA for them later.
2229 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2230
// --- Early IV and loop-termination-condition optimizations ---
2231 void OptimizeShadowIV();
2232 bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2233 Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
2234 void OptimizeLoopTermCond();
2235
// --- IV increment chains ---
2236 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2237 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2238 void FinalizeChain(IVChain &Chain);
2239 void CollectChains();
2240 void GenerateIVChain(const IVChain &Chain,
2241 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2242
// --- Collection of uses, fixups and initial formulae ---
2243 void CollectInterestingTypesAndFactors();
2244 void CollectFixupsAndInitialFormulae();
2245
2246 // Support for sharing of LSRUses between LSRFixups.
2247 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2248 UseMapTy UseMap;
2249
2250 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2251 LSRUse::KindType Kind, MemAccessTy AccessTy);
2252
2253 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2254 MemAccessTy AccessTy);
2255
2256 void DeleteUse(LSRUse &LU, size_t LUIdx);
2257
2258 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2259
2260 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2261 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2262 void CountRegisters(const Formula &F, size_t LUIdx);
2263 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2264 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2265
2266 void CollectLoopInvariantFixupsAndFormulae();
2267
// --- Formula generation ---
2268 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2269 unsigned Depth = 0);
2270
2271 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2272 const Formula &Base, unsigned Depth,
2273 size_t Idx, bool IsScaledReg = false);
2274 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2275 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2276 const Formula &Base, size_t Idx,
2277 bool IsScaledReg = false);
2278 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2279 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2280 const Formula &Base,
2281 const SmallVectorImpl<Immediate> &Worklist,
2282 size_t Idx, bool IsScaledReg = false);
2283 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2284 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2285 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2286 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2287 void GenerateCrossUseConstantOffsets();
2288 void GenerateAllReuseFormulae();
2289
2290 void FilterOutUndesirableDedicatedRegisters();
2291
// --- Search-space narrowing ---
2292 size_t EstimateSearchSpaceComplexity() const;
2293 void NarrowSearchSpaceByDetectingSupersets();
2294 void NarrowSearchSpaceByCollapsingUnrolledCode();
2295 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2296 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2297 void NarrowSearchSpaceByFilterPostInc();
2298 void NarrowSearchSpaceByMergingUsesOutsideLoop();
2299 void NarrowSearchSpaceByDeletingCostlyFormulas();
2300 void NarrowSearchSpaceByPickingWinnerRegs();
2301 void NarrowSearchSpaceUsingHeuristics();
2302
// --- Solving ---
2303 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2304 Cost &SolutionCost,
2305 SmallVectorImpl<const Formula *> &Workspace,
2306 const Cost &CurCost,
2307 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2308 DenseSet<const SCEV *> &VisitedRegs) const;
2309 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2310
// --- Expansion and rewriting ---
// NOTE(review): listing line 2311 is missing here -- presumably the return
// type of HoistInsertPosition; confirm against upstream.
2312 HoistInsertPosition(BasicBlock::iterator IP,
2313 const SmallVectorImpl<Instruction *> &Inputs) const;
2314 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2315 const LSRFixup &LF,
2316 const LSRUse &LU) const;
2317
// NOTE(review): listing line 2319 is missing from the Expand declaration
// below -- presumably one more parameter; confirm against upstream.
2318 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2320 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2321 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2322 const Formula &F,
2323 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2324 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2325 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2326 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2327
2328public:
2329 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2330 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2331 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2332
// Reports the Changed flag.
2333 bool getChanged() const { return Changed; }
2334 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2335 return ScalarEvolutionIVs;
2336 }
2337
// Printing and debugging helpers.
2338 void print_factors_and_types(raw_ostream &OS) const;
2339 void print_fixups(raw_ostream &OS) const;
2340 void print_uses(raw_ostream &OS) const;
2341 void print(raw_ostream &OS) const;
2342 void dump() const;
2343};
2344
2345} // end anonymous namespace
2346
2347/// If IV is used in an int-to-float cast inside the loop then try to eliminate
2348/// the cast operation.
// NOTE(review): three lines of this function were lost from the listing
// (listing lines 2422, 2431 and 2433). Each gap is marked below; confirm the
// missing text against upstream.
2349void LSRInstance::OptimizeShadowIV() {
// Nothing is attempted when the backedge-taken count cannot be computed.
2350 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2351 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2352 return;
2353
// UI is advanced before the candidate is examined, so the loop iterator never
// refers to the user that may be erased at the bottom of the body.
2354 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2355 UI != E; /* empty */) {
2356 IVUsers::const_iterator CandidateUI = UI;
2357 ++UI;
2358 Instruction *ShadowUse = CandidateUI->getUser();
2359 Type *DestTy = nullptr;
2360 bool IsSigned = false;
2361
2362 /* If shadow use is an int->float cast then insert a second IV
2363 to eliminate this cast.
2364
2365 for (unsigned i = 0; i < n; ++i)
2366 foo((double)i);
2367
2368 is transformed into
2369
2370 double d = 0.0;
2371 for (unsigned i = 0; i < n; ++i, ++d)
2372 foo(d);
2373 */
2374 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2375 IsSigned = false;
2376 DestTy = UCast->getDestTy();
2377 }
2378 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2379 IsSigned = true;
2380 DestTy = SCast->getDestTy();
2381 }
// DestTy stays null when the user is neither kind of int-to-FP cast.
2382 if (!DestTy) continue;
2383
2384 // If target does not support DestTy natively then do not apply
2385 // this transformation.
2386 if (!TTI.isTypeLegal(DestTy)) continue;
2387
// The cast's operand must be a PHI with exactly two incoming values.
2388 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2389 if (!PH) continue;
2390 if (PH->getNumIncomingValues() != 2) continue;
2391
2392 // If the calculation in integers overflows, the result in FP type will
2393 // differ. So we only can do this transformation if we are guaranteed to not
2394 // deal with overflowing values
2395 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2396 if (!AR) continue;
2397 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2398 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2399
// Skip when the integer type is wider than the FP mantissa, or when the
// mantissa width is reported as -1.
2400 Type *SrcTy = PH->getType();
2401 int Mantissa = DestTy->getFPMantissaWidth();
2402 if (Mantissa == -1) continue;
2403 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2404 continue;
2405
// Work out which incoming edge comes from the preheader (Entry) and treat
// the other one as the latch edge.
2406 unsigned Entry, Latch;
2407 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2408 Entry = 0;
2409 Latch = 1;
2410 } else {
2411 Entry = 1;
2412 Latch = 0;
2413 }
2414
// The start value must be a constant integer; it is converted to the FP type
// using the signedness of the cast.
2415 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2416 if (!Init) continue;
2417 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2418 (double)Init->getSExtValue() :
2419 (double)Init->getZExtValue());
2420
2421 BinaryOperator *Incr =
// NOTE(review): listing line 2422 (the initializer of Incr) is missing here
// -- confirm against upstream.
2423 if (!Incr) continue;
2424 if (Incr->getOpcode() != Instruction::Add
2425 && Incr->getOpcode() != Instruction::Sub)
2426 continue;
2427
2428 /* Initialize new IV, double d = 0.0 in above example. */
2429 ConstantInt *C = nullptr;
// NOTE(review): listing lines 2431 and 2433 (the bodies of the two branches
// below, which presumably assign C) are missing -- confirm against upstream.
2430 if (Incr->getOperand(0) == PH)
2432 else if (Incr->getOperand(1) == PH)
2434 else
2435 continue;
2436
2437 if (!C) continue;
2438
2439 // Ignore negative constants, as the code below doesn't handle them
2440 // correctly. TODO: Remove this restriction.
2441 if (!C->getValue().isStrictlyPositive())
2442 continue;
2443
2444 /* Add new PHINode. */
2445 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2446 NewPH->setDebugLoc(PH->getDebugLoc());
2447
2448 /* create new increment. '++d' in above example. */
2449 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2450 BinaryOperator *NewIncr = BinaryOperator::Create(
2451 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2452 : Instruction::FSub,
2453 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2454 NewIncr->setDebugLoc(Incr->getDebugLoc());
2455
2456 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2457 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2458
2459 /* Remove cast operation */
2460 ShadowUse->replaceAllUsesWith(NewPH);
2461 ShadowUse->eraseFromParent();
2462 Changed = true;
// Only one cast is rewritten per call: the loop stops after the first
// successful transformation.
2463 break;
2464 }
2465}
2466
2467/// If Cond has an operand that is an expression of an IV, set the IV user and
2468/// stride information and return true, otherwise return false.
2469bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2470 for (IVStrideUse &U : IU)
2471 if (U.getUser() == Cond) {
2472 // NOTE: we could handle setcc instructions with multiple uses here, but
2473 // InstCombine does it as well for simple uses, it's not clear that it
2474 // occurs enough in real life to handle.
2475 CondUse = &U;
2476 return true;
2477 }
2478 return false;
2479}
2480
2481/// Rewrite the loop's terminating condition if it uses a max computation.
2482///
2483/// This is a narrow solution to a specific, but acute, problem. For loops
2484/// like this:
2485///
2486/// i = 0;
2487/// do {
2488/// p[i] = 0.0;
2489/// } while (++i < n);
2490///
2491/// the trip count isn't just 'n', because 'n' might not be positive. And
2492/// unfortunately this can come up even for loops where the user didn't use
2493/// a C do-while loop. For example, seemingly well-behaved top-test loops
2494/// will commonly be lowered like this:
2495///
2496/// if (n > 0) {
2497/// i = 0;
2498/// do {
2499/// p[i] = 0.0;
2500/// } while (++i < n);
2501/// }
2502///
2503/// and then it's possible for subsequent optimization to obscure the if
2504/// test in such a way that indvars can't find it.
2505///
2506/// When indvars can't find the if test in loops like this, it creates a
2507/// max expression, which allows it to give the loop a canonical
2508/// induction variable:
2509///
2510/// i = 0;
2511/// max = n < 1 ? 1 : n;
2512/// do {
2513/// p[i] = 0.0;
2514/// } while (++i != max);
2515///
2516/// Canonical induction variables are necessary because the loop passes
2517/// are designed around them. The most obvious example of this is the
2518/// LoopInfo analysis, which doesn't remember trip count values. It
2519/// expects to be able to rediscover the trip count each time it is
2520/// needed, and it does this using a simple analysis that only succeeds if
2521/// the loop has a canonical induction variable.
2522///
2523/// However, when it comes time to generate code, the maximum operation
2524/// can be quite costly, especially if it's inside of an outer loop.
2525///
2526/// This function solves this problem by detecting this type of loop and
2527/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2528/// the instructions for the maximum computation.
// Returns Cond unchanged whenever the pattern does not match. On success the
// replacement compare is returned and CondUse is repointed at it.
// NOTE(review): two lines of this function were lost from the listing
// (listing lines 2584 and 2629). Each gap is marked below; confirm the
// missing text against upstream.
2529Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
2530 // Check that the loop matches the pattern we're looking for.
2531 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2532 Cond->getPredicate() != CmpInst::ICMP_NE)
2533 return Cond;
2534
// The compare's right-hand side must be a select whose only use is this
// compare.
2535 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2536 if (!Sel || !Sel->hasOneUse()) return Cond;
2537
2538 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2539 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2540 return Cond;
2541 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2542
2543 // Add one to the backedge-taken count to get the trip count.
2544 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
// The select must be exactly the trip count.
2545 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2546
2547 // Check for a max calculation that matches the pattern. There's no check
2548 // for ICMP_ULE here because the comparison would be with zero, which
2549 // isn't interesting.
2550 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2551 const SCEVNAryExpr *Max = nullptr;
2552 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2553 Pred = ICmpInst::ICMP_SLE;
2554 Max = S;
2555 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2556 Pred = ICmpInst::ICMP_SLT;
2557 Max = S;
2558 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2559 Pred = ICmpInst::ICMP_ULT;
2560 Max = U;
2561 } else {
2562 // No match; bail.
2563 return Cond;
2564 }
2565
2566 // To handle a max with more than two operands, this optimization would
2567 // require additional checking and setup.
2568 if (Max->getNumOperands() != 2)
2569 return Cond;
2570
2571 const SCEV *MaxLHS = Max->getOperand(0);
2572 const SCEV *MaxRHS = Max->getOperand(1);
2573
2574 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2575 // for a comparison with 1. For <= and >=, a comparison with zero.
2576 if (!MaxLHS ||
2577 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2578 return Cond;
2579
2580 // Check the relevant induction variable for conformance to
2581 // the pattern.
2582 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2583 if (!match(IV,
// NOTE(review): listing line 2584 (the pattern argument that completes this
// match call) is missing here -- confirm against upstream.
2585 return Cond;
2586
2587 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2588 "Loop condition operand is an addrec in a different loop!");
2589
2590 // Check the right operand of the select, and remember it, as it will
2591 // be used in the new comparison instruction.
2592 Value *NewRHS = nullptr;
2593 if (ICmpInst::isTrueWhenEqual(Pred)) {
2594 // Look for n+1, and grab n.
// Both select arms are checked; if both match, the second assignment wins.
2595 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2596 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2597 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2598 NewRHS = BO->getOperand(0);
2599 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2600 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2601 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2602 NewRHS = BO->getOperand(0);
2603 if (!NewRHS)
2604 return Cond;
2605 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2606 NewRHS = Sel->getOperand(1);
2607 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2608 NewRHS = Sel->getOperand(2);
2609 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2610 NewRHS = SU->getValue();
2611 else
2612 // Max doesn't match expected pattern.
2613 return Cond;
2614
2615 // Determine the new comparison opcode. It may be signed or unsigned,
2616 // and the original comparison may be either equality or inequality.
2617 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2618 Pred = CmpInst::getInversePredicate(Pred);
2619
2620 // Ok, everything looks ok to change the condition into an SLT or SGE and
2621 // delete the max calculation.
2622 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2623 Cond->getOperand(0), NewRHS, "scmp");
2624
2625 // Delete the max calculation instructions.
2626 NewCond->setDebugLoc(Cond->getDebugLoc());
2627 Cond->replaceAllUsesWith(NewCond);
2628 CondUse->setUser(NewCond);
// NOTE(review): listing line 2629 is missing here; `Cmp`, used below, is
// presumably declared on it -- confirm against upstream.
2630 Cond->eraseFromParent();
2631 Sel->eraseFromParent();
// Cmp is erased only once nothing uses it any more.
2632 if (Cmp->use_empty()) {
2633 salvageDebugInfo(*Cmp);
2634 Cmp->eraseFromParent();
2635 }
2636 return NewCond;
2637}
2638
2639/// Change loop terminating condition to use the postinc iv when possible.
2640void
2641LSRInstance::OptimizeLoopTermCond() {
2642 SmallPtrSet<Instruction *, 4> PostIncs;
2643
2644 // We need a different set of heuristics for rotated and non-rotated loops.
2645 // If a loop is rotated then the latch is also the backedge, so inserting
2646 // post-inc expressions just before the latch is ideal. To reduce live ranges
2647 // it also makes sense to rewrite terminating conditions to use post-inc
2648 // expressions.
2649 //
2650 // If the loop is not rotated then the latch is not a backedge; the latch
2651 // check is done in the loop head. Adding post-inc expressions before the
2652 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2653 // in the loop body. In this case we do *not* want to use post-inc expressions
2654 // in the latch check, and we want to insert post-inc expressions before
2655 // the backedge.
2656 BasicBlock *LatchBlock = L->getLoopLatch();
2657 SmallVector<BasicBlock*, 8> ExitingBlocks;
2658 L->getExitingBlocks(ExitingBlocks);
2659 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2660 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2661 IVIncInsertPos = LatchBlock->getTerminator();
2662 return;
2663 }
2664
2665 // Otherwise treat this as a rotated loop.
2666 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2667 // Get the terminating condition for the loop if possible. If we
2668 // can, we want to change it to use a post-incremented version of its
2669 // induction variable, to allow coalescing the live ranges for the IV into
2670 // one register value.
2671
2672 CondBrInst *TermBr = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
2673 if (!TermBr)
2674 continue;
2675
2677 // If the argument to TermBr is an extractelement, then the source of that
2678 // instruction is what's generated the condition.
2680 if (Extract)
2681 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2682 // FIXME: We could do more here, like handling logical operations where one
2683 // side is a cmp that uses an induction variable.
2684 if (!Cond)
2685 continue;
2686
2687 // Search IVUsesByStride to find Cond's IVUse if there is one.
2688 IVStrideUse *CondUse = nullptr;
2689 if (!FindIVUserForCond(Cond, CondUse))
2690 continue;
2691
2692 // If the trip count is computed in terms of a max (due to ScalarEvolution
2693 // being unable to find a sufficient guard, for example), change the loop
2694 // comparison to use SLT or ULT instead of NE.
2695 // One consequence of doing this now is that it disrupts the count-down
2696 // optimization. That's not always a bad thing though, because in such
2697 // cases it may still be worthwhile to avoid a max.
2698 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2699 Cond = OptimizeMax(Cmp, CondUse);
2700
2701 // If this exiting block dominates the latch block, it may also use
2702 // the post-inc value if it won't be shared with other uses.
2703 // Check for dominance.
2704 if (!DT.dominates(ExitingBlock, LatchBlock))
2705 continue;
2706
2707 // Conservatively avoid trying to use the post-inc value in non-latch
2708 // exits if there may be pre-inc users in intervening blocks.
2709 if (LatchBlock != ExitingBlock)
2710 for (const IVStrideUse &UI : IU)
2711 // Test if the use is reachable from the exiting block. This dominator
2712 // query is a conservative approximation of reachability.
2713 if (&UI != CondUse &&
2714 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2715 // Conservatively assume there may be reuse if the quotient of their
2716 // strides could be a legal scale.
2717 const SCEV *A = IU.getStride(*CondUse, L);
2718 const SCEV *B = IU.getStride(UI, L);
2719 if (!A || !B) continue;
2720 if (SE.getTypeSizeInBits(A->getType()) !=
2721 SE.getTypeSizeInBits(B->getType())) {
2722 if (SE.getTypeSizeInBits(A->getType()) >
2723 SE.getTypeSizeInBits(B->getType()))
2724 B = SE.getSignExtendExpr(B, A->getType());
2725 else
2726 A = SE.getSignExtendExpr(A, B->getType());
2727 }
2728 if (const SCEVConstant *D =
2730 const ConstantInt *C = D->getValue();
2731 // Stride of one or negative one can have reuse with non-addresses.
2732 if (C->isOne() || C->isMinusOne())
2733 goto decline_post_inc;
2734 // Avoid weird situations.
2735 if (C->getValue().getSignificantBits() >= 64 ||
2736 C->getValue().isMinSignedValue())
2737 goto decline_post_inc;
2738 // Check for possible scaled-address reuse.
2739 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2740 MemAccessTy AccessTy =
2741 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2742 int64_t Scale = C->getSExtValue();
2743 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2744 /*BaseOffset=*/0,
2745 /*HasBaseReg=*/true, Scale,
2746 AccessTy.AddrSpace))
2747 goto decline_post_inc;
2748 Scale = -Scale;
2749 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2750 /*BaseOffset=*/0,
2751 /*HasBaseReg=*/true, Scale,
2752 AccessTy.AddrSpace))
2753 goto decline_post_inc;
2754 }
2755 }
2756 }
2757
2758 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2759 << *Cond << '\n');
2760
2761 // It's possible for the setcc instruction to be anywhere in the loop, and
2762 // possible for it to have multiple users. If it is not immediately before
2763 // the exiting block branch, move it.
2764 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2765 !Extract) {
2766 if (Cond->hasOneUse()) {
2767 Cond->moveBefore(TermBr->getIterator());
2768 } else {
2769 // Clone the terminating condition and insert into the loopend.
2770 Instruction *OldCond = Cond;
2771 Cond = Cond->clone();
2772 Cond->setName(L->getHeader()->getName() + ".termcond");
2773 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2774
2775 // Clone the IVUse, as the old use still exists!
2776 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2777 TermBr->replaceUsesOfWith(OldCond, Cond);
2778 }
2779 }
2780
2781 // If we get to here, we know that we can transform the setcc instruction to
2782 // use the post-incremented version of the IV, allowing us to coalesce the
2783 // live ranges for the IV correctly.
2784 CondUse->transformToPostInc(L);
2785 Changed = true;
2786
2787 PostIncs.insert(Cond);
2788 decline_post_inc:;
2789 }
2790
2791 // Determine an insertion point for the loop induction variable increment. It
2792 // must dominate all the post-inc comparisons we just set up, and it must
2793 // dominate the loop latch edge.
2794 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2795 for (Instruction *Inst : PostIncs)
2796 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2797}
2798
2799/// Determine if the given use can accommodate a fixup at the given offset and
2800/// other details. If so, update the use and return true.
2801bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2802 bool HasBaseReg, LSRUse::KindType Kind,
2803 MemAccessTy AccessTy) {
2804 Immediate NewMinOffset = LU.MinOffset;
2805 Immediate NewMaxOffset = LU.MaxOffset;
2806 MemAccessTy NewAccessTy = AccessTy;
2807
2808 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2809 // something conservative, however this can pessimize in the case that one of
2810 // the uses will have all its uses outside the loop, for example.
2811 if (LU.Kind != Kind)
2812 return false;
2813
2814 // Check for a mismatched access type, and fall back conservatively as needed.
2815 // TODO: Be less conservative when the type is similar and can use the same
2816 // addressing modes.
2817 if (Kind == LSRUse::Address) {
2818 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2819 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2820 AccessTy.AddrSpace);
2821 }
2822 }
2823
2824 // Conservatively assume HasBaseReg is true for now.
2825 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2826 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2827 LU.MaxOffset - NewOffset, HasBaseReg))
2828 return false;
2829 NewMinOffset = NewOffset;
2830 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2831 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2832 NewOffset - LU.MinOffset, HasBaseReg))
2833 return false;
2834 NewMaxOffset = NewOffset;
2835 }
2836
2837 // FIXME: We should be able to handle some level of scalable offset support
2838 // for 'void', but in order to get basic support up and running this is
2839 // being left out.
2840 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2841 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2842 return false;
2843
2844 // Update the use.
2845 LU.MinOffset = NewMinOffset;
2846 LU.MaxOffset = NewMaxOffset;
2847 LU.AccessTy = NewAccessTy;
2848 return true;
2849}
2850
2851/// Return an LSRUse index and an offset value for a fixup which needs the given
2852/// expression, with the given kind and optional access type. Either reuse an
2853/// existing use or create a new one, as needed.
2854std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2855 LSRUse::KindType Kind,
2856 MemAccessTy AccessTy) {
2857 const SCEV *Copy = Expr;
2858 SCEVUse ExprUse = Expr;
2859 Immediate Offset = ExtractImmediate(
2860 ExprUse, SE, AccessTy.MemTy && AccessTy.MemTy->isScalableTy());
2861 Expr = ExprUse;
2862
2863 // Basic uses can't accept any offset, for example.
2864 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2865 Offset, /*HasBaseReg=*/ true)) {
2866 Expr = Copy;
2867 Offset = Immediate::getFixed(0);
2868 }
2869
2870 std::pair<UseMapTy::iterator, bool> P =
2871 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2872 if (!P.second) {
2873 // A use already existed with this base.
2874 size_t LUIdx = P.first->second;
2875 LSRUse &LU = Uses[LUIdx];
2876 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2877 // Reuse this use.
2878 return std::make_pair(LUIdx, Offset);
2879 }
2880
2881 // Create a new use.
2882 size_t LUIdx = Uses.size();
2883 P.first->second = LUIdx;
2884 Uses.push_back(LSRUse(Kind, AccessTy));
2885 LSRUse &LU = Uses[LUIdx];
2886
2887 LU.MinOffset = Offset;
2888 LU.MaxOffset = Offset;
2889 return std::make_pair(LUIdx, Offset);
2890}
2891
2892/// Delete the given use from the Uses list.
2893void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2894 if (&LU != &Uses.back())
2895 std::swap(LU, Uses.back());
2896 Uses.pop_back();
2897
2898 // Update RegUses.
2899 RegUses.swapAndDropUse(LUIdx, Uses.size());
2900}
2901
2902/// Look for a use distinct from OrigLU which is has a formula that has the same
2903/// registers as the given formula.
2904LSRUse *
2905LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2906 const LSRUse &OrigLU) {
2907 // Search all uses for the formula. This could be more clever.
2908 for (LSRUse &LU : Uses) {
2909 // Check whether this use is close enough to OrigLU, to see whether it's
2910 // worthwhile looking through its formulae.
2911 // Ignore ICmpZero uses because they may contain formulae generated by
2912 // GenerateICmpZeroScales, in which case adding fixup offsets may
2913 // be invalid.
2914 if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero &&
2915 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2916 LU.HasFormulaWithSameRegs(OrigF)) {
2917 // Scan through this use's formulae.
2918 for (const Formula &F : LU.Formulae) {
2919 // Check to see if this formula has the same registers and symbols
2920 // as OrigF.
2921 if (F.BaseRegs == OrigF.BaseRegs &&
2922 F.ScaledReg == OrigF.ScaledReg &&
2923 F.BaseGV == OrigF.BaseGV &&
2924 F.Scale == OrigF.Scale &&
2925 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2926 if (F.BaseOffset.isZero())
2927 return &LU;
2928 // This is the formula where all the registers and symbols matched;
2929 // there aren't going to be any others. Since we declined it, we
2930 // can skip the rest of the formulae and proceed to the next LSRUse.
2931 break;
2932 }
2933 }
2934 }
2935 }
2936
2937 // Nothing looked good.
2938 return nullptr;
2939}
2940
2941void LSRInstance::CollectInterestingTypesAndFactors() {
2942 SmallSetVector<const SCEV *, 4> Strides;
2943
2944 // Collect interesting types and strides.
2946 for (const IVStrideUse &U : IU) {
2947 const SCEV *Expr = IU.getExpr(U);
2948 if (!Expr)
2949 continue;
2950
2951 // Collect interesting types.
2952 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2953
2954 // Add strides for mentioned loops.
2955 Worklist.push_back(Expr);
2956 do {
2957 const SCEV *S = Worklist.pop_back_val();
2958 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2959 if (AR->getLoop() == L)
2960 Strides.insert(AR->getStepRecurrence(SE));
2961 Worklist.push_back(AR->getStart());
2962 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2963 append_range(Worklist, Add->operands());
2964 }
2965 } while (!Worklist.empty());
2966 }
2967
2968 // Compute interesting factors from the set of interesting strides.
2969 for (SmallSetVector<const SCEV *, 4>::const_iterator
2970 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2971 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2972 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2973 const SCEV *OldStride = *I;
2974 const SCEV *NewStride = *NewStrideIter;
2975
2976 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2977 SE.getTypeSizeInBits(NewStride->getType())) {
2978 if (SE.getTypeSizeInBits(OldStride->getType()) >
2979 SE.getTypeSizeInBits(NewStride->getType()))
2980 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2981 else
2982 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2983 }
2984 if (const SCEVConstant *Factor =
2985 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2986 SE, true))) {
2987 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2988 Factors.insert(Factor->getAPInt().getSExtValue());
2989 } else if (const SCEVConstant *Factor =
2991 NewStride,
2992 SE, true))) {
2993 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2994 Factors.insert(Factor->getAPInt().getSExtValue());
2995 }
2996 }
2997
2998 // If all uses use the same type, don't bother looking for truncation-based
2999 // reuse.
3000 if (Types.size() == 1)
3001 Types.clear();
3002
3003 LLVM_DEBUG(print_factors_and_types(dbgs()));
3004}
3005
3006/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
3007/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
3008/// IVStrideUses, we could partially skip this.
3009static User::op_iterator
3011 Loop *L, ScalarEvolution &SE) {
3012 for(; OI != OE; ++OI) {
3013 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
3014 if (!SE.isSCEVable(Oper->getType()))
3015 continue;
3016
3017 if (const SCEVAddRecExpr *AR =
3019 if (AR->getLoop() == L)
3020 break;
3021 }
3022 }
3023 }
3024 return OI;
3025}
3026
3027/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
3028/// a convenient helper.
3030 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
3031 return Trunc->getOperand(0);
3032 return Oper;
3033}
3034
3035/// Return an approximation of this SCEV expression's "base", or NULL for any
3036/// constant. Returning the expression itself is conservative. Returning a
3037/// deeper subexpression is more precise and valid as long as it isn't less
3038/// complex than another subexpression. For expressions involving multiple
3039/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3040/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3041/// IVInc==b-a.
3042///
3043/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3044/// SCEVUnknown, we simply return the rightmost SCEV operand.
3045static const SCEV *getExprBase(const SCEV *S) {
3046 switch (S->getSCEVType()) {
3047 default: // including scUnknown.
3048 return S;
3049 case scConstant:
3050 case scVScale:
3051 return nullptr;
3052 case scTruncate:
3053 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3054 case scZeroExtend:
3055 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3056 case scSignExtend:
3057 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3058 case scAddExpr: {
3059 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3060 // there's nothing more complex.
3061 // FIXME: not sure if we want to recognize negation.
3062 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3063 for (const SCEV *SubExpr : reverse(Add->operands())) {
3064 if (SubExpr->getSCEVType() == scAddExpr)
3065 return getExprBase(SubExpr);
3066
3067 if (SubExpr->getSCEVType() != scMulExpr)
3068 return SubExpr;
3069 }
3070 return S; // all operands are scaled, be conservative.
3071 }
3072 case scAddRecExpr:
3073 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3074 }
3075 llvm_unreachable("Unknown SCEV kind!");
3076}
3077
3078/// Return true if the chain increment is profitable to expand into a loop
3079/// invariant value, which may require its own register. A profitable chain
3080/// increment will be an offset relative to the same base. We allow such offsets
3081/// to potentially be used as chain increment as long as it's not obviously
3082/// expensive to expand using real instructions.
3083bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3084 const SCEV *IncExpr,
3085 ScalarEvolution &SE) {
3086 // Aggressively form chains when -stress-ivchain.
3087 if (StressIVChain)
3088 return true;
3089
3090 // Do not replace a constant offset from IV head with a nonconstant IV
3091 // increment.
3092 if (!isa<SCEVConstant>(IncExpr)) {
3093 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3094 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3095 return false;
3096 }
3097
3098 SmallPtrSet<const SCEV*, 8> Processed;
3099 return !isHighCostExpansion(IncExpr, Processed, SE);
3100}
3101
3102/// Return true if the number of registers needed for the chain is estimated to
3103/// be less than the number required for the individual IV users. First prohibit
3104/// any IV users that keep the IV live across increments (the Users set should
3105/// be empty). Next count the number and type of increments in the chain.
3106///
3107/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3108/// effectively use postinc addressing modes. Only consider it profitable it the
3109/// increments can be computed in fewer registers when chained.
3110///
3111/// TODO: Consider IVInc free if it's already used in another chains.
3112static bool isProfitableChain(IVChain &Chain,
3114 ScalarEvolution &SE,
3115 const TargetTransformInfo &TTI) {
3116 if (StressIVChain)
3117 return true;
3118
3119 if (!Chain.hasIncs())
3120 return false;
3121
3122 if (!Users.empty()) {
3123 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3124 for (Instruction *Inst
3125 : Users) { dbgs() << " " << *Inst << "\n"; });
3126 return false;
3127 }
3128 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3129
3130 // The chain itself may require a register, so initialize cost to 1.
3131 int cost = 1;
3132
3133 // A complete chain likely eliminates the need for keeping the original IV in
3134 // a register. LSR does not currently know how to form a complete chain unless
3135 // the header phi already exists.
3136 if (isa<PHINode>(Chain.tailUserInst())
3137 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3138 --cost;
3139 }
3140 const SCEV *LastIncExpr = nullptr;
3141 unsigned NumConstIncrements = 0;
3142 unsigned NumVarIncrements = 0;
3143 unsigned NumReusedIncrements = 0;
3144
3145 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3146 return true;
3147
3148 for (const IVInc &Inc : Chain) {
3149 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3150 return true;
3151 if (Inc.IncExpr->isZero())
3152 continue;
3153
3154 // Incrementing by zero or some constant is neutral. We assume constants can
3155 // be folded into an addressing mode or an add's immediate operand.
3156 if (isa<SCEVConstant>(Inc.IncExpr)) {
3157 ++NumConstIncrements;
3158 continue;
3159 }
3160
3161 if (Inc.IncExpr == LastIncExpr)
3162 ++NumReusedIncrements;
3163 else
3164 ++NumVarIncrements;
3165
3166 LastIncExpr = Inc.IncExpr;
3167 }
3168 // An IV chain with a single increment is handled by LSR's postinc
3169 // uses. However, a chain with multiple increments requires keeping the IV's
3170 // value live longer than it needs to be if chained.
3171 if (NumConstIncrements > 1)
3172 --cost;
3173
3174 // Materializing increment expressions in the preheader that didn't exist in
3175 // the original code may cost a register. For example, sign-extended array
3176 // indices can produce ridiculous increments like this:
3177 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3178 cost += NumVarIncrements;
3179
3180 // Reusing variable increments likely saves a register to hold the multiple of
3181 // the stride.
3182 cost -= NumReusedIncrements;
3183
3184 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3185 << "\n");
3186
3187 return cost < 0;
3188}
3189
3190/// Add this IV user to an existing chain or make it the head of a new chain.
3191void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3192 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3193 // When IVs are used as types of varying widths, they are generally converted
3194 // to a wider type with some uses remaining narrow under a (free) trunc.
3195 Value *const NextIV = getWideOperand(IVOper);
3196 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3197 const SCEV *const OperExprBase = getExprBase(OperExpr);
3198
3199 // Visit all existing chains. Check if its IVOper can be computed as a
3200 // profitable loop invariant increment from the last link in the Chain.
3201 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3202 const SCEV *LastIncExpr = nullptr;
3203 for (; ChainIdx < NChains; ++ChainIdx) {
3204 IVChain &Chain = IVChainVec[ChainIdx];
3205
3206 // Prune the solution space aggressively by checking that both IV operands
3207 // are expressions that operate on the same unscaled SCEVUnknown. This
3208 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3209 // first avoids creating extra SCEV expressions.
3210 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3211 continue;
3212
3213 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3214 if (PrevIV->getType() != NextIV->getType())
3215 continue;
3216
3217 // A phi node terminates a chain.
3218 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3219 continue;
3220
3221 // The increment must be loop-invariant so it can be kept in a register.
3222 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3223 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3224 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3225 continue;
3226
3227 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3228 LastIncExpr = IncExpr;
3229 break;
3230 }
3231 }
3232 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3233 // bother for phi nodes, because they must be last in the chain.
3234 if (ChainIdx == NChains) {
3235 if (isa<PHINode>(UserInst))
3236 return;
3237 if (NChains >= MaxChains && !StressIVChain) {
3238 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3239 return;
3240 }
3241 LastIncExpr = OperExpr;
3242 // IVUsers may have skipped over sign/zero extensions. We don't currently
3243 // attempt to form chains involving extensions unless they can be hoisted
3244 // into this loop's AddRec.
3245 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3246 return;
3247 ++NChains;
3248 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3249 OperExprBase));
3250 ChainUsersVec.resize(NChains);
3251 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3252 << ") IV=" << *LastIncExpr << "\n");
3253 } else {
3254 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3255 << ") IV+" << *LastIncExpr << "\n");
3256 // Add this IV user to the end of the chain.
3257 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3258 }
3259 IVChain &Chain = IVChainVec[ChainIdx];
3260
3261 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3262 // This chain's NearUsers become FarUsers.
3263 if (!LastIncExpr->isZero()) {
3264 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3265 NearUsers.clear();
3266 }
3267
3268 // All other uses of IVOperand become near uses of the chain.
3269 // We currently ignore intermediate values within SCEV expressions, assuming
3270 // they will eventually be used be the current chain, or can be computed
3271 // from one of the chain increments. To be more precise we could
3272 // transitively follow its user and only add leaf IV users to the set.
3273 for (User *U : IVOper->users()) {
3274 Instruction *OtherUse = dyn_cast<Instruction>(U);
3275 if (!OtherUse)
3276 continue;
3277 // Uses in the chain will no longer be uses if the chain is formed.
3278 // Include the head of the chain in this iteration (not Chain.begin()).
3279 IVChain::const_iterator IncIter = Chain.Incs.begin();
3280 IVChain::const_iterator IncEnd = Chain.Incs.end();
3281 for( ; IncIter != IncEnd; ++IncIter) {
3282 if (IncIter->UserInst == OtherUse)
3283 break;
3284 }
3285 if (IncIter != IncEnd)
3286 continue;
3287
3288 if (SE.isSCEVable(OtherUse->getType())
3289 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3290 && IU.isIVUserOrOperand(OtherUse)) {
3291 continue;
3292 }
3293 NearUsers.insert(OtherUse);
3294 }
3295
3296 // Since this user is part of the chain, it's no longer considered a use
3297 // of the chain.
3298 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3299}
3300
3301/// Populate the vector of Chains.
3302///
3303/// This decreases ILP at the architecture level. Targets with ample registers,
3304/// multiple memory ports, and no register renaming probably don't want
3305/// this. However, such targets should probably disable LSR altogether.
3306///
3307/// The job of LSR is to make a reasonable choice of induction variables across
3308/// the loop. Subsequent passes can easily "unchain" computation exposing more
3309/// ILP *within the loop* if the target wants it.
3310///
3311/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3312/// will not reorder memory operations, it will recognize this as a chain, but
3313/// will generate redundant IV increments. Ideally this would be corrected later
3314/// by a smart scheduler:
3315/// = A[i]
3316/// = A[i+x]
3317/// A[i] =
3318/// A[i+x] =
3319///
3320/// TODO: Walk the entire domtree within this loop, not just the path to the
3321/// loop latch. This will discover chains on side paths, but requires
3322/// maintaining multiple copies of the Chains state.
3323void LSRInstance::CollectChains() {
3324 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3325 SmallVector<ChainUsers, 8> ChainUsersVec;
3326
3327 SmallVector<BasicBlock *,8> LatchPath;
3328 BasicBlock *LoopHeader = L->getHeader();
3329 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3330 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3331 LatchPath.push_back(Rung->getBlock());
3332 }
3333 LatchPath.push_back(LoopHeader);
3334
3335 // Walk the instruction stream from the loop header to the loop latch.
3336 for (BasicBlock *BB : reverse(LatchPath)) {
3337 for (Instruction &I : *BB) {
3338 // Skip instructions that weren't seen by IVUsers analysis.
3339 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3340 continue;
3341
3342 // Skip ephemeral values, as they don't produce real code.
3343 if (IU.isEphemeral(&I))
3344 continue;
3345
3346 // Ignore users that are part of a SCEV expression. This way we only
3347 // consider leaf IV Users. This effectively rediscovers a portion of
3348 // IVUsers analysis but in program order this time.
3349 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3350 continue;
3351
3352 // Remove this instruction from any NearUsers set it may be in.
3353 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3354 ChainIdx < NChains; ++ChainIdx) {
3355 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3356 }
3357 // Search for operands that can be chained.
3358 SmallPtrSet<Instruction*, 4> UniqueOperands;
3359 User::op_iterator IVOpEnd = I.op_end();
3360 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3361 while (IVOpIter != IVOpEnd) {
3362 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3363 if (UniqueOperands.insert(IVOpInst).second)
3364 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3365 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3366 }
3367 } // Continue walking down the instructions.
3368 } // Continue walking down the domtree.
3369 // Visit phi backedges to determine if the chain can generate the IV postinc.
3370 for (PHINode &PN : L->getHeader()->phis()) {
3371 if (!SE.isSCEVable(PN.getType()))
3372 continue;
3373
3374 Instruction *IncV =
3375 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3376 if (IncV)
3377 ChainInstruction(&PN, IncV, ChainUsersVec);
3378 }
3379 // Remove any unprofitable chains.
3380 unsigned ChainIdx = 0;
3381 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3382 UsersIdx < NChains; ++UsersIdx) {
3383 if (!isProfitableChain(IVChainVec[UsersIdx],
3384 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3385 continue;
3386 // Preserve the chain at UsesIdx.
3387 if (ChainIdx != UsersIdx)
3388 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3389 FinalizeChain(IVChainVec[ChainIdx]);
3390 ++ChainIdx;
3391 }
3392 IVChainVec.resize(ChainIdx);
3393}
3394
3395void LSRInstance::FinalizeChain(IVChain &Chain) {
3396 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3397 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3398
3399 for (const IVInc &Inc : Chain) {
3400 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3401 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3402 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3403 IVIncSet.insert(UseI);
3404 }
3405}
3406
3407/// Return true if the IVInc can be folded into an addressing mode.
3408static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3409 Value *Operand, const TargetTransformInfo &TTI) {
3410 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3411 Immediate IncOffset = Immediate::getZero();
3412 if (IncConst) {
3413 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3414 return false;
3415 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3416 } else {
3417 // Look for mul(vscale, constant), to detect a scalable offset.
3418 const APInt *C;
3419 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3420 C->getSignificantBits() > 64)
3421 return false;
3422 IncOffset = Immediate::getScalable(C->getSExtValue());
3423 }
3424
3425 if (!isAddressUse(TTI, UserInst, Operand))
3426 return false;
3427
3428 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3429 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3430 IncOffset, /*HasBaseReg=*/false))
3431 return false;
3432
3433 return true;
3434}
3435
3436/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3437/// user's operand from the previous IV user's operand.
3438void LSRInstance::GenerateIVChain(const IVChain &Chain,
3439 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3440 // Find the new IVOperand for the head of the chain. It may have been replaced
3441 // by LSR.
3442 const IVInc &Head = Chain.Incs[0];
3443 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3444 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3445 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3446 IVOpEnd, L, SE);
3447 Value *IVSrc = nullptr;
3448 while (IVOpIter != IVOpEnd) {
3449 IVSrc = getWideOperand(*IVOpIter);
3450
3451 // If this operand computes the expression that the chain needs, we may use
3452 // it. (Check this after setting IVSrc which is used below.)
3453 //
3454 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3455 // narrow for the chain, so we can no longer use it. We do allow using a
3456 // wider phi, assuming the LSR checked for free truncation. In that case we
3457 // should already have a truncate on this operand such that
3458 // getSCEV(IVSrc) == IncExpr.
3459 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3460 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3461 break;
3462 }
3463 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3464 }
3465 if (IVOpIter == IVOpEnd) {
3466 // Gracefully give up on this chain.
3467 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3468 return;
3469 }
3470 assert(IVSrc && "Failed to find IV chain source");
3471
3472 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3473 Type *IVTy = IVSrc->getType();
3474 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3475 const SCEV *LeftOverExpr = nullptr;
3476 const SCEV *Accum = SE.getZero(IntTy);
3478 Bases.emplace_back(Accum, IVSrc);
3479
3480 for (const IVInc &Inc : Chain) {
3481 Instruction *InsertPt = Inc.UserInst;
3482 if (isa<PHINode>(InsertPt))
3483 InsertPt = L->getLoopLatch()->getTerminator();
3484
3485 // IVOper will replace the current IV User's operand. IVSrc is the IV
3486 // value currently held in a register.
3487 Value *IVOper = IVSrc;
3488 if (!Inc.IncExpr->isZero()) {
3489 // IncExpr was the result of subtraction of two narrow values, so must
3490 // be signed.
3491 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3492 Accum = SE.getAddExpr(Accum, IncExpr);
3493 LeftOverExpr = LeftOverExpr ?
3494 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3495 }
3496
3497 // Look through each base to see if any can produce a nice addressing mode.
3498 bool FoundBase = false;
3499 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3500 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3501 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3502 if (!Remainder->isZero()) {
3503 Rewriter.clearPostInc();
3504 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3505 const SCEV *IVOperExpr =
3506 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3507 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3508 } else {
3509 IVOper = MapIVOper;
3510 }
3511
3512 FoundBase = true;
3513 break;
3514 }
3515 }
3516 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3517 // Expand the IV increment.
3518 Rewriter.clearPostInc();
3519 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3520 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3521 SE.getUnknown(IncV));
3522 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3523
3524 // If an IV increment can't be folded, use it as the next IV value.
3525 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3526 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3527 Bases.emplace_back(Accum, IVOper);
3528 IVSrc = IVOper;
3529 LeftOverExpr = nullptr;
3530 }
3531 }
3532 Type *OperTy = Inc.IVOperand->getType();
3533 if (IVTy != OperTy) {
3534 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3535 "cannot extend a chained IV");
3536 IRBuilder<> Builder(InsertPt);
3537 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3538 }
3539 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3540 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3541 DeadInsts.emplace_back(OperandIsInstr);
3542 }
3543 // If LSR created a new, wider phi, we may also replace its postinc. We only
3544 // do this if we also found a wide value for the head of the chain.
3545 if (isa<PHINode>(Chain.tailUserInst())) {
3546 for (PHINode &Phi : L->getHeader()->phis()) {
3547 if (Phi.getType() != IVSrc->getType())
3548 continue;
3550 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3551 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3552 continue;
3553 Value *IVOper = IVSrc;
3554 Type *PostIncTy = PostIncV->getType();
3555 if (IVTy != PostIncTy) {
3556 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3557 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3558 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3559 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3560 }
3561 Phi.replaceUsesOfWith(PostIncV, IVOper);
3562 DeadInsts.emplace_back(PostIncV);
3563 }
3564 }
3565}
3566
/// Build the initial set of LSR uses from the IV users recorded in IU.
///
/// For each IV user that is not already part of a profitable IV chain, this
/// classifies the use (Basic, Address, or ICmpZero), obtains the matching
/// LSRUse through getUse(), records an LSRFixup for the user, rates the use
/// into BaselineCost the first time that LSRUse is seen, and gives an LSRUse
/// with no formulae its initial formula. May set Changed, because equality
/// compares can have their operands swapped in place.
3567void LSRInstance::CollectFixupsAndInitialFormulae() {
3568 CondBrInst *ExitBranch = nullptr;
// ExitBranch is handed to canSaveCmp by address; below it is only
// dereferenced when SaveCmp is true.
3569 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3570
3571 // For calculating baseline cost
3572 SmallPtrSet<const SCEV *, 16> Regs;
3573 DenseSet<const SCEV *> VisitedRegs;
3574 DenseSet<size_t> VisitedLSRUse;
3575
3576 for (const IVStrideUse &U : IU) {
3577 Instruction *UserInst = U.getUser();
3578 // Skip IV users that are part of profitable IV Chains.
3579 User::op_iterator UseI =
3580 find(UserInst->operands(), U.getOperandValToReplace());
3581 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3582 if (IVIncSet.count(UseI)) {
3583 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3584 continue;
3585 }
3586
// Every use starts out as Basic; it is upgraded to Address here, or to
// ICmpZero in the equality-compare handling below.
3587 LSRUse::KindType Kind = LSRUse::Basic;
3588 MemAccessTy AccessTy;
3589 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3590 Kind = LSRUse::Address;
3591 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3592 }
3593
3594 const SCEV *S = IU.getExpr(U);
// No expression is available for this user, so there is nothing to model.
3595 if (!S)
3596 continue;
3597 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3598
3599 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3600 // (N - i == 0), and this allows (N - i) to be the expression that we work
3601 // with rather than just N or i, so we can consider the register
3602 // requirements for both N and i at the same time. Limiting this code to
3603 // equality icmps is not a problem because all interesting loops use
3604 // equality icmps, thanks to IndVarSimplify.
3605 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3606 // If CI can be saved in some target, like replaced inside hardware loop
3607 // in PowerPC, no need to generate initial formulae for it.
3608 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3609 continue;
3610 if (CI->isEquality()) {
3611 // Swap the operands if needed to put the OperandValToReplace on the
3612 // left, for consistency.
3613 Value *NV = CI->getOperand(1);
3614 if (NV == U.getOperandValToReplace()) {
3615 CI->setOperand(1, CI->getOperand(0));
3616 CI->setOperand(0, NV);
3617 NV = CI->getOperand(1);
// The compare instruction itself was modified, so record that.
3618 Changed = true;
3619 }
3620
3621 // x == y --> x - y == 0
3622 const SCEV *N = SE.getSCEV(NV);
3623 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3624 (!NV->getType()->isPointerTy() ||
3625 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3626 // S is normalized, so normalize N before folding it into S
3627 // to keep the result normalized.
3628 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3629 if (!N)
3630 continue;
3631 Kind = LSRUse::ICmpZero;
3632 S = SE.getMinusSCEV(N, S);
3633 } else if (L->isLoopInvariant(NV) &&
3634 (!isa<Instruction>(NV) ||
3635 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3636 !NV->getType()->isPointerTy()) {
3637 // If we can't generally expand the expression (e.g. it contains
3638 // a divide), but it is already at a loop invariant point before the
3639 // loop, wrap it in an unknown (to prevent the expander from trying
3640 // to re-expand in a potentially unsafe way.) The restriction to
3641 // integer types is required because the unknown hides the base, and
3642 // SCEV can't compute the difference of two unknown pointers.
3643 N = SE.getUnknown(NV);
3644 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3645 if (!N)
3646 continue;
3647 Kind = LSRUse::ICmpZero;
3648 S = SE.getMinusSCEV(N, S);
// NOTE(review): the embedded listing numbers skip 3649 here, so a source
// line may be missing from this extract -- confirm against upstream.
3650 }
3651
3652 // -1 and the negations of all interesting strides (except the negation
3653 // of -1) are now also interesting.
// The bound e is the size on entry, so only the entries present before
// this loop started are visited.
3654 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3655 if (Factors[i] != -1)
3656 Factors.insert(-(uint64_t)Factors[i]);
3657 Factors.insert(-1);
3658 }
3659 }
3660
3661 // Get or create an LSRUse.
3662 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3663 size_t LUIdx = P.first;
3664 Immediate Offset = P.second;
3665 LSRUse &LU = Uses[LUIdx];
3666
3667 // Record the fixup.
3668 LSRFixup &LF = LU.getNewFixup();
3669 LF.UserInst = UserInst;
3670 LF.OperandValToReplace = U.getOperandValToReplace();
3671 LF.PostIncLoops = TmpPostIncLoops;
3672 LF.Offset = Offset;
// Both flags are AND-accumulated over all fixups of this use.
3673 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3674 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3675
3676 // Create SCEV as Formula for calculating baseline cost
// Each LSRUse is rated at most once, and only by a fixup that is not fully
// outside the loop.
3677 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3678 Formula F;
3679 F.initialMatch(S, L, SE);
3680 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3681 HardwareLoopProfitable);
3682 VisitedLSRUse.insert(LUIdx);
3683 }
3684
3685 // If this is the first use of this LSRUse, give it a formula.
3686 if (LU.Formulae.empty()) {
3687 InsertInitialFormula(S, LU, LUIdx);
3688 CountRegisters(LU.Formulae.back(), LUIdx);
3689 }
3690 }
3691
3692 LLVM_DEBUG(print_fixups(dbgs()));
3693}
3694
3695/// Insert a formula for the given expression into the given use, separating out
3696/// loop-variant portions from loop-invariant and loop-computable portions.
3697void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3698 size_t LUIdx) {
3699 // Mark uses whose expressions cannot be expanded.
3700 if (!Rewriter.isSafeToExpand(S))
3701 LU.RigidFormula = true;
3702
3703 Formula F;
3704 F.initialMatch(S, L, SE);
3705 bool Inserted = InsertFormula(LU, LUIdx, F);
3706 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3707}
3708
3709/// Insert a simple single-register formula for the given expression into the
3710/// given use.
3711void
3712LSRInstance::InsertSupplementalFormula(const SCEV *S,
3713 LSRUse &LU, size_t LUIdx) {
3714 Formula F;
3715 F.BaseRegs.push_back(S);
3716 F.HasBaseReg = true;
3717 bool Inserted = InsertFormula(LU, LUIdx, F);
3718 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3719}
3720
3721/// Note which registers are used by the given formula, updating RegUses.
3722void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3723 if (F.ScaledReg)
3724 RegUses.countRegister(F.ScaledReg, LUIdx);
3725 for (const SCEV *BaseReg : F.BaseRegs)
3726 RegUses.countRegister(BaseReg, LUIdx);
3727}
3728
3729/// If the given formula has not yet been inserted, add it to the list, and
3730/// return true. Return false otherwise.
3731bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3732 // Do not insert formula that we will not be able to expand.
3733 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3734 "Formula is illegal");
3735
3736 if (!LU.InsertFormula(F, *L))
3737 return false;
3738
3739 CountRegisters(F, LUIdx);
3740 return true;
3741}
3742
3743/// Test whether this fixup will be executed each time the corresponding IV
3744/// increment instruction is executed.
3745bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3746 // If the fixup block dominates the IV increment block then there is no path
3747 // through the loop to the increment that doesn't pass through the fixup.
3748 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3749}
3750
3751/// Check for other uses of loop-invariant values which we're tracking. These
3752/// other uses will pin these values in registers, making them less profitable
3753/// for elimination.
3754/// TODO: This currently misses non-constant addrec step registers.
3755/// TODO: Should this give more weight to users inside the loop?
///
/// Walks the operand trees of every register in RegUses down to SCEVUnknown
/// leaves. For each leaf whose value is neither a constant nor an instruction
/// inside the loop, the value's uses are scanned; the first use that survives
/// all the filters below gets a Basic LSRUse fixup plus a supplemental
/// single-register formula, after which scanning of that value stops.
3756void
3757LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Seed the worklist with every register currently tracked in RegUses.
3758 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3759 SmallPtrSet<const SCEV *, 32> Visited;
3760
3761 // Don't collect outside uses if we are favoring postinc - the instructions in
3762 // the loop are more important than the ones outside of it.
3763 if (AMK == TTI::AMK_PostIndexed)
3764 return;
3765
3766 while (!Worklist.empty()) {
3767 const SCEV *S = Worklist.pop_back_val();
3768
3769 // Don't process the same SCEV twice
3770 if (!Visited.insert(S).second)
3771 continue;
3772
// Composite expressions only contribute their operands; the real work
// happens at SCEVUnknown leaves.
3773 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3774 append_range(Worklist, N->operands());
3775 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3776 Worklist.push_back(C->getOperand());
3777 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3778 Worklist.push_back(D->getLHS());
3779 Worklist.push_back(D->getRHS());
3780 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3781 const Value *V = US->getValue();
3782 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3783 // Look for instructions defined outside the loop.
3784 if (L->contains(Inst)) continue;
3785 } else if (isa<Constant>(V))
3786 // Constants can be re-materialized.
3787 continue;
3788 for (const Use &U : V->uses()) {
3789 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3790 // Ignore non-instructions.
3791 if (!UserInst)
3792 continue;
3793 // Don't bother if the instruction is an EHPad.
3794 if (UserInst->isEHPad())
3795 continue;
3796 // Ignore instructions in other functions (as can happen with
3797 // Constants).
3798 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3799 continue;
3800 // Ignore instructions not dominated by the loop.
// For a PHI user the relevant block is an incoming block of the PHI rather
// than the PHI's own block.
3801 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3802 UserInst->getParent() :
3803 cast<PHINode>(UserInst)->getIncomingBlock(
// NOTE(review): the embedded listing numbers skip 3804 here; the argument
// of getIncomingBlock( and the end of this statement are missing from this
// extract -- restore from upstream.
3805 if (!DT.dominates(L->getHeader(), UseBB))
3806 continue;
3807 // Don't bother if the instruction is in a BB which ends in an EHPad.
3808 if (UseBB->getTerminator()->isEHPad())
3809 continue;
3810
3811 // Ignore cases in which the currently-examined value could come from
3812 // a basic block terminated with an EHPad. This checks all incoming
3813 // blocks of the phi node since it is possible that the same incoming
3814 // value comes from multiple basic blocks, only some of which may end
3815 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3816 // pass would try to insert instructions into an EHPad, hitting an
3817 // assertion.
3818 if (isa<PHINode>(UserInst)) {
3819 const auto *PhiNode = cast<PHINode>(UserInst);
3820 bool HasIncompatibleEHPTerminatedBlock = false;
3821 llvm::Value *ExpectedValue = U;
3822 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3823 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3824 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3825 HasIncompatibleEHPTerminatedBlock = true;
3826 break;
3827 }
3828 }
3829 }
3830 if (HasIncompatibleEHPTerminatedBlock) {
3831 continue;
3832 }
3833 }
3834
3835 // Don't bother rewriting PHIs in catchswitch blocks.
3836 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3837 continue;
3838 // Ignore uses which are part of other SCEV expressions, to avoid
3839 // analyzing them multiple times.
3840 if (SE.isSCEVable(UserInst->getType())) {
3841 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3842 // If the user is a no-op, look through to its uses.
// A user whose SCEV is not a SCEVUnknown is skipped. A user whose SCEV is
// the very same SCEVUnknown as US is queued so that its own uses get
// examined instead.
3843 if (!isa<SCEVUnknown>(UserS))
3844 continue;
3845 if (UserS == US) {
3846 Worklist.push_back(
3847 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3848 continue;
3849 }
3850 }
3851 // Ignore icmp instructions which are already being analyzed.
3852 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
// Operand index of the other side of the compare (0 <-> 1).
3853 unsigned OtherIdx = !U.getOperandNo();
3854 Value *OtherOp = ICI->getOperand(OtherIdx);
3855 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3856 continue;
3857 }
3858
3859 // Do not consider uses inside lifetime intrinsics. These are not
3860 // actually materialized.
3861 if (UserInst->isLifetimeStartOrEnd())
3862 continue;
3863
3864 std::pair<size_t, Immediate> P =
3865 getUse(S, LSRUse::Basic, MemAccessTy());
3866 size_t LUIdx = P.first;
3867 Immediate Offset = P.second;
3868 LSRUse &LU = Uses[LUIdx];
3869 LSRFixup &LF = LU.getNewFixup();
3870 LF.UserInst = const_cast<Instruction *>(UserInst);
3871 LF.OperandValToReplace = U;
3872 LF.Offset = Offset;
3873 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3874 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3875 InsertSupplementalFormula(US, LU, LUIdx);
// NOTE(review): registers are counted against Uses.size() - 1 rather than
// LUIdx. The two differ if getUse() handed back an already existing use
// instead of appending a new one -- confirm this is intended.
3876 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
// Only the first qualifying use of V is recorded; stop scanning V's uses.
3877 break;
3878 }
3879 }
3880 }
3881}
3882
3883/// Split S into subexpressions which can be pulled out into separate
3884/// registers. If C is non-null, multiply each subexpression by C.
3885///
3886/// Return remainder expression after factoring the subexpressions captured by
3887/// Ops. If Ops is complete, return NULL.
///
/// Recursion is capped: once Depth reaches 3 the expression is returned
/// unsplit. Three shapes are handled: add expressions (each operand is split
/// in turn), affine addrecs with a non-zero start (the start is split out),
/// and multiplications by a constant (the constant is folded into C).
3888static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
// NOTE(review): the embedded listing numbers skip 3889 here; the
// declaration of the Ops parameter used throughout the body is missing from
// this extract -- restore from upstream.
3890 const Loop *L,
3891 ScalarEvolution &SE,
3892 unsigned Depth = 0) {
3893 // Arbitrarily cap recursion to protect compile time.
3894 if (Depth >= 3)
3895 return S;
3896
3897 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3898 // Break out add operands.
// Note: this loop variable shadows the parameter S.
3899 for (const SCEV *S : Add->operands()) {
3900 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3901 if (Remainder)
3902 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3903 }
// Every operand has been captured in Ops, so nothing remains.
3904 return nullptr;
3905 }
3906 const SCEV *Start, *Step;
3907 const SCEVConstant *Op0;
3908 const SCEV *Op1;
3909 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3910 // Split a non-zero base out of an addrec.
3911 if (Start->isZero())
3912 return S;
3913
3914 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3915 // Split the non-zero AddRec unless it is part of a nested recurrence that
3916 // does not pertain to this loop.
3917 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3918 !isa<SCEVAddRecExpr>(Remainder))) {
3919 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3920 Remainder = nullptr;
3921 }
// If anything was peeled off the start, rebuild the addrec over what is
// left of it (zero when the start was consumed entirely).
3922 if (Remainder != Start) {
3923 if (!Remainder)
3924 Remainder = SE.getConstant(S->getType(), 0);
3925 return SE.getAddRecExpr(Remainder, Step,
3926 cast<SCEVAddRecExpr>(S)->getLoop(),
3927 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
// NOTE(review): the embedded listing numbers skip 3928 here; the final
// argument and the closing of this getAddRecExpr call are missing from this
// extract -- restore from upstream.
3929 }
3930 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3931 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3932 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3933 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3934 if (Remainder)
3935 Ops.push_back(SE.getMulExpr(C, Remainder));
3936 return nullptr;
3937 }
// Nothing could be split; hand the expression back unchanged.
3938 return S;
3939}
3940
3941/// Return true if the SCEV represents a value that may end up as a
3942/// post-increment operation.
3944 LSRUse &LU, const SCEV *S, const Loop *L,
3945 ScalarEvolution &SE) {
3946 if (LU.Kind != LSRUse::Address ||
3947 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3948 return false;
3949 const SCEV *Start;
3950 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3951 return false;
3952 // Check if a post-indexed load/store can be used.
3953 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3954 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3955 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3956 return true;
3957 }
3958 return false;
3959}
3960
3961/// Helper function for LSRInstance::GenerateReassociations.
///
/// Works on one register of Base: the scaled register when IsScaledReg is
/// true, otherwise base register number Idx. The register is split into
/// addends with CollectSubexprs; for each addend that passes the filters
/// below, a new formula is built in which that addend is moved out of the
/// register, either into the unfolded offset or into a base register of its
/// own. Newly inserted formulae are fed back into GenerateReassociations with
/// an increased Depth.
3962void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3963 const Formula &Base,
3964 unsigned Depth, size_t Idx,
3965 bool IsScaledReg) {
3966 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3967 // Don't generate reassociations for the base register of a value that
3968 // may generate a post-increment operator. The reason is that the
3969 // reassociations cause extra base+register formula to be created,
3970 // and possibly chosen, but the post-increment is more efficient.
3971 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3972 return;
// NOTE(review): the embedded listing numbers skip 3973 here; the
// declaration of AddOps is missing from this extract -- restore from
// upstream.
3974 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3975 if (Remainder)
3976 AddOps.push_back(Remainder);
3977
// A single addend means there is nothing to reassociate.
3978 if (AddOps.size() == 1)
3979 return;
3980
// NOTE(review): the embedded listing numbers skip 3981 here; the opening of
// the for statement that declares J is missing from this extract -- restore
// from upstream.
3982 JE = AddOps.end();
3983 J != JE; ++J) {
3984 // Loop-variant "unknown" values are uninteresting; we won't be able to
3985 // do anything meaningful with them.
3986 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3987 continue;
3988
3989 // Don't pull a constant into a register if the constant could be folded
3990 // into an immediate field.
3991 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3992 LU.AccessTy, *J, Base.getNumRegs() > 1))
3993 continue;
3994
3995 // Collect all operands except *J.
3996 SmallVector<SCEVUse, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
3997 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
3998
3999 // Don't leave just a constant behind in a register if the constant could
4000 // be folded into an immediate field.
4001 if (InnerAddOps.size() == 1 &&
4002 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
4003 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
4004 continue;
4005
4006 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
4007 if (InnerSum->isZero())
4008 continue;
4009 Formula F = Base;
4010
// The code below reads the unfolded offset via getFixedValue(), so formulae
// carrying a non-zero scalable unfolded offset are skipped.
4011 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
4012 continue;
4013
4014 // Add the remaining pieces of the add back into the new formula.
// A constant sum that still forms a legal add immediate together with the
// existing unfolded offset is folded into that offset and the register is
// removed; otherwise the sum replaces the register.
4015 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
4016 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
4017 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4018 InnerSumSC->getValue()->getZExtValue())) {
4019 F.UnfoldedOffset =
4020 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4021 InnerSumSC->getValue()->getZExtValue());
4022 if (IsScaledReg) {
4023 F.ScaledReg = nullptr;
4024 F.Scale = 0;
4025 } else
4026 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
4027 } else if (IsScaledReg)
4028 F.ScaledReg = InnerSum;
4029 else
4030 F.BaseRegs[Idx] = InnerSum;
4031
4032 // Add J as its own register, or an unfolded immediate.
4033 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
4034 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
4035 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4036 SC->getValue()->getZExtValue()))
4037 F.UnfoldedOffset =
4038 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4039 SC->getValue()->getZExtValue());
4040 else
4041 F.BaseRegs.push_back(*J);
4042 // We may have changed the number of register in base regs, adjust the
4043 // formula accordingly.
4044 F.canonicalize(*L);
4045
4046 if (InsertFormula(LU, LUIdx, F))
4047 // If that formula hadn't been seen before, recurse to find more like
4048 // it.
4049 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
4050 // Because just Depth is not enough to bound compile time.
4051 // This means that every time AddOps.size() is greater 16^x we will add
4052 // x to Depth.
4053 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4054 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4055 }
4056}
4057
4058/// Split out subexpressions from adds and the bases of addrecs.
4059void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4060 Formula Base, unsigned Depth) {
4061 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4062 // Arbitrarily cap recursion to protect compile time.
4063 if (Depth >= 3)
4064 return;
4065
4066 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4067 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4068
4069 if (Base.Scale == 1)
4070 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4071 /* Idx */ -1, /* IsScaledReg */ true);
4072}
4073
4074/// Generate a formula consisting of all of the loop-dominating registers added
4075/// into a single register.
4076void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4077 Formula Base) {
4078 // This method is only interesting on a plurality of registers.
4079 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4080 (Base.UnfoldedOffset.isNonZero()) <=
4081 1)
4082 return;
4083
4084 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4085 // processing the formula.
4086 Base.unscale();
4088 Formula NewBase = Base;
4089 NewBase.BaseRegs.clear();
4090 Type *CombinedIntegerType = nullptr;
4091 for (const SCEV *BaseReg : Base.BaseRegs) {
4092 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4093 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4094 if (!CombinedIntegerType)
4095 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4096 Ops.push_back(BaseReg);
4097 }
4098 else
4099 NewBase.BaseRegs.push_back(BaseReg);
4100 }
4101
4102 // If no register is relevant, we're done.
4103 if (Ops.size() == 0)
4104 return;
4105
4106 // Utility function for generating the required variants of the combined
4107 // registers.
4108 auto GenerateFormula = [&](const SCEV *Sum) {
4109 Formula F = NewBase;
4110
4111 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4112 // opportunity to fold something. For now, just ignore such cases
4113 // rather than proceed with zero in a register.
4114 if (Sum->isZero())
4115 return;
4116
4117 F.BaseRegs.push_back(Sum);
4118 F.canonicalize(*L);
4119 (void)InsertFormula(LU, LUIdx, F);
4120 };
4121
4122 // If we collected at least two registers, generate a formula combining them.
4123 if (Ops.size() > 1) {
4124 SmallVector<SCEVUse, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4125 GenerateFormula(SE.getAddExpr(OpsCopy));
4126 }
4127
4128 // If we have an unfolded offset, generate a formula combining it with the
4129 // registers collected.
4130 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4131 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4132 Ops.push_back(SE.getConstant(CombinedIntegerType,
4133 NewBase.UnfoldedOffset.getFixedValue(), true));
4134 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4135 GenerateFormula(SE.getAddExpr(Ops));
4136 }
4137}
4138
4139/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4140void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4141 const Formula &Base, size_t Idx,
4142 bool IsScaledReg) {
4143 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4144 GlobalValue *GV = ExtractSymbol(G, SE);
4145 if (G->isZero() || !GV)
4146 return;
4147 Formula F = Base;
4148 F.BaseGV = GV;
4149 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4150 return;
4151 if (IsScaledReg)
4152 F.ScaledReg = G;
4153 else
4154 F.BaseRegs[Idx] = G;
4155 (void)InsertFormula(LU, LUIdx, F);
4156}
4157
4158/// Generate reuse formulae using symbolic offsets.
4159void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4160 Formula Base) {
4161 // We can't add a symbolic offset if the address already contains one.
4162 if (Base.BaseGV) return;
4163
4164 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4165 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4166 if (Base.Scale == 1)
4167 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4168 /* IsScaledReg */ true);
4169}
4170
4171/// Helper function for LSRInstance::GenerateConstantOffsets.
4172void LSRInstance::GenerateConstantOffsetsImpl(
4173 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4174 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4175
4176 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4177 Formula F = Base;
4178 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4179 return;
4180 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4181
4182 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4183 // Add the offset to the base register.
4184 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4185 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4186 // If it cancelled out, drop the base register, otherwise update it.
4187 if (NewG->isZero()) {
4188 if (IsScaledReg) {
4189 F.Scale = 0;
4190 F.ScaledReg = nullptr;
4191 } else
4192 F.deleteBaseReg(F.BaseRegs[Idx]);
4193 F.canonicalize(*L);
4194 } else if (IsScaledReg)
4195 F.ScaledReg = NewG;
4196 else
4197 F.BaseRegs[Idx] = NewG;
4198
4199 (void)InsertFormula(LU, LUIdx, F);
4200 }
4201 };
4202
4203 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4204
4205 // With constant offsets and constant steps, we can generate pre-inc
4206 // accesses by having the offset equal the step. So, for access #0 with a
4207 // step of 8, we generate a G - 8 base which would require the first access
4208 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4209 // for itself and hopefully becomes the base for other accesses. This means
4210 // means that a single pre-indexed access can be generated to become the new
4211 // base pointer for each iteration of the loop, resulting in no extra add/sub
4212 // instructions for pointer updating.
4213 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4214 const APInt *StepInt;
4215 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4216 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4217 : StepInt->getZExtValue();
4218
4219 for (Immediate Offset : Worklist) {
4220 if (Offset.isFixed()) {
4221 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4222 GenerateOffset(G, Offset);
4223 }
4224 }
4225 }
4226 }
4227 for (Immediate Offset : Worklist)
4228 GenerateOffset(G, Offset);
4229
4230 // TODO: It likely makes sense to extract the immediate corresponding to the
4231 // access type (i.e., set PreferScalable to AccessTy.MemTy &&
4232 // AccessTy.MemTy->isScalableTy()).
4233 Immediate Imm = ExtractImmediate(G, SE, /*PreferScalable=*/false);
4234 if (G->isZero() || Imm.isZero() ||
4235 !Base.BaseOffset.isCompatibleImmediate(Imm))
4236 return;
4237 Formula F = Base;
4238 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4239 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4240 return;
4241 if (IsScaledReg) {
4242 F.ScaledReg = G;
4243 } else {
4244 F.BaseRegs[Idx] = G;
4245 // We may generate non canonical Formula if G is a recurrent expr reg
4246 // related with current loop while F.ScaledReg is not.
4247 F.canonicalize(*L);
4248 }
4249 (void)InsertFormula(LU, LUIdx, F);
4250}
4251
4252/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4253void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4254 Formula Base) {
4255 // TODO: For now, just add the min and max offset, because it usually isn't
4256 // worthwhile looking at everything inbetween.
4258 Worklist.push_back(LU.MinOffset);
4259 if (LU.MaxOffset != LU.MinOffset)
4260 Worklist.push_back(LU.MaxOffset);
4261
4262 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4263 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4264 if (Base.Scale == 1)
4265 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4266 /* IsScaledReg */ true);
4267}
4268
4269/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4270/// == y -> x*c == y*c.
4271void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4272 Formula Base) {
4273 if (LU.Kind != LSRUse::ICmpZero) return;
4274
4275 // Determine the integer type for the base formula.
4276 Type *IntTy = Base.getType();
4277 if (!IntTy) return;
4278 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4279
4280 // Don't do this if there is more than one offset.
4281 if (LU.MinOffset != LU.MaxOffset) return;
4282
4283 // Check if transformation is valid. It is illegal to multiply pointer.
4284 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4285 return;
4286 for (const SCEV *BaseReg : Base.BaseRegs)
4287 if (BaseReg->getType()->isPointerTy())
4288 return;
4289 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4290
4291 // Check each interesting stride.
4292 for (int64_t Factor : Factors) {
4293 // Check that Factor can be represented by IntTy
4294 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4295 continue;
4296 // Check that the multiplication doesn't overflow.
4297 if (Base.BaseOffset.isMin() && Factor == -1)
4298 continue;
4299 // Not supporting scalable immediates.
4300 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4301 continue;
4302 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4303 assert(Factor != 0 && "Zero factor not expected!");
4304 if (NewBaseOffset.getFixedValue() / Factor !=
4305 Base.BaseOffset.getFixedValue())
4306 continue;
4307 // If the offset will be truncated at this use, check that it is in bounds.
4308 if (!IntTy->isPointerTy() &&
4309 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4310 continue;
4311
4312 // Check that multiplying with the use offset doesn't overflow.
4313 Immediate Offset = LU.MinOffset;
4314 if (Offset.isMin() && Factor == -1)
4315 continue;
4316 Offset = Offset.mulUnsigned(Factor);
4317 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4318 continue;
4319 // If the offset will be truncated at this use, check that it is in bounds.
4320 if (!IntTy->isPointerTy() &&
4321 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4322 continue;
4323
4324 Formula F = Base;
4325 F.BaseOffset = NewBaseOffset;
4326
4327 // Check that this scale is legal.
4328 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4329 continue;
4330
4331 // Compensate for the use having MinOffset built into it.
4332 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4333
4334 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4335
4336 // Check that multiplying with each base register doesn't overflow.
4337 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4338 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4339 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4340 goto next;
4341 }
4342
4343 // Check that multiplying with the scaled register doesn't overflow.
4344 if (F.ScaledReg) {
4345 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4346 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4347 continue;
4348 }
4349
4350 // Check that multiplying with the unfolded offset doesn't overflow.
4351 if (F.UnfoldedOffset.isNonZero()) {
4352 if (F.UnfoldedOffset.isMin() && Factor == -1)
4353 continue;
4354 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4355 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4356 Base.UnfoldedOffset.getFixedValue())
4357 continue;
4358 // If the offset will be truncated, check that it is in bounds.
4360 IntTy, F.UnfoldedOffset.getFixedValue()))
4361 continue;
4362 }
4363
4364 // If we make it here and it's legal, add it.
4365 (void)InsertFormula(LU, LUIdx, F);
4366 next:;
4367 }
4368}
4369
4370/// Generate stride factor reuse formulae by making use of scaled-offset address
4371/// modes, for example.
4372void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4373 // Determine the integer type for the base formula.
4374 Type *IntTy = Base.getType();
4375 if (!IntTy) return;
4376
4377 // If this Formula already has a scaled register, we can't add another one.
4378 // Try to unscale the formula to generate a better scale.
4379 if (Base.Scale != 0 && !Base.unscale())
4380 return;
4381
4382 assert(Base.Scale == 0 && "unscale did not did its job!");
4383
4384 // Check each interesting stride.
4385 for (int64_t Factor : Factors) {
4386 Base.Scale = Factor;
4387 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4388 // Check whether this scale is going to be legal.
4389 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4390 Base)) {
4391 // As a special-case, handle special out-of-loop Basic users specially.
4392 // TODO: Reconsider this special case.
4393 if (LU.Kind == LSRUse::Basic &&
4394 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4395 LU.AccessTy, Base) &&
4396 LU.AllFixupsOutsideLoop)
4397 LU.Kind = LSRUse::Special;
4398 else
4399 continue;
4400 }
4401 // For an ICmpZero, negating a solitary base register won't lead to
4402 // new solutions.
4403 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4404 Base.BaseOffset.isZero() && !Base.BaseGV)
4405 continue;
4406 // For each addrec base reg, if its loop is current loop, apply the scale.
4407 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4408 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4409 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4410 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4411 if (FactorS->isZero())
4412 continue;
4413 // Divide out the factor, ignoring high bits, since we'll be
4414 // scaling the value back up in the end.
4415 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4416 if (!Quotient->isZero()) {
4417 // TODO: This could be optimized to avoid all the copying.
4418 Formula F = Base;
4419 F.ScaledReg = Quotient;
4420 F.deleteBaseReg(F.BaseRegs[i]);
4421 // The canonical representation of 1*reg is reg, which is already in
4422 // Base. In that case, do not try to insert the formula, it will be
4423 // rejected anyway.
4424 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4425 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4426 continue;
4427 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4428 // non canonical Formula with ScaledReg's loop not being L.
4429 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4430 F.canonicalize(*L);
4431 (void)InsertFormula(LU, LUIdx, F);
4432 }
4433 }
4434 }
4435 }
4436}
4437
4438/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4439/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4440/// perform the extension/truncate and normalize again, as the normalized form
4441/// can result in folds that are not valid in the post-inc use contexts. The
4442/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4443static const SCEV *
4445 const SCEV *Expr, Type *ToTy,
4446 ScalarEvolution &SE) {
4447 const SCEV *Result = nullptr;
4448 for (auto &L : Loops) {
4449 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4450 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4451 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4452 if (!New || (Result && New != Result))
4453 return nullptr;
4454 Result = New;
4455 }
4456
4457 assert(Result && "failed to create expression");
4458 return Result;
4459}
4460
4461/// Generate reuse formulae from different IV types.
4462void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4463 // Don't bother truncating symbolic values.
4464 if (Base.BaseGV) return;
4465
4466 // Determine the integer type for the base formula.
4467 Type *DstTy = Base.getType();
4468 if (!DstTy) return;
4469 if (DstTy->isPointerTy())
4470 return;
4471
4472 // It is invalid to extend a pointer type so exit early if ScaledReg or
4473 // any of the BaseRegs are pointers.
4474 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4475 return;
4476 if (any_of(Base.BaseRegs,
4477 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4478 return;
4479
4481 for (auto &LF : LU.Fixups)
4482 Loops.push_back(LF.PostIncLoops);
4483
4484 for (Type *SrcTy : Types) {
4485 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4486 Formula F = Base;
4487
4488 // Sometimes SCEV is able to prove zero during ext transform. It may
4489 // happen if SCEV did not do all possible transforms while creating the
4490 // initial node (maybe due to depth limitations), but it can do them while
4491 // taking ext.
4492 if (F.ScaledReg) {
4493 const SCEV *NewScaledReg =
4494 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4495 if (!NewScaledReg || NewScaledReg->isZero())
4496 continue;
4497 F.ScaledReg = NewScaledReg;
4498 }
4499 bool HasZeroBaseReg = false;
4500 for (const SCEV *&BaseReg : F.BaseRegs) {
4501 const SCEV *NewBaseReg =
4502 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4503 if (!NewBaseReg || NewBaseReg->isZero()) {
4504 HasZeroBaseReg = true;
4505 break;
4506 }
4507 BaseReg = NewBaseReg;
4508 }
4509 if (HasZeroBaseReg)
4510 continue;
4511
4512 // TODO: This assumes we've done basic processing on all uses and
4513 // have an idea what the register usage is.
4514 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4515 continue;
4516
4517 F.canonicalize(*L);
4518 (void)InsertFormula(LU, LUIdx, F);
4519 }
4520 }
4521}
4522
4523namespace {
4524
4525/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4526/// modifications so that the search phase doesn't have to worry about the data
4527/// structures moving underneath it.
4528struct WorkItem {
4529 size_t LUIdx;
4530 Immediate Imm;
4531 const SCEV *OrigReg;
4532
4533 WorkItem(size_t LI, Immediate I, const SCEV *R)
4534 : LUIdx(LI), Imm(I), OrigReg(R) {}
4535
4536 void print(raw_ostream &OS) const;
4537 void dump() const;
4538};
4539
4540} // end anonymous namespace
4541
4542#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4543void WorkItem::print(raw_ostream &OS) const {
4544 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4545 << " , add offset " << Imm;
4546}
4547
4548LLVM_DUMP_METHOD void WorkItem::dump() const {
4549 print(errs()); errs() << '\n';
4550}
4551#endif
4552
4553/// Look for registers which are a constant distance apart and try to form reuse
4554/// opportunities between them.
4555void LSRInstance::GenerateCrossUseConstantOffsets() {
4556 // Group the registers by their value without any added constant offset.
4557 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4558
4559 DenseMap<const SCEV *, ImmMapTy> Map;
4560 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4562 for (const SCEV *Use : RegUses) {
4563 SCEVUse Reg = Use; // Make a copy for ExtractImmediate to modify.
4564 // TODO: Extract both scalable and fixed immediates (if present)?
4565 Immediate Imm = ExtractImmediate(Reg, SE);
4566 auto Pair = Map.try_emplace(Reg);
4567 if (Pair.second)
4568 Sequence.push_back(Reg);
4569 Pair.first->second.insert(std::make_pair(Imm, Use));
4570 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4571 }
4572
4573 // Now examine each set of registers with the same base value. Build up
4574 // a list of work to do and do the work in a separate step so that we're
4575 // not adding formulae and register counts while we're searching.
4576 SmallVector<WorkItem, 32> WorkItems;
4577 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4578 UniqueItems;
4579 for (const SCEV *Reg : Sequence) {
4580 const ImmMapTy &Imms = Map.find(Reg)->second;
4581
4582 // It's not worthwhile looking for reuse if there's only one offset.
4583 if (Imms.size() == 1)
4584 continue;
4585
4586 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4587 for (const auto &Entry
4588 : Imms) dbgs()
4589 << ' ' << Entry.first;
4590 dbgs() << '\n');
4591
4592 // Examine each offset.
4593 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4594 J != JE; ++J) {
4595 const SCEV *OrigReg = J->second;
4596
4597 Immediate JImm = J->first;
4598 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4599
4600 if (!isa<SCEVConstant>(OrigReg) &&
4601 UsedByIndicesMap[Reg].count() == 1) {
4602 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4603 << '\n');
4604 continue;
4605 }
4606
4607 // Conservatively examine offsets between this orig reg a few selected
4608 // other orig regs.
4609 Immediate First = Imms.begin()->first;
4610 Immediate Last = std::prev(Imms.end())->first;
4611 if (!First.isCompatibleImmediate(Last)) {
4612 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4613 << "\n");
4614 continue;
4615 }
4616 // Only scalable if both terms are scalable, or if one is scalable and
4617 // the other is 0.
4618 bool Scalable = First.isScalable() || Last.isScalable();
4619 int64_t FI = First.getKnownMinValue();
4620 int64_t LI = Last.getKnownMinValue();
4621 // Compute (First + Last) / 2 without overflow using the fact that
4622 // First + Last = 2 * (First + Last) + (First ^ Last).
4623 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4624 // If the result is negative and FI is odd and LI even (or vice versa),
4625 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4626 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4627 ImmMapTy::const_iterator OtherImms[] = {
4628 Imms.begin(), std::prev(Imms.end()),
4629 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4630 for (const auto &M : OtherImms) {
4631 if (M == J || M == JE) continue;
4632 if (!JImm.isCompatibleImmediate(M->first))
4633 continue;
4634
4635 // Compute the difference between the two.
4636 Immediate Imm = JImm.subUnsigned(M->first);
4637 for (unsigned LUIdx : UsedByIndices.set_bits())
4638 // Make a memo of this use, offset, and register tuple.
4639 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4640 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4641 }
4642 }
4643 }
4644
4645 Map.clear();
4646 Sequence.clear();
4647 UsedByIndicesMap.clear();
4648 UniqueItems.clear();
4649
4650 // Now iterate through the worklist and add new formulae.
4651 for (const WorkItem &WI : WorkItems) {
4652 size_t LUIdx = WI.LUIdx;
4653 LSRUse &LU = Uses[LUIdx];
4654 Immediate Imm = WI.Imm;
4655 const SCEV *OrigReg = WI.OrigReg;
4656
4657 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4658 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4659 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4660
4661 // TODO: Use a more targeted data structure.
4662 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4663 Formula F = LU.Formulae[L];
4664 // FIXME: The code for the scaled and unscaled registers looks
4665 // very similar but slightly different. Investigate if they
4666 // could be merged. That way, we would not have to unscale the
4667 // Formula.
4668 F.unscale();
4669 // Use the immediate in the scaled register.
4670 if (F.ScaledReg == OrigReg) {
4671 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4672 continue;
4673 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4674 // Don't create 50 + reg(-50).
4675 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4676 if (F.referencesReg(S))
4677 continue;
4678 Formula NewF = F;
4679 NewF.BaseOffset = Offset;
4680 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4681 NewF))
4682 continue;
4683 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4684
4685 // If the new scale is a constant in a register, and adding the constant
4686 // value to the immediate would produce a value closer to zero than the
4687 // immediate itself, then the formula isn't worthwhile.
4688 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4689 // FIXME: Do we need to do something for scalable immediates here?
4690 // A scalable SCEV won't be constant, but we might still have
4691 // something in the offset? Bail out for now to be safe.
4692 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4693 continue;
4694 if (C->getValue()->isNegative() !=
4695 (NewF.BaseOffset.isLessThanZero()) &&
4696 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4697 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4698 continue;
4699 }
4700
4701 // OK, looks good.
4702 NewF.canonicalize(*this->L);
4703 (void)InsertFormula(LU, LUIdx, NewF);
4704 } else {
4705 // Use the immediate in a base register.
4706 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4707 const SCEV *BaseReg = F.BaseRegs[N];
4708 if (BaseReg != OrigReg)
4709 continue;
4710 Formula NewF = F;
4711 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4712 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4713 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4714 continue;
4715 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4716 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4717 LU.Kind, LU.AccessTy, NewF)) {
4718 if (AMK == TTI::AMK_PostIndexed &&
4719 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4720 continue;
4721 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4722 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4723 continue;
4724 NewF = F;
4725 NewF.UnfoldedOffset = NewUnfoldedOffset;
4726 }
4727 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4728
4729 // If the new formula has a constant in a register, and adding the
4730 // constant value to the immediate would produce a value closer to
4731 // zero than the immediate itself, then the formula isn't worthwhile.
4732 for (const SCEV *NewReg : NewF.BaseRegs)
4733 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4734 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4735 goto skip_formula;
4736 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4737 .abs()
4738 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4739 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4740 .countr_zero() >=
4742 NewF.BaseOffset.getFixedValue()))
4743 goto skip_formula;
4744 }
4745
4746 // Ok, looks good.
4747 NewF.canonicalize(*this->L);
4748 (void)InsertFormula(LU, LUIdx, NewF);
4749 break;
4750 skip_formula:;
4751 }
4752 }
4753 }
4754 }
4755}
4756
4757/// Generate formulae for each use.
4758void
4759LSRInstance::GenerateAllReuseFormulae() {
4760 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4761 // queries are more precise.
4762 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4763 LSRUse &LU = Uses[LUIdx];
4764 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4765 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4766 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4767 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4768 }
4769 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4770 LSRUse &LU = Uses[LUIdx];
4771 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4772 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4773 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4774 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4775 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4776 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4777 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4778 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4779 }
4780 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4781 LSRUse &LU = Uses[LUIdx];
4782 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4783 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4784 }
4785
4786 GenerateCrossUseConstantOffsets();
4787
4788 LLVM_DEBUG(dbgs() << "\n"
4789 "After generating reuse formulae:\n";
4790 print_uses(dbgs()));
4791}
4792
4793/// If there are multiple formulae with the same set of registers used
4794/// by other uses, pick the best one and delete the others.
4795void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4796 DenseSet<const SCEV *> VisitedRegs;
4797 SmallPtrSet<const SCEV *, 16> Regs;
4798 SmallPtrSet<const SCEV *, 16> LoserRegs;
4799#ifndef NDEBUG
4800 bool ChangedFormulae = false;
4801#endif
4802
4803 // Collect the best formula for each unique set of shared registers. This
4804 // is reset for each use.
4805 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4806
4807 BestFormulaeTy BestFormulae;
4808
4809 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4810 LSRUse &LU = Uses[LUIdx];
4811 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4812 dbgs() << '\n');
4813
4814 bool Any = false;
4815 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4816 FIdx != NumForms; ++FIdx) {
4817 Formula &F = LU.Formulae[FIdx];
4818
4819 // Some formulas are instant losers. For example, they may depend on
4820 // nonexistent AddRecs from other loops. These need to be filtered
4821 // immediately, otherwise heuristics could choose them over others leading
4822 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4823 // avoids the need to recompute this information across formulae using the
4824 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4825 // the corresponding bad register from the Regs set.
4826 Cost CostF(L, SE, TTI, AMK);
4827 Regs.clear();
4828 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4829 &LoserRegs);
4830 if (CostF.isLoser()) {
4831 // During initial formula generation, undesirable formulae are generated
4832 // by uses within other loops that have some non-trivial address mode or
4833 // use the postinc form of the IV. LSR needs to provide these formulae
4834 // as the basis of rediscovering the desired formula that uses an AddRec
4835 // corresponding to the existing phi. Once all formulae have been
4836 // generated, these initial losers may be pruned.
4837 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4838 dbgs() << "\n");
4839 }
4840 else {
4842 for (const SCEV *Reg : F.BaseRegs) {
4843 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4844 Key.push_back(Reg);
4845 }
4846 if (F.ScaledReg &&
4847 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4848 Key.push_back(F.ScaledReg);
4849 // Unstable sort by host order ok, because this is only used for
4850 // uniquifying.
4851 llvm::sort(Key);
4852
4853 std::pair<BestFormulaeTy::const_iterator, bool> P =
4854 BestFormulae.insert(std::make_pair(Key, FIdx));
4855 if (P.second)
4856 continue;
4857
4858 Formula &Best = LU.Formulae[P.first->second];
4859
4860 Cost CostBest(L, SE, TTI, AMK);
4861 Regs.clear();
4862 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4863 HardwareLoopProfitable);
4864 if (CostF.isLess(CostBest))
4865 std::swap(F, Best);
4866 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4867 dbgs() << "\n"
4868 " in favor of formula ";
4869 Best.print(dbgs()); dbgs() << '\n');
4870 }
4871#ifndef NDEBUG
4872 ChangedFormulae = true;
4873#endif
4874 LU.DeleteFormula(F);
4875 --FIdx;
4876 --NumForms;
4877 Any = true;
4878 }
4879
4880 // Now that we've filtered out some formulae, recompute the Regs set.
4881 if (Any)
4882 LU.RecomputeRegs(LUIdx, RegUses);
4883
4884 // Reset this to prepare for the next use.
4885 BestFormulae.clear();
4886 }
4887
4888 LLVM_DEBUG(if (ChangedFormulae) {
4889 dbgs() << "\n"
4890 "After filtering out undesirable candidates:\n";
4891 print_uses(dbgs());
4892 });
4893}
4894
4895/// Estimate the worst-case number of solutions the solver might have to
4896/// consider. It almost never considers this many solutions because it prune the
4897/// search space, but the pruning isn't always sufficient.
4898size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4899 size_t Power = 1;
4900 for (const LSRUse &LU : Uses) {
4901 size_t FSize = LU.Formulae.size();
4902 if (FSize >= ComplexityLimit) {
4903 Power = ComplexityLimit;
4904 break;
4905 }
4906 Power *= FSize;
4907 if (Power >= ComplexityLimit)
4908 break;
4909 }
4910 return Power;
4911}
4912
4913/// When one formula uses a superset of the registers of another formula, it
4914/// won't help reduce register pressure (though it may not necessarily hurt
4915/// register pressure); remove it to simplify the system.
4916void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4917 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4918 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4919
4920 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4921 "which use a superset of registers used by other "
4922 "formulae.\n");
4923
4924 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4925 LSRUse &LU = Uses[LUIdx];
4926 bool Any = false;
4927 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4928 Formula &F = LU.Formulae[i];
4929 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4930 continue;
4931 // Look for a formula with a constant or GV in a register. If the use
4932 // also has a formula with that same value in an immediate field,
4933 // delete the one that uses a register.
4935 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4936 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4937 Formula NewF = F;
4938 //FIXME: Formulas should store bitwidth to do wrapping properly.
4939 // See PR41034.
4940 NewF.BaseOffset =
4941 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4942 (uint64_t)C->getValue()->getSExtValue());
4943 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4944 (I - F.BaseRegs.begin()));
4945 if (LU.HasFormulaWithSameRegs(NewF)) {
4946 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4947 dbgs() << '\n');
4948 LU.DeleteFormula(F);
4949 --i;
4950 --e;
4951 Any = true;
4952 break;
4953 }
4954 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4955 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4956 if (!F.BaseGV) {
4957 Formula NewF = F;
4958 NewF.BaseGV = GV;
4959 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4960 (I - F.BaseRegs.begin()));
4961 if (LU.HasFormulaWithSameRegs(NewF)) {
4962 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4963 dbgs() << '\n');
4964 LU.DeleteFormula(F);
4965 --i;
4966 --e;
4967 Any = true;
4968 break;
4969 }
4970 }
4971 }
4972 }
4973 }
4974 if (Any)
4975 LU.RecomputeRegs(LUIdx, RegUses);
4976 }
4977
4978 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4979 }
4980}
4981
4982/// When there are many registers for expressions like A, A+1, A+2, etc.,
4983/// allocate a single register for them.
4984void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4985 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4986 return;
4987
4988 LLVM_DEBUG(
4989 dbgs() << "The search space is too complex.\n"
4990 "Narrowing the search space by assuming that uses separated "
4991 "by a constant offset will use the same registers.\n");
4992
4993 // This is especially useful for unrolled loops.
4994
4995 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4996 LSRUse &LU = Uses[LUIdx];
4997 for (const Formula &F : LU.Formulae) {
4998 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4999 continue;
5000 assert((LU.Kind == LSRUse::Address || LU.Kind == LSRUse::ICmpZero) &&
5001 "Only address and cmp uses expected to have nonzero BaseOffset");
5002
5003 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
5004 if (!LUThatHas)
5005 continue;
5006
5007 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
5008 LU.Kind, LU.AccessTy))
5009 continue;
5010
5011 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
5012
5013 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
5014 LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
5015
5016 // Transfer the fixups of LU to LUThatHas.
5017 for (LSRFixup &Fixup : LU.Fixups) {
5018 Fixup.Offset += F.BaseOffset;
5019 LUThatHas->pushFixup(Fixup);
5020 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
5021 }
5022
5023#ifndef NDEBUG
5024 Type *FixupType = LUThatHas->Fixups[0].OperandValToReplace->getType();
5025 for (LSRFixup &Fixup : LUThatHas->Fixups)
5026 assert(Fixup.OperandValToReplace->getType() == FixupType &&
5027 "Expected all fixups to have the same type");
5028#endif
5029
5030 // Delete formulae from the new use which are no longer legal.
5031 bool Any = false;
5032 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
5033 Formula &F = LUThatHas->Formulae[i];
5034 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
5035 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
5036 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5037 LUThatHas->DeleteFormula(F);
5038 --i;
5039 --e;
5040 Any = true;
5041 }
5042 }
5043
5044 if (Any)
5045 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
5046
5047 // Delete the old use.
5048 DeleteUse(LU, LUIdx);
5049 --LUIdx;
5050 --NumUses;
5051 break;
5052 }
5053 }
5054
5055 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5056}
5057
5058/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5059/// we've done more filtering, as it may be able to find more formulae to
5060/// eliminate.
5061void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5062 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5063 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5064
5065 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5066 "undesirable dedicated registers.\n");
5067
5068 FilterOutUndesirableDedicatedRegisters();
5069
5070 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5071 }
5072}
5073
5074/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
5075/// Pick the best one and delete the others.
5076/// This narrowing heuristic is to keep as many formulae with different
5077/// Scale and ScaledReg pair as possible while narrowing the search space.
5078/// The benefit is that it is more likely to find out a better solution
5079/// from a formulae set with more Scale and ScaledReg variations than
5080/// a formulae set with the same Scale and ScaledReg. The picking winner
5081/// reg heuristic will often keep the formulae with the same Scale and
5082/// ScaledReg and filter others, and we want to avoid that if possible.
5083void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5084 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5085 return;
5086
5087 LLVM_DEBUG(
5088 dbgs() << "The search space is too complex.\n"
5089 "Narrowing the search space by choosing the best Formula "
5090 "from the Formulae with the same Scale and ScaledReg.\n");
5091
5092 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5093 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5094
5095 BestFormulaeTy BestFormulae;
5096#ifndef NDEBUG
5097 bool ChangedFormulae = false;
5098#endif
5099 DenseSet<const SCEV *> VisitedRegs;
5100 SmallPtrSet<const SCEV *, 16> Regs;
5101
5102 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5103 LSRUse &LU = Uses[LUIdx];
5104 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5105 dbgs() << '\n');
5106
5107 // Return true if Formula FA is better than Formula FB.
5108 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5109 // First we will try to choose the Formula with fewer new registers.
5110 // For a register used by current Formula, the more the register is
5111 // shared among LSRUses, the less we increase the register number
5112 // counter of the formula.
5113 size_t FARegNum = 0;
5114 for (const SCEV *Reg : FA.BaseRegs) {
5115 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5116 FARegNum += (NumUses - UsedByIndices.count() + 1);
5117 }
5118 size_t FBRegNum = 0;
5119 for (const SCEV *Reg : FB.BaseRegs) {
5120 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5121 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5122 }
5123 if (FARegNum != FBRegNum)
5124 return FARegNum < FBRegNum;
5125
5126 // If the new register numbers are the same, choose the Formula with
5127 // less Cost.
5128 Cost CostFA(L, SE, TTI, AMK);
5129 Cost CostFB(L, SE, TTI, AMK);
5130 Regs.clear();
5131 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5132 Regs.clear();
5133 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5134 return CostFA.isLess(CostFB);
5135 };
5136
5137 bool Any = false;
5138 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5139 ++FIdx) {
5140 Formula &F = LU.Formulae[FIdx];
5141 if (!F.ScaledReg)
5142 continue;
5143 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5144 if (P.second)
5145 continue;
5146
5147 Formula &Best = LU.Formulae[P.first->second];
5148 if (IsBetterThan(F, Best))
5149 std::swap(F, Best);
5150 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5151 dbgs() << "\n"
5152 " in favor of formula ";
5153 Best.print(dbgs()); dbgs() << '\n');
5154#ifndef NDEBUG
5155 ChangedFormulae = true;
5156#endif
5157 LU.DeleteFormula(F);
5158 --FIdx;
5159 --NumForms;
5160 Any = true;
5161 }
5162 if (Any)
5163 LU.RecomputeRegs(LUIdx, RegUses);
5164
5165 // Reset this to prepare for the next use.
5166 BestFormulae.clear();
5167 }
5168
5169 LLVM_DEBUG(if (ChangedFormulae) {
5170 dbgs() << "\n"
5171 "After filtering out undesirable candidates:\n";
5172 print_uses(dbgs());
5173 });
5174}
5175
5176/// If we are over the complexity limit, filter out any post-inc prefering
5177/// variables to only post-inc values.
5178void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5179 if (AMK != TTI::AMK_PostIndexed)
5180 return;
5181 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5182 return;
5183
5184 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5185 "Narrowing the search space by choosing the lowest "
5186 "register Formula for PostInc Uses.\n");
5187
5188 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5189 LSRUse &LU = Uses[LUIdx];
5190
5191 if (LU.Kind != LSRUse::Address)
5192 continue;
5193 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5194 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5195 continue;
5196
5197 size_t MinRegs = std::numeric_limits<size_t>::max();
5198 for (const Formula &F : LU.Formulae)
5199 MinRegs = std::min(F.getNumRegs(), MinRegs);
5200
5201 bool Any = false;
5202 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5203 ++FIdx) {
5204 Formula &F = LU.Formulae[FIdx];
5205 if (F.getNumRegs() > MinRegs) {
5206 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5207 dbgs() << "\n");
5208 LU.DeleteFormula(F);
5209 --FIdx;
5210 --NumForms;
5211 Any = true;
5212 }
5213 }
5214 if (Any)
5215 LU.RecomputeRegs(LUIdx, RegUses);
5216
5217 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5218 break;
5219 }
5220
5221 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5222}
5223
5224void LSRInstance::NarrowSearchSpaceByMergingUsesOutsideLoop() {
5225 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5226 return;
5227
5228 LLVM_DEBUG(
5229 dbgs() << "The search space is too complex.\n"
5230 "Narrowing the search space by merging uses with fixups "
5231 "entirely outside the loop with uses inside the loop.\n");
5232
5233 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5234 LSRUse &LU = Uses[LUIdx];
5235 if (!LU.AllFixupsOutsideLoop || LU.Formulae.empty())
5236 continue;
5237
5238 LLVM_DEBUG(dbgs() << " Trying to eliminate use "; LU.print(dbgs());
5239 dbgs() << '\n');
5240
5241 // Find a compatible LSRUse inside the loop that we could merge LU with
5242 LSRUse *LUToMergeWith = nullptr;
5243 const Formula &ThisF = LU.Formulae[0];
5244 for (LSRUse &OtherLU : Uses) {
5245 // Only merge with uses inside the loop
5246 if (OtherLU.AllFixupsOutsideLoop)
5247 continue;
5248 // Can't merge with ICmpZero uses as they're handled specially when
5249 // expanding
5250 if (OtherLU.Kind == LSRUse::ICmpZero)
5251 continue;
5252 // Can't merge with uses without any formulae
5253 if (OtherLU.Formulae.empty())
5254 continue;
5255 // Can't merge if LU's offsets aren't legal for all of OtherLU's formulae
5256 if (any_of(OtherLU.Formulae, [&](const Formula &F) {
5257 return !isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, OtherLU.Kind,
5258 OtherLU.AccessTy, F);
5259 }))
5260 continue;
5261 // We can merge with uses that have the same initial formula. We allow
5262 // merging of uses with different Kind and AccessTy which means that the
5263 // cost may end up being inaccurate, but it's also what we would have
5264 // gotten if we'd ignored uses outside the loop entirely.
5265 const Formula &OtherF = OtherLU.Formulae[0];
5266 if (ThisF.BaseRegs == OtherF.BaseRegs &&
5267 ThisF.ScaledReg == OtherF.ScaledReg &&
5268 ThisF.BaseGV == OtherF.BaseGV && ThisF.Scale == OtherF.Scale &&
5269 ThisF.UnfoldedOffset == OtherF.UnfoldedOffset &&
5270 ThisF.BaseOffset == OtherF.BaseOffset) {
5271 LUToMergeWith = &OtherLU;
5272 break;
5273 }
5274 }
5275 if (!LUToMergeWith)
5276 continue;
5277
5278 LLVM_DEBUG(dbgs() << " Merging with "; LUToMergeWith->print(dbgs());
5279 dbgs() << '\n');
5280
5281 // Copy fixups
5282 for (LSRFixup &Fixup : LU.Fixups) {
5283 LUToMergeWith->pushFixup(Fixup);
5284 }
5285
5286 // Delete the old use.
5287 DeleteUse(LU, LUIdx);
5288 --LUIdx;
5289 --NumUses;
5290 }
5291
5292 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5293}
5294
5295/// The function delete formulas with high registers number expectation.
5296/// Assuming we don't know the value of each formula (already delete
5297/// all inefficient), generate probability of not selecting for each
5298/// register.
5299/// For example,
5300/// Use1:
5301/// reg(a) + reg({0,+,1})
5302/// reg(a) + reg({-1,+,1}) + 1
5303/// reg({a,+,1})
5304/// Use2:
5305/// reg(b) + reg({0,+,1})
5306/// reg(b) + reg({-1,+,1}) + 1
5307/// reg({b,+,1})
5308/// Use3:
5309/// reg(c) + reg(b) + reg({0,+,1})
5310/// reg(c) + reg({b,+,1})
5311///
5312/// Probability of not selecting
5313/// Use1 Use2 Use3
5314/// reg(a) (1/3) * 1 * 1
5315/// reg(b) 1 * (1/3) * (1/2)
5316/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5317/// reg({-1,+,1}) (2/3) * (2/3) * 1
5318/// reg({a,+,1}) (2/3) * 1 * 1
5319/// reg({b,+,1}) 1 * (2/3) * (2/3)
5320/// reg(c) 1 * 1 * 0
5321///
5322/// Now count registers number mathematical expectation for each formula:
5323/// Note that for each use we exclude probability if not selecting for the use.
5324/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
5325/// probabilty 1/3 of not selecting for Use1).
5326/// Use1:
5327/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5328/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5329/// reg({a,+,1}) 1
5330/// Use2:
5331/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5332/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5333/// reg({b,+,1}) 2/3
5334/// Use3:
5335/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5336/// reg(c) + reg({b,+,1}) 1 + 2/3
5337void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5338 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5339 return;
5340 // Ok, we have too many of formulae on our hands to conveniently handle.
5341 // Use a rough heuristic to thin out the list.
5342
5343 // Set of Regs wich will be 100% used in final solution.
5344 // Used in each formula of a solution (in example above this is reg(c)).
5345 // We can skip them in calculations.
5346 SmallPtrSet<const SCEV *, 4> UniqRegs;
5347 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5348
5349 // Map each register to probability of not selecting
5350 DenseMap <const SCEV *, float> RegNumMap;
5351 for (const SCEV *Reg : RegUses) {
5352 if (UniqRegs.count(Reg))
5353 continue;
5354 float PNotSel = 1;
5355 for (const LSRUse &LU : Uses) {
5356 if (!LU.Regs.count(Reg))
5357 continue;
5358 float P = LU.getNotSelectedProbability(Reg);
5359 if (P != 0.0)
5360 PNotSel *= P;
5361 else
5362 UniqRegs.insert(Reg);
5363 }
5364 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5365 }
5366
5367 LLVM_DEBUG(
5368 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5369
5370 // Delete formulas where registers number expectation is high.
5371 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5372 LSRUse &LU = Uses[LUIdx];
5373 // If nothing to delete - continue.
5374 if (LU.Formulae.size() < 2)
5375 continue;
5376 // This is temporary solution to test performance. Float should be
5377 // replaced with round independent type (based on integers) to avoid
5378 // different results for different target builds.
5379 float FMinRegNum = LU.Formulae[0].getNumRegs();
5380 float FMinARegNum = LU.Formulae[0].getNumRegs();
5381 size_t MinIdx = 0;
5382 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5383 Formula &F = LU.Formulae[i];
5384 float FRegNum = 0;
5385 float FARegNum = 0;
5386 for (const SCEV *BaseReg : F.BaseRegs) {
5387 if (UniqRegs.count(BaseReg))
5388 continue;
5389 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5390 if (isa<SCEVAddRecExpr>(BaseReg))
5391 FARegNum +=
5392 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5393 }
5394 if (const SCEV *ScaledReg = F.ScaledReg) {
5395 if (!UniqRegs.count(ScaledReg)) {
5396 FRegNum +=
5397 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5398 if (isa<SCEVAddRecExpr>(ScaledReg))
5399 FARegNum +=
5400 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5401 }
5402 }
5403 if (FMinRegNum > FRegNum ||
5404 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5405 FMinRegNum = FRegNum;
5406 FMinARegNum = FARegNum;
5407 MinIdx = i;
5408 }
5409 }
5410 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5411 dbgs() << " with min reg num " << FMinRegNum << '\n');
5412 if (MinIdx != 0)
5413 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5414 while (LU.Formulae.size() != 1) {
5415 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5416 dbgs() << '\n');
5417 LU.Formulae.pop_back();
5418 }
5419 LU.RecomputeRegs(LUIdx, RegUses);
5420 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5421 Formula &F = LU.Formulae[0];
5422 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5423 // When we choose the formula, the regs become unique.
5424 UniqRegs.insert_range(F.BaseRegs);
5425 if (F.ScaledReg)
5426 UniqRegs.insert(F.ScaledReg);
5427 }
5428 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5429}
5430
5431// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5432// would the addressing offset +C would be legal where the negative offset -C is
5433// not.
5435 ScalarEvolution &SE, const SCEV *Best,
5436 const SCEV *Reg,
5437 MemAccessTy AccessType) {
5438 if (Best->getType() != Reg->getType() ||
5440 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5441 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5442 return false;
5443 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5444 if (!Diff)
5445 return false;
5446
5447 return TTI.isLegalAddressingMode(
5448 AccessType.MemTy, /*BaseGV=*/nullptr,
5449 /*BaseOffset=*/Diff->getSExtValue(),
5450 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5451 !TTI.isLegalAddressingMode(
5452 AccessType.MemTy, /*BaseGV=*/nullptr,
5453 /*BaseOffset=*/-Diff->getSExtValue(),
5454 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5455}
5456
5457/// Pick a register which seems likely to be profitable, and then in any use
5458/// which has any reference to that register, delete all formulae which do not
5459/// reference that register.
5460void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5461 // With all other options exhausted, loop until the system is simple
5462 // enough to handle.
5463 SmallPtrSet<const SCEV *, 4> Taken;
5464 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5465 // Ok, we have too many of formulae on our hands to conveniently handle.
5466 // Use a rough heuristic to thin out the list.
5467 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5468
5469 // Pick the register which is used by the most LSRUses, which is likely
5470 // to be a good reuse register candidate.
5471 const SCEV *Best = nullptr;
5472 unsigned BestNum = 0;
5473 for (const SCEV *Reg : RegUses) {
5474 if (Taken.count(Reg))
5475 continue;
5476 if (!Best) {
5477 Best = Reg;
5478 BestNum = RegUses.getUsedByIndices(Reg).count();
5479 } else {
5480 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5481 if (Count > BestNum) {
5482 Best = Reg;
5483 BestNum = Count;
5484 }
5485
5486 // If the scores are the same, but the Reg is simpler for the target
5487 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5488 // handle +C but not -C), opt for the simpler formula.
5489 if (Count == BestNum) {
5490 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5491 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5493 Uses[LUIdx].AccessTy)) {
5494 Best = Reg;
5495 BestNum = Count;
5496 }
5497 }
5498 }
5499 }
5500 assert(Best && "Failed to find best LSRUse candidate");
5501
5502 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5503 << " will yield profitable reuse.\n");
5504 Taken.insert(Best);
5505
5506 // In any use with formulae which references this register, delete formulae
5507 // which don't reference it.
5508 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5509 LSRUse &LU = Uses[LUIdx];
5510 if (!LU.Regs.count(Best)) continue;
5511
5512 bool Any = false;
5513 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5514 Formula &F = LU.Formulae[i];
5515 if (!F.referencesReg(Best)) {
5516 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5517 LU.DeleteFormula(F);
5518 --e;
5519 --i;
5520 Any = true;
5521 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5522 continue;
5523 }
5524 }
5525
5526 if (Any)
5527 LU.RecomputeRegs(LUIdx, RegUses);
5528 }
5529
5530 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5531 }
5532}
5533
5534/// If there are an extraordinary number of formulae to choose from, use some
5535/// rough heuristics to prune down the number of formulae. This keeps the main
5536/// solver from taking an extraordinary amount of time in some worst-case
5537/// scenarios.
5538void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5539 NarrowSearchSpaceByDetectingSupersets();
5540 NarrowSearchSpaceByCollapsingUnrolledCode();
5541 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5543 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5544 NarrowSearchSpaceByFilterPostInc();
5545 NarrowSearchSpaceByMergingUsesOutsideLoop();
5546 if (LSRExpNarrow)
5547 NarrowSearchSpaceByDeletingCostlyFormulas();
5548 else
5549 NarrowSearchSpaceByPickingWinnerRegs();
5550}
5551
5552/// This is the recursive solver.
5553void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5554 Cost &SolutionCost,
5555 SmallVectorImpl<const Formula *> &Workspace,
5556 const Cost &CurCost,
5557 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5558 DenseSet<const SCEV *> &VisitedRegs) const {
5559 // Some ideas:
5560 // - prune more:
5561 // - use more aggressive filtering
5562 // - sort the formula so that the most profitable solutions are found first
5563 // - sort the uses too
5564 // - search faster:
5565 // - don't compute a cost, and then compare. compare while computing a cost
5566 // and bail early.
5567 // - track register sets with SmallBitVector
5568
5569 const LSRUse &LU = Uses[Workspace.size()];
5570
5571 // If this use references any register that's already a part of the
5572 // in-progress solution, consider it a requirement that a formula must
5573 // reference that register in order to be considered. This prunes out
5574 // unprofitable searching.
5575 SmallSetVector<const SCEV *, 4> ReqRegs;
5576 for (const SCEV *S : CurRegs)
5577 if (LU.Regs.count(S))
5578 ReqRegs.insert(S);
5579
5580 SmallPtrSet<const SCEV *, 16> NewRegs;
5581 Cost NewCost(L, SE, TTI, AMK);
5582 for (const Formula &F : LU.Formulae) {
5583 // Ignore formulae which may not be ideal in terms of register reuse of
5584 // ReqRegs. The formula should use all required registers before
5585 // introducing new ones.
5586 // This can sometimes (notably when trying to favour postinc) lead to
5587 // sub-optimial decisions. There it is best left to the cost modelling to
5588 // get correct.
5589 if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
5590 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5591 for (const SCEV *Reg : ReqRegs) {
5592 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5593 is_contained(F.BaseRegs, Reg)) {
5594 --NumReqRegsToFind;
5595 if (NumReqRegsToFind == 0)
5596 break;
5597 }
5598 }
5599 if (NumReqRegsToFind != 0) {
5600 // If none of the formulae satisfied the required registers, then we could
5601 // clear ReqRegs and try again. Currently, we simply give up in this case.
5602 continue;
5603 }
5604 }
5605
5606 // Evaluate the cost of the current formula. If it's already worse than
5607 // the current best, prune the search at that point.
5608 NewCost = CurCost;
5609 NewRegs = CurRegs;
5610 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5611 if (NewCost.isLess(SolutionCost)) {
5612 Workspace.push_back(&F);
5613 if (Workspace.size() != Uses.size()) {
5614 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5615 NewRegs, VisitedRegs);
5616 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5617 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5618 } else {
5619 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5620 dbgs() << ".\nRegs:\n";
5621 for (const SCEV *S : NewRegs) dbgs()
5622 << "- " << *S << "\n";
5623 dbgs() << '\n');
5624
5625 SolutionCost = NewCost;
5626 Solution = Workspace;
5627 }
5628 Workspace.pop_back();
5629 }
5630 }
5631}
5632
5633/// Choose one formula from each use. Return the results in the given Solution
5634/// vector.
5635void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5637 Cost SolutionCost(L, SE, TTI, AMK);
5638 SolutionCost.Lose();
5639 Cost CurCost(L, SE, TTI, AMK);
5640 SmallPtrSet<const SCEV *, 16> CurRegs;
5641 DenseSet<const SCEV *> VisitedRegs;
5642 Workspace.reserve(Uses.size());
5643
5644 // SolveRecurse does all the work.
5645 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5646 CurRegs, VisitedRegs);
5647 if (Solution.empty()) {
5648 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5649 return;
5650 }
5651
5652 // Ok, we've now made all our decisions.
5653 LLVM_DEBUG(dbgs() << "\n"
5654 "The chosen solution requires ";
5655 SolutionCost.print(dbgs()); dbgs() << ":\n";
5656 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5657 dbgs() << " ";
5658 Uses[i].print(dbgs());
5659 dbgs() << "\n"
5660 " ";
5661 Solution[i]->print(dbgs());
5662 dbgs() << '\n';
5663 });
5664
5665 assert(Solution.size() == Uses.size() && "Malformed solution!");
5666
5667 const bool EnableDropUnprofitableSolution = [&] {
5669 case cl::BOU_TRUE:
5670 return true;
5671 case cl::BOU_FALSE:
5672 return false;
5673 case cl::BOU_UNSET:
5675 }
5676 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5677 }();
5678
5679 if (BaselineCost.isLess(SolutionCost)) {
5680 if (!EnableDropUnprofitableSolution)
5681 LLVM_DEBUG(
5682 dbgs() << "Baseline is more profitable than chosen solution, "
5683 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5684 else {
5685 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5686 "solution, dropping LSR solution.\n";);
5687 Solution.clear();
5688 }
5689 }
5690}
5691
5692/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
5693/// we can go while still being dominated by the input positions. This helps
5694/// canonicalize the insert position, which encourages sharing.
5696LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5697 const SmallVectorImpl<Instruction *> &Inputs)
5698 const {
5699 Instruction *Tentative = &*IP;
5700 while (true) {
5701 bool AllDominate = true;
5702 Instruction *BetterPos = nullptr;
5703 // Don't bother attempting to insert before a catchswitch, their basic block
5704 // cannot have other non-PHI instructions.
5705 if (isa<CatchSwitchInst>(Tentative))
5706 return IP;
5707
5708 for (Instruction *Inst : Inputs) {
5709 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5710 AllDominate = false;
5711 break;
5712 }
5713 // Attempt to find an insert position in the middle of the block,
5714 // instead of at the end, so that it can be used for other expansions.
5715 if (Tentative->getParent() == Inst->getParent() &&
5716 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5717 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5718 }
5719 if (!AllDominate)
5720 break;
5721 if (BetterPos)
5722 IP = BetterPos->getIterator();
5723 else
5724 IP = Tentative->getIterator();
5725
5726 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5727 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5728
5729 BasicBlock *IDom;
5730 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5731 if (!Rung) return IP;
5732 Rung = Rung->getIDom();
5733 if (!Rung) return IP;
5734 IDom = Rung->getBlock();
5735
5736 // Don't climb into a loop though.
5737 const Loop *IDomLoop = LI.getLoopFor(IDom);
5738 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5739 if (IDomDepth <= IPLoopDepth &&
5740 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5741 break;
5742 }
5743
5744 Tentative = IDom->getTerminator();
5745 }
5746
5747 return IP;
5748}
5749
5750/// Determine an input position which will be dominated by the operands and
5751/// which will dominate the result.
5752BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5753 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5754 // Collect some instructions which must be dominated by the
5755 // expanding replacement. These must be dominated by any operands that
5756 // will be required in the expansion.
5757 SmallVector<Instruction *, 4> Inputs;
5758 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5759 Inputs.push_back(I);
5760 if (LU.Kind == LSRUse::ICmpZero)
5761 if (Instruction *I =
5762 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5763 Inputs.push_back(I);
5764 if (LF.PostIncLoops.count(L)) {
5765 if (LF.isUseFullyOutsideLoop(L))
5766 Inputs.push_back(L->getLoopLatch()->getTerminator());
5767 else
5768 Inputs.push_back(IVIncInsertPos);
5769 }
5770 // The expansion must also be dominated by the increment positions of any
5771 // loops it for which it is using post-inc mode.
5772 for (const Loop *PIL : LF.PostIncLoops) {
5773 if (PIL == L) continue;
5774
5775 // Be dominated by the loop exit.
5776 SmallVector<BasicBlock *, 4> ExitingBlocks;
5777 PIL->getExitingBlocks(ExitingBlocks);
5778 if (!ExitingBlocks.empty()) {
5779 BasicBlock *BB = ExitingBlocks[0];
5780 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5781 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5782 Inputs.push_back(BB->getTerminator());
5783 }
5784 }
5785
5786 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5787 "Insertion point must be a normal instruction");
5788
5789 // Then, climb up the immediate dominator tree as far as we can go while
5790 // still being dominated by the input positions.
5791 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5792
5793 // Don't insert instructions before PHI nodes.
5794 while (isa<PHINode>(IP)) ++IP;
5795
5796 // Ignore landingpad instructions.
5797 while (IP->isEHPad()) ++IP;
5798
5799 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5800 // IP consistent across expansions and allows the previously inserted
5801 // instructions to be reused by subsequent expansion.
5802 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5803 ++IP;
5804
5805 return IP;
5806}
5807
5808/// Emit instructions for the leading candidate expression for this LSRUse (this
5809/// is called "expanding").
5810Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5811 const Formula &F, BasicBlock::iterator IP,
5812 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5813 if (LU.RigidFormula)
5814 return LF.OperandValToReplace;
5815
5816 // Determine an input position which will be dominated by the operands and
5817 // which will dominate the result.
5818 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5819 Rewriter.setInsertPoint(&*IP);
5820
5821 // Inform the Rewriter if we have a post-increment use, so that it can
5822 // perform an advantageous expansion.
5823 Rewriter.setPostInc(LF.PostIncLoops);
5824
5825 // This is the type that the user actually needs.
5826 Type *OpTy = LF.OperandValToReplace->getType();
5827 // For ICmpZero with pointer-typed operands, keep the comparison in the
5828 // integer domain to avoid generating inttoptr casts.
5829 if (LU.Kind == LSRUse::ICmpZero && OpTy->isPointerTy())
5830 OpTy = SE.getEffectiveSCEVType(OpTy);
5831 // This will be the type that we'll initially expand to.
5832 Type *Ty = F.getType();
5833 if (!Ty)
5834 // No type known; just expand directly to the ultimate type.
5835 Ty = OpTy;
5836 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5837 // Expand directly to the ultimate type if it's the right size.
5838 Ty = OpTy;
5839 // This is the type to do integer arithmetic in.
5840 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5841
5842 // Build up a list of operands to add together to form the full base.
5844
5845 // Expand the BaseRegs portion.
5846 for (const SCEV *Reg : F.BaseRegs) {
5847 assert(!Reg->isZero() && "Zero allocated in a base register!");
5848
5849 // If we're expanding for a post-inc user, make the post-inc adjustment.
5850 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5851 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5852 }
5853
5854 // Expand the ScaledReg portion.
5855 Value *ICmpScaledV = nullptr;
5856 if (F.Scale != 0) {
5857 const SCEV *ScaledS = F.ScaledReg;
5858
5859 // If we're expanding for a post-inc user, make the post-inc adjustment.
5860 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5861 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5862
5863 if (LU.Kind == LSRUse::ICmpZero) {
5864 // Expand ScaleReg as if it was part of the base regs.
5865 if (F.Scale == 1)
5866 Ops.push_back(
5867 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5868 else {
5869 // An interesting way of "folding" with an icmp is to use a negated
5870 // scale, which we'll implement by inserting it into the other operand
5871 // of the icmp.
5872 assert(F.Scale == -1 &&
5873 "The only scale supported by ICmpZero uses is -1!");
5874 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5875 }
5876 } else {
5877 // Otherwise just expand the scaled register and an explicit scale,
5878 // which is expected to be matched as part of the address.
5879
5880 // Flush the operand list to suppress SCEVExpander hoisting address modes.
5881 // Unless the addressing mode will not be folded.
5882 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5883 isAMCompletelyFolded(TTI, LU, F)) {
5884 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5885 Ops.clear();
5886 Ops.push_back(SE.getUnknown(FullV));
5887 }
5888 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5889 if (F.Scale != 1)
5890 ScaledS =
5891 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5892 Ops.push_back(ScaledS);
5893 }
5894 }
5895
5896 // Expand the GV portion.
5897 if (F.BaseGV) {
5898 // Flush the operand list to suppress SCEVExpander hoisting.
5899 if (!Ops.empty()) {
5900 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5901 Ops.clear();
5902 Ops.push_back(SE.getUnknown(FullV));
5903 }
5904 Ops.push_back(SE.getUnknown(F.BaseGV));
5905 }
5906
5907 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5908 // unfolded offsets. LSR assumes they both live next to their uses.
5909 if (!Ops.empty()) {
5910 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5911 Ops.clear();
5912 Ops.push_back(SE.getUnknown(FullV));
5913 }
5914
5915 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5916 // out at this point, or should we generate a SCEV adding together mixed
5917 // offsets?
5918 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5919 "Expanding mismatched offsets\n");
5920 // Expand the immediate portion.
5921 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5922 if (Offset.isNonZero()) {
5923 if (LU.Kind == LSRUse::ICmpZero) {
5924 // The other interesting way of "folding" with an ICmpZero is to use a
5925 // negated immediate.
5926 if (!ICmpScaledV) {
5927 // TODO: Avoid implicit trunc?
5928 // See https://github.com/llvm/llvm-project/issues/112510.
5929 ICmpScaledV = ConstantInt::getSigned(
5930 IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
5931 } else {
5932 Ops.push_back(SE.getUnknown(ICmpScaledV));
5933 ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
5934 /*ImplicitTrunc=*/true);
5935 }
5936 } else {
5937 // Just add the immediate values. These again are expected to be matched
5938 // as part of the address.
5939 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5940 }
5941 }
5942
5943 // Expand the unfolded offset portion.
5944 Immediate UnfoldedOffset = F.UnfoldedOffset;
5945 if (UnfoldedOffset.isNonZero()) {
5946 // Just add the immediate values.
5947 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5948 }
5949
5950 // Emit instructions summing all the operands.
5951 const SCEV *FullS = Ops.empty() ?
5952 SE.getConstant(IntTy, 0) :
5953 SE.getAddExpr(Ops);
5954 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5955
5956 // We're done expanding now, so reset the rewriter.
5957 Rewriter.clearPostInc();
5958
5959 // An ICmpZero Formula represents an ICmp which we're handling as a
5960 // comparison against zero. Now that we've expanded an expression for that
5961 // form, update the ICmp's other operand.
5962 if (LU.Kind == LSRUse::ICmpZero) {
5963 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5964 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5965 DeadInsts.emplace_back(OperandIsInstr);
5966 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5967 "a scale at the same time!");
5968 if (F.Scale == -1) {
5969 if (ICmpScaledV->getType() != OpTy) {
5971 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5972 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5973 ICmpScaledV = Cast;
5974 }
5975 CI->setOperand(1, ICmpScaledV);
5976 } else {
5977 // A scale of 1 means that the scale has been expanded as part of the
5978 // base regs.
5979 assert((F.Scale == 0 || F.Scale == 1) &&
5980 "ICmp does not support folding a global value and "
5981 "a scale at the same time!");
5982 // TODO: Avoid implicit trunc?
5983 // See https://github.com/llvm/llvm-project/issues/112510.
5985 -(uint64_t)Offset.getFixedValue(),
5986 /*ImplicitTrunc=*/true);
5987 if (C->getType() != OpTy) {
5989 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5990 CI->getDataLayout());
5991 assert(C && "Cast of ConstantInt should have folded");
5992 }
5993
5994 CI->setOperand(1, C);
5995 }
5996 }
5997
5998 return FullV;
5999}
6000
6001/// Helper for Rewrite. PHI nodes are special because the use of their operands
6002/// effectively happens in their predecessor blocks, so the expression may need
6003/// to be expanded in multiple places.
6004void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
6005 const LSRFixup &LF, const Formula &F,
6006 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6007 DenseMap<BasicBlock *, Value *> Inserted;
6008
6009 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
6010 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
6011 bool needUpdateFixups = false;
6012 BasicBlock *BB = PN->getIncomingBlock(i);
6013
6014 // If this is a critical edge, split the edge so that we do not insert
6015 // the code on all predecessor/successor paths. We do this unless this
6016 // is the canonical backedge for this loop, which complicates post-inc
6017 // users.
6018 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
6021 BasicBlock *Parent = PN->getParent();
6022 Loop *PNLoop = LI.getLoopFor(Parent);
6023 if (!PNLoop || Parent != PNLoop->getHeader()) {
6024 // Split the critical edge.
6025 BasicBlock *NewBB = nullptr;
6026 if (!Parent->isLandingPad()) {
6027 NewBB =
6028 SplitCriticalEdge(BB, Parent,
6029 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
6030 .setMergeIdenticalEdges()
6031 .setKeepOneInputPHIs());
6032 } else {
6034 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
6035 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
6036 NewBB = NewBBs[0];
6037 }
6038 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
6039 // phi predecessors are identical. The simple thing to do is skip
6040 // splitting in this case rather than complicate the API.
6041 if (NewBB) {
6042 // If PN is outside of the loop and BB is in the loop, we want to
6043 // move the block to be immediately before the PHI block, not
6044 // immediately after BB.
6045 if (L->contains(BB) && !L->contains(PN))
6046 NewBB->moveBefore(PN->getParent());
6047
6048 // Splitting the edge can reduce the number of PHI entries we have.
6049 e = PN->getNumIncomingValues();
6050 BB = NewBB;
6051 i = PN->getBasicBlockIndex(BB);
6052
6053 needUpdateFixups = true;
6054 }
6055 }
6056 }
6057
6058 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
6059 Inserted.try_emplace(BB);
6060 if (!Pair.second)
6061 PN->setIncomingValue(i, Pair.first->second);
6062 else {
6063 Value *FullV =
6064 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
6065
6066 // If this is reuse-by-noop-cast, insert the noop cast.
6067 Type *OpTy = LF.OperandValToReplace->getType();
6068 if (FullV->getType() != OpTy)
6069 FullV = CastInst::Create(
6070 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
6071 LF.OperandValToReplace->getType(), "tmp",
6072 BB->getTerminator()->getIterator());
6073
6074 // If the incoming block for this value is not in the loop, it means the
6075 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
6076 // the inserted value.
6077 if (auto *I = dyn_cast<Instruction>(FullV))
6078 if (L->contains(I) && !L->contains(BB))
6079 InsertedNonLCSSAInsts.insert(I);
6080
6081 PN->setIncomingValue(i, FullV);
6082 Pair.first->second = FullV;
6083 }
6084
6085 // If LSR splits critical edge and phi node has other pending
6086 // fixup operands, we need to update those pending fixups. Otherwise
6087 // formulae will not be implemented completely and some instructions
6088 // will not be eliminated.
6089 if (needUpdateFixups) {
6090 for (LSRUse &LU : Uses)
6091 for (LSRFixup &Fixup : LU.Fixups)
6092 // If fixup is supposed to rewrite some operand in the phi
6093 // that was just updated, it may be already moved to
6094 // another phi node. Such fixup requires update.
6095 if (Fixup.UserInst == PN) {
6096 // Check if the operand we try to replace still exists in the
6097 // original phi.
6098 bool foundInOriginalPHI = false;
6099 for (const auto &val : PN->incoming_values())
6100 if (val == Fixup.OperandValToReplace) {
6101 foundInOriginalPHI = true;
6102 break;
6103 }
6104
6105 // If fixup operand found in original PHI - nothing to do.
6106 if (foundInOriginalPHI)
6107 continue;
6108
6109 // Otherwise it might be moved to another PHI and requires update.
6110 // If fixup operand not found in any of the incoming blocks that
6111 // means we have already rewritten it - nothing to do.
6112 for (const auto &Block : PN->blocks())
6113 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
6114 ++I) {
6115 PHINode *NewPN = cast<PHINode>(I);
6116 for (const auto &val : NewPN->incoming_values())
6117 if (val == Fixup.OperandValToReplace)
6118 Fixup.UserInst = NewPN;
6119 }
6120 }
6121 }
6122 }
6123}
6124
6125/// Emit instructions for the leading candidate expression for this LSRUse (this
6126/// is called "expanding"), and update the UserInst to reference the newly
6127/// expanded value.
6128void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6129 const Formula &F,
6130 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6131 // First, find an insertion point that dominates UserInst. For PHI nodes,
6132 // find the nearest block which dominates all the relevant uses.
6133 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6134 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6135 } else {
6136 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6137
6138 // If this is reuse-by-noop-cast, insert the noop cast.
6139 // For ICmpZero with pointer operands, Expand() already set both operands
6140 // in integer domain, so no cast is needed here.
6141 Type *OpTy = LF.OperandValToReplace->getType();
6142 if (FullV->getType() != OpTy &&
6143 !(LU.Kind == LSRUse::ICmpZero && OpTy->isPointerTy())) {
6144 Instruction *Cast =
6145 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6146 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6147 FullV = Cast;
6148 }
6149
6150 // Update the user. ICmpZero is handled specially here (for now) because
6151 // Expand may have updated one of the operands of the icmp already, and
6152 // its new value may happen to be equal to LF.OperandValToReplace, in
6153 // which case doing replaceUsesOfWith leads to replacing both operands
6154 // with the same value. TODO: Reorganize this.
6155 if (LU.Kind == LSRUse::ICmpZero)
6156 LF.UserInst->setOperand(0, FullV);
6157 else
6158 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6159 }
6160
6161 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6162 DeadInsts.emplace_back(OperandIsInstr);
6163}
6164
6165// Determine where to insert the transformed IV increment instruction for this
6166// fixup. By default this is the default insert position, but if this is a
6167// postincrement opportunity then we try to insert it in the same block as the
6168// fixup user instruction, as this is needed for a postincrement instruction to
6169// be generated.
6171 const LSRFixup &Fixup, const LSRUse &LU,
6172 Instruction *IVIncInsertPos,
6173 DominatorTree &DT) {
6174 // Only address uses can be postincremented
6175 if (LU.Kind != LSRUse::Address)
6176 return IVIncInsertPos;
6177
6178 // Don't try to postincrement if it's not legal
6179 Instruction *I = Fixup.UserInst;
6180 Type *Ty = I->getType();
6181 if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
6182 !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
6183 return IVIncInsertPos;
6184
6185 // It's only legal to hoist to the user block if it dominates the default
6186 // insert position.
6187 BasicBlock *HoistBlock = I->getParent();
6188 BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
6189 if (!DT.dominates(I, IVIncBlock))
6190 return IVIncInsertPos;
6191
6192 return HoistBlock->getTerminator();
6193}
6194
6195/// Rewrite all the fixup locations with new values, following the chosen
6196/// solution.
6197void LSRInstance::ImplementSolution(
6198 const SmallVectorImpl<const Formula *> &Solution) {
6199 // Keep track of instructions we may have made dead, so that
6200 // we can remove them after we are done working.
6202
6203 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6204 for (const IVChain &Chain : IVChainVec) {
6205 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6206 Rewriter.setChainedPhi(PN);
6207 }
6208
6209 // Expand the new value definitions and update the users.
6210 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6211 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6212 Instruction *InsertPos =
6213 getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
6214 Rewriter.setIVIncInsertPos(L, InsertPos);
6215 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6216 Changed = true;
6217 }
6218
6219 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6220 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6221
6222 for (const IVChain &Chain : IVChainVec) {
6223 GenerateIVChain(Chain, DeadInsts);
6224 Changed = true;
6225 }
6226
6227 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6228 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6229 ScalarEvolutionIVs.push_back(IV);
6230
6231 // Clean up after ourselves. This must be done before deleting any
6232 // instructions.
6233 Rewriter.clear();
6234
6236 &TLI, MSSAU);
6237
6238 // In our cost analysis above, we assume that each addrec consumes exactly
6239 // one register, and arrange to have increments inserted just before the
6240 // latch to maximimize the chance this is true. However, if we reused
6241 // existing IVs, we now need to move the increments to match our
6242 // expectations. Otherwise, our cost modeling results in us having a
6243 // chosen a non-optimal result for the actual schedule. (And yes, this
6244 // scheduling decision does impact later codegen.)
6245 for (PHINode &PN : L->getHeader()->phis()) {
6246 BinaryOperator *BO = nullptr;
6247 Value *Start = nullptr, *Step = nullptr;
6248 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6249 continue;
6250
6251 switch (BO->getOpcode()) {
6252 case Instruction::Sub:
6253 if (BO->getOperand(0) != &PN)
6254 // sub is non-commutative - match handling elsewhere in LSR
6255 continue;
6256 break;
6257 case Instruction::Add:
6258 break;
6259 default:
6260 continue;
6261 };
6262
6263 if (!isa<Constant>(Step))
6264 // If not a constant step, might increase register pressure
6265 // (We assume constants have been canonicalized to RHS)
6266 continue;
6267
6268 if (BO->getParent() == IVIncInsertPos->getParent())
6269 // Only bother moving across blocks. Isel can handle block local case.
6270 continue;
6271
6272 // Can we legally schedule inc at the desired point?
6273 if (!llvm::all_of(BO->uses(),
6274 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6275 continue;
6276 BO->moveBefore(IVIncInsertPos->getIterator());
6277 Changed = true;
6278 }
6279
6280
6281}
6282
6283LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6284 DominatorTree &DT, LoopInfo &LI,
6285 const TargetTransformInfo &TTI, AssumptionCache &AC,
6286 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6287 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6288 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6290 : TTI.getPreferredAddressingMode(L, &SE)),
6291 Rewriter(SE, "lsr", false), BaselineCost(L, SE, TTI, AMK) {
6292 // If LoopSimplify form is not available, stay out of trouble.
6293 if (!L->isLoopSimplifyForm())
6294 return;
6295
6296 // If there's no interesting work to be done, bail early.
6297 if (IU.empty()) return;
6298
6299 // If there's too much analysis to be done, bail early. We won't be able to
6300 // model the problem anyway.
6301 unsigned NumUsers = 0;
6302 for (const IVStrideUse &U : IU) {
6303 if (++NumUsers > MaxIVUsers) {
6304 (void)U;
6305 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6306 << "\n");
6307 return;
6308 }
6309 // Bail out if we have a PHI on an EHPad that gets a value from a
6310 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6311 // no good place to stick any instructions.
6312 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6313 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6314 if (isa<FuncletPadInst>(FirstNonPHI) ||
6315 isa<CatchSwitchInst>(FirstNonPHI))
6316 for (BasicBlock *PredBB : PN->blocks())
6317 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6318 return;
6319 }
6320 }
6321
6322 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6323 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6324 dbgs() << ":\n");
6325
6326 // Check if we expect this loop to use a hardware loop instruction, which will
6327 // be used when calculating the costs of formulas.
6328 HardwareLoopInfo HWLoopInfo(L);
6329 HardwareLoopProfitable =
6330 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6331
6332 // Configure SCEVExpander already now, so the correct mode is used for
6333 // isSafeToExpand() checks.
6334#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6335 Rewriter.setDebugType(DEBUG_TYPE);
6336#endif
6337 Rewriter.disableCanonicalMode();
6338 Rewriter.enableLSRMode();
6339
6340 // First, perform some low-level loop optimizations.
6341 OptimizeShadowIV();
6342 OptimizeLoopTermCond();
6343
6344 // If loop preparation eliminates all interesting IV users, bail.
6345 if (IU.empty()) return;
6346
6347 // Skip nested loops until we can model them better with formulae.
6348 if (!L->isInnermost()) {
6349 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6350 return;
6351 }
6352
6353 // Start collecting data and preparing for the solver.
6354 // If number of registers is not the major cost, we cannot benefit from the
6355 // current profitable chain optimization which is based on number of
6356 // registers.
6357 // FIXME: add profitable chain optimization for other kinds major cost, for
6358 // example number of instructions.
6359 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6360 CollectChains();
6361 CollectInterestingTypesAndFactors();
6362 CollectFixupsAndInitialFormulae();
6363 CollectLoopInvariantFixupsAndFormulae();
6364
6365 if (Uses.empty())
6366 return;
6367
6368 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6369 print_uses(dbgs()));
6370 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6371 BaselineCost.print(dbgs()); dbgs() << "\n");
6372
6373 // Now use the reuse data to generate a bunch of interesting ways
6374 // to formulate the values needed for the uses.
6375 GenerateAllReuseFormulae();
6376
6377 FilterOutUndesirableDedicatedRegisters();
6378 NarrowSearchSpaceUsingHeuristics();
6379
6381 Solve(Solution);
6382
6383 // Release memory that is no longer needed.
6384 Factors.clear();
6385 Types.clear();
6386 RegUses.clear();
6387
6388 if (Solution.empty())
6389 return;
6390
6391#ifndef NDEBUG
6392 // Formulae should be legal.
6393 for (const LSRUse &LU : Uses) {
6394 for (const Formula &F : LU.Formulae)
6395 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6396 F) && "Illegal formula generated!");
6397 };
6398#endif
6399
6400 // Now that we've decided what we want, make it so.
6401 ImplementSolution(Solution);
6402}
6403
6404#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6405void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6406 if (Factors.empty() && Types.empty()) return;
6407
6408 OS << "LSR has identified the following interesting factors and types: ";
6409 ListSeparator LS;
6410
6411 for (int64_t Factor : Factors)
6412 OS << LS << '*' << Factor;
6413
6414 for (Type *Ty : Types)
6415 OS << LS << '(' << *Ty << ')';
6416 OS << '\n';
6417}
6418
6419void LSRInstance::print_fixups(raw_ostream &OS) const {
6420 OS << "LSR is examining the following fixup sites:\n";
6421 for (const LSRUse &LU : Uses)
6422 for (const LSRFixup &LF : LU.Fixups) {
6423 dbgs() << " ";
6424 LF.print(OS);
6425 OS << '\n';
6426 }
6427}
6428
6429void LSRInstance::print_uses(raw_ostream &OS) const {
6430 OS << "LSR is examining the following uses:\n";
6431 for (const LSRUse &LU : Uses) {
6432 dbgs() << " ";
6433 LU.print(OS);
6434 OS << '\n';
6435 for (const Formula &F : LU.Formulae) {
6436 OS << " ";
6437 F.print(OS);
6438 OS << '\n';
6439 }
6440 }
6441}
6442
6443void LSRInstance::print(raw_ostream &OS) const {
6444 print_factors_and_types(OS);
6445 print_fixups(OS);
6446 print_uses(OS);
6447}
6448
6449LLVM_DUMP_METHOD void LSRInstance::dump() const {
6450 print(errs()); errs() << '\n';
6451}
6452#endif
6453
6454namespace {
6455
6456class LoopStrengthReduce : public LoopPass {
6457public:
6458 static char ID; // Pass ID, replacement for typeid
6459
6460 LoopStrengthReduce();
6461
6462private:
6463 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6464 void getAnalysisUsage(AnalysisUsage &AU) const override;
6465};
6466
6467} // end anonymous namespace
6468
6469LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6471}
6472
6473void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6474 // We split critical edges, so we change the CFG. However, we do update
6475 // many analyses if they are around.
6477
6478 AU.addRequired<LoopInfoWrapperPass>();
6479 AU.addPreserved<LoopInfoWrapperPass>();
6481 AU.addRequired<DominatorTreeWrapperPass>();
6482 AU.addPreserved<DominatorTreeWrapperPass>();
6483 AU.addRequired<ScalarEvolutionWrapperPass>();
6484 AU.addPreserved<ScalarEvolutionWrapperPass>();
6485 AU.addRequired<AssumptionCacheTracker>();
6486 AU.addRequired<TargetLibraryInfoWrapperPass>();
6487 // Requiring LoopSimplify a second time here prevents IVUsers from running
6488 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6490 AU.addRequired<IVUsersWrapperPass>();
6491 AU.addPreserved<IVUsersWrapperPass>();
6492 AU.addRequired<TargetTransformInfoWrapperPass>();
6493 AU.addPreserved<MemorySSAWrapperPass>();
6494}
6495
6496namespace {
6497
6498/// Enables more convenient iteration over a DWARF expression vector.
6500ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6501 llvm::DIExpression::expr_op_iterator Begin =
6502 llvm::DIExpression::expr_op_iterator(Expr.begin());
6503 llvm::DIExpression::expr_op_iterator End =
6504 llvm::DIExpression::expr_op_iterator(Expr.end());
6505 return {Begin, End};
6506}
6507
6508struct SCEVDbgValueBuilder {
6509 SCEVDbgValueBuilder() = default;
6510 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6511
6512 void clone(const SCEVDbgValueBuilder &Base) {
6513 LocationOps = Base.LocationOps;
6514 Expr = Base.Expr;
6515 }
6516
6517 void clear() {
6518 LocationOps.clear();
6519 Expr.clear();
6520 }
6521
6522 /// The DIExpression as we translate the SCEV.
6524 /// The location ops of the DIExpression.
6525 SmallVector<Value *, 2> LocationOps;
6526
6527 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6528 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6529
6530 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6531 /// in the set of values referenced by the expression.
6532 void pushLocation(llvm::Value *V) {
6534 auto *It = llvm::find(LocationOps, V);
6535 unsigned ArgIndex = 0;
6536 if (It != LocationOps.end()) {
6537 ArgIndex = std::distance(LocationOps.begin(), It);
6538 } else {
6539 ArgIndex = LocationOps.size();
6540 LocationOps.push_back(V);
6541 }
6542 Expr.push_back(ArgIndex);
6543 }
6544
6545 void pushValue(const SCEVUnknown *U) {
6546 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6547 pushLocation(V);
6548 }
6549
6550 bool pushConst(const SCEVConstant *C) {
6551 if (C->getAPInt().getSignificantBits() > 64)
6552 return false;
6553 Expr.push_back(llvm::dwarf::DW_OP_consts);
6554 Expr.push_back(C->getAPInt().getSExtValue());
6555 return true;
6556 }
6557
6558 // Iterating the expression as DWARF ops is convenient when updating
6559 // DWARF_OP_LLVM_args.
6561 return ToDwarfOpIter(Expr);
6562 }
6563
6564 /// Several SCEV types are sequences of the same arithmetic operator applied
6565 /// to constants and values that may be extended or truncated.
6566 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6567 uint64_t DwarfOp) {
6568 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6569 "Expected arithmetic SCEV type");
6570 bool Success = true;
6571 unsigned EmitOperator = 0;
6572 for (const auto &Op : CommExpr->operands()) {
6573 Success &= pushSCEV(Op);
6574
6575 if (EmitOperator >= 1)
6576 pushOperator(DwarfOp);
6577 ++EmitOperator;
6578 }
6579 return Success;
6580 }
6581
6582 // TODO: Identify and omit noop casts.
6583 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6584 const llvm::SCEV *Inner = C->getOperand(0);
6585 const llvm::Type *Type = C->getType();
6586 uint64_t ToWidth = Type->getIntegerBitWidth();
6587 bool Success = pushSCEV(Inner);
6588 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6589 IsSigned ? llvm::dwarf::DW_ATE_signed
6590 : llvm::dwarf::DW_ATE_unsigned};
6591 for (const auto &Op : CastOps)
6592 pushOperator(Op);
6593 return Success;
6594 }
6595
6596 // TODO: MinMax - although these haven't been encountered in the test suite.
6597 bool pushSCEV(const llvm::SCEV *S) {
6598 bool Success = true;
6599 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6600 Success &= pushConst(StartInt);
6601
6602 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6603 if (!U->getValue())
6604 return false;
6605 pushLocation(U->getValue());
6606
6607 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6608 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6609
6610 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6611 Success &= pushSCEV(UDiv->getLHS());
6612 Success &= pushSCEV(UDiv->getRHS());
6613 pushOperator(llvm::dwarf::DW_OP_div);
6614
6615 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6616 // Assert if a new and unknown SCEVCastEXpr type is encountered.
6619 isa<SCEVSignExtendExpr>(Cast)) &&
6620 "Unexpected cast type in SCEV.");
6621 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6622
6623 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6624 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6625
6626 } else if (isa<SCEVAddRecExpr>(S)) {
6627 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6628 // unsupported.
6629 return false;
6630
6631 } else {
6632 return false;
6633 }
6634 return Success;
6635 }
6636
6637 /// Return true if the combination of arithmetic operator and underlying
6638 /// SCEV constant value is an identity function.
6639 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6640 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6641 if (C->getAPInt().getSignificantBits() > 64)
6642 return false;
6643 int64_t I = C->getAPInt().getSExtValue();
6644 switch (Op) {
6645 case llvm::dwarf::DW_OP_plus:
6646 case llvm::dwarf::DW_OP_minus:
6647 return I == 0;
6648 case llvm::dwarf::DW_OP_mul:
6649 case llvm::dwarf::DW_OP_div:
6650 return I == 1;
6651 }
6652 }
6653 return false;
6654 }
6655
6656 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6657 /// builder's expression stack. The stack should already contain an
6658 /// expression for the iteration count, so that it can be multiplied by
6659 /// the stride and added to the start.
6660 /// Components of the expression are omitted if they are an identity function.
6661 /// Chain (non-affine) SCEVs are not supported.
6662 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6663 assert(SAR.isAffine() && "Expected affine SCEV");
6664 const SCEV *Start = SAR.getStart();
6665 const SCEV *Stride = SAR.getStepRecurrence(SE);
6666
6667 // Skip pushing arithmetic noops.
6668 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6669 if (!pushSCEV(Stride))
6670 return false;
6671 pushOperator(llvm::dwarf::DW_OP_mul);
6672 }
6673 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6674 if (!pushSCEV(Start))
6675 return false;
6676 pushOperator(llvm::dwarf::DW_OP_plus);
6677 }
6678 return true;
6679 }
6680
6681 /// Create an expression that is an offset from a value (usually the IV).
6682 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6683 pushLocation(OffsetValue);
6685 LLVM_DEBUG(
6686 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6687 << std::to_string(Offset) << "\n");
6688 }
6689
6690 /// Combine a translation of the SCEV and the IV to create an expression that
6691 /// recovers a location's value.
6692 /// returns true if an expression was created.
6693 bool createIterCountExpr(const SCEV *S,
6694 const SCEVDbgValueBuilder &IterationCount,
6695 ScalarEvolution &SE) {
6696 // SCEVs for SSA values are most frquently of the form
6697 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6698 // This is because %a is a PHI node that is not the IV. However, these
6699 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6700 // so its not expected this point will be reached.
6701 if (!isa<SCEVAddRecExpr>(S))
6702 return false;
6703
6704 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6705 << '\n');
6706
6707 const auto *Rec = cast<SCEVAddRecExpr>(S);
6708 if (!Rec->isAffine())
6709 return false;
6710
6712 return false;
6713
6714 // Initialise a new builder with the iteration count expression. In
6715 // combination with the value's SCEV this enables recovery.
6716 clone(IterationCount);
6717 if (!SCEVToValueExpr(*Rec, SE))
6718 return false;
6719
6720 return true;
6721 }
6722
6723 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6724 /// builder's expression stack. The stack should already contain an
6725 /// expression for the iteration count, so that it can be multiplied by
6726 /// the stride and added to the start.
6727 /// Components of the expression are omitted if they are an identity function.
6728 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6729 ScalarEvolution &SE) {
6730 assert(SAR.isAffine() && "Expected affine SCEV");
6731 const SCEV *Start = SAR.getStart();
6732 const SCEV *Stride = SAR.getStepRecurrence(SE);
6733
6734 // Skip pushing arithmetic noops.
6735 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6736 if (!pushSCEV(Start))
6737 return false;
6738 pushOperator(llvm::dwarf::DW_OP_minus);
6739 }
6740 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6741 if (!pushSCEV(Stride))
6742 return false;
6743 pushOperator(llvm::dwarf::DW_OP_div);
6744 }
6745 return true;
6746 }
6747
6748 // Append the current expression and locations to a location list and an
6749 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6750 // the locations already present in the destination list.
6751 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6752 SmallVectorImpl<Value *> &DestLocations) {
6753 assert(!DestLocations.empty() &&
6754 "Expected the locations vector to contain the IV");
6755 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6756 // modified to account for the locations already in the destination vector.
6757 // All builders contain the IV as the first location op.
6758 assert(!LocationOps.empty() &&
6759 "Expected the location ops to contain the IV.");
6760 // DestIndexMap[n] contains the index in DestLocations for the nth
6761 // location in this SCEVDbgValueBuilder.
6762 SmallVector<uint64_t, 2> DestIndexMap;
6763 for (const auto &Op : LocationOps) {
6764 auto It = find(DestLocations, Op);
6765 if (It != DestLocations.end()) {
6766 // Location already exists in DestLocations, reuse existing ArgIndex.
6767 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6768 continue;
6769 }
6770 // Location is not in DestLocations, add it.
6771 DestIndexMap.push_back(DestLocations.size());
6772 DestLocations.push_back(Op);
6773 }
6774
6775 for (const auto &Op : expr_ops()) {
6776 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6777 Op.appendToVector(DestExpr);
6778 continue;
6779 }
6780
6782 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6783 // DestIndexMap[n] contains its new index in DestLocations.
6784 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6785 DestExpr.push_back(NewIndex);
6786 }
6787 }
6788};
6789
6790/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6791/// and DIExpression.
6792struct DVIRecoveryRec {
6793 DVIRecoveryRec(DbgVariableRecord *DVR)
6794 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6795
6796 DbgVariableRecord *DbgRef;
6797 DIExpression *Expr;
6798 bool HadLocationArgList;
6799 SmallVector<WeakVH, 2> LocationOps;
6802
6803 void clear() {
6804 for (auto &RE : RecoveryExprs)
6805 RE.reset();
6806 RecoveryExprs.clear();
6807 }
6808
6809 ~DVIRecoveryRec() { clear(); }
6810};
6811} // namespace
6812
6813/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6814/// This helps in determining if a DIArglist is necessary or can be omitted from
6815/// the dbg.value.
6817 auto expr_ops = ToDwarfOpIter(Expr);
6818 unsigned Count = 0;
6819 for (auto Op : expr_ops)
6820 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6821 Count++;
6822 return Count;
6823}
6824
6825/// Overwrites DVI with the location and Ops as the DIExpression. This will
6826/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6827/// because a DIArglist is not created for the first argument of the dbg.value.
6828template <typename T>
6829static void updateDVIWithLocation(T &DbgVal, Value *Location,
6831 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6832 "contain any DW_OP_llvm_arg operands.");
6833 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6834 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6835 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6836}
6837
6838/// Overwrite DVI with locations placed into a DIArglist.
6839template <typename T>
6840static void updateDVIWithLocations(T &DbgVal,
6841 SmallVectorImpl<Value *> &Locations,
6843 assert(numLLVMArgOps(Ops) != 0 &&
6844 "Expected expression that references DIArglist locations using "
6845 "DW_OP_llvm_arg operands.");
6847 for (Value *V : Locations)
6848 MetadataLocs.push_back(ValueAsMetadata::get(V));
6849 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6850 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6851 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6852}
6853
6854/// Write the new expression and new location ops for the dbg.value. If possible
6855/// reduce the szie of the dbg.value by omitting DIArglist. This
6856/// can be omitted if:
6857/// 1. There is only a single location, refenced by a single DW_OP_llvm_arg.
6858/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6859static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6860 SmallVectorImpl<Value *> &NewLocationOps,
6862 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6863 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6864 if (NumLLVMArgs == 0) {
6865 // Location assumed to be on the stack.
6866 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6867 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6868 // There is only a single DW_OP_llvm_arg at the start of the expression,
6869 // so it can be omitted along with DIArglist.
6870 assert(NewExpr[1] == 0 &&
6871 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6873 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6874 } else {
6875 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6876 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6877 }
6878
6879 // If the DIExpression was previously empty then add the stack terminator.
6880 // Non-empty expressions have only had elements inserted into them and so
6881 // the terminator should already be present e.g. stack_value or fragment.
6882 DIExpression *SalvageExpr = DbgVal->getExpression();
6883 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6884 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6885 DbgVal->setExpression(SalvageExpr);
6886 }
6887}
6888
6889/// Cached location ops may be erased during LSR, in which case a poison is
6890/// required when restoring from the cache. The type of that location is no
6891/// longer available, so just use int8. The poison will be replaced by one or
6892/// more locations later when a SCEVDbgValueBuilder selects alternative
6893/// locations to use for the salvage.
6895 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6896}
6897
6898/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6899static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6900 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6901 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6902 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6903 assert(DVIRec.Expr && "Expected an expression");
6904 DbgVal->setExpression(DVIRec.Expr);
6905
6906 // Even a single location-op may be inside a DIArgList and referenced with
6907 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6908 if (!DVIRec.HadLocationArgList) {
6909 assert(DVIRec.LocationOps.size() == 1 &&
6910 "Unexpected number of location ops.");
6911 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6912 // this case was not present before, so force the location back to a
6913 // single uncontained Value.
6914 Value *CachedValue =
6915 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6916 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6917 } else {
6919 for (WeakVH VH : DVIRec.LocationOps) {
6920 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6921 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6922 }
6923 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6924 DbgVal->setRawLocation(
6925 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6926 }
6927 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6928}
6929
6931 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6932 const SCEV *SCEVInductionVar,
6933 SCEVDbgValueBuilder IterCountExpr) {
6934
6935 if (!DVIRec.DbgRef->isKillLocation())
6936 return false;
6937
6938 // LSR may have caused several changes to the dbg.value in the failed salvage
6939 // attempt. So restore the DIExpression, the location ops and also the
6940 // location ops format, which is always DIArglist for multiple ops, but only
6941 // sometimes for a single op.
6943
6944 // LocationOpIndexMap[i] will store the post-LSR location index of
6945 // the non-optimised out location at pre-LSR index i.
6946 SmallVector<int64_t, 2> LocationOpIndexMap;
6947 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6948 SmallVector<Value *, 2> NewLocationOps;
6949 NewLocationOps.push_back(LSRInductionVar);
6950
6951 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6952 WeakVH VH = DVIRec.LocationOps[i];
6953 // Place the locations not optimised out in the list first, avoiding
6954 // inserts later. The map is used to update the DIExpression's
6955 // DW_OP_LLVM_arg arguments as the expression is updated.
6956 if (VH && !isa<UndefValue>(VH)) {
6957 NewLocationOps.push_back(VH);
6958 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6959 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6960 << " now at index " << LocationOpIndexMap[i] << "\n");
6961 continue;
6962 }
6963
6964 // It's possible that a value referred to in the SCEV may have been
6965 // optimised out by LSR.
6966 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6967 SE.containsUndefs(DVIRec.SCEVs[i])) {
6968 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6969 << " refers to a location that is now undef or erased. "
6970 "Salvage abandoned.\n");
6971 return false;
6972 }
6973
6974 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6975 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6976
6977 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6978 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6979
6980 // Create an offset-based salvage expression if possible, as it requires
6981 // less DWARF ops than an iteration count-based expression.
6982 if (std::optional<APInt> Offset =
6983 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6984 if (Offset->getSignificantBits() <= 64)
6985 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6986 else
6987 return false;
6988 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6989 SE))
6990 return false;
6991 }
6992
6993 // Merge the DbgValueBuilder generated expressions and the original
6994 // DIExpression, place the result into an new vector.
6996 if (DVIRec.Expr->getNumElements() == 0) {
6997 assert(DVIRec.RecoveryExprs.size() == 1 &&
6998 "Expected only a single recovery expression for an empty "
6999 "DIExpression.");
7000 assert(DVIRec.RecoveryExprs[0] &&
7001 "Expected a SCEVDbgSalvageBuilder for location 0");
7002 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
7003 B->appendToVectors(NewExpr, NewLocationOps);
7004 }
7005 for (const auto &Op : DVIRec.Expr->expr_ops()) {
7006 // Most Ops needn't be updated.
7007 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
7008 Op.appendToVector(NewExpr);
7009 continue;
7010 }
7011
7012 uint64_t LocationArgIndex = Op.getArg(0);
7013 SCEVDbgValueBuilder *DbgBuilder =
7014 DVIRec.RecoveryExprs[LocationArgIndex].get();
7015 // The location doesn't have s SCEVDbgValueBuilder, so LSR did not
7016 // optimise it away. So just translate the argument to the updated
7017 // location index.
7018 if (!DbgBuilder) {
7019 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
7020 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
7021 "Expected a positive index for the location-op position.");
7022 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
7023 continue;
7024 }
7025 // The location has a recovery expression.
7026 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
7027 }
7028
7029 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
7030 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
7031 return true;
7032}
7033
7034/// Obtain an expression for the iteration count, then attempt to salvage the
7035/// dbg.value intrinsics.
7037 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
7038 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
7039 if (DVIToUpdate.empty())
7040 return;
7041
7042 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
7043 assert(SCEVInductionVar &&
7044 "Anticipated a SCEV for the post-LSR induction variable");
7045
7046 if (const SCEVAddRecExpr *IVAddRec =
7047 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
7048 if (!IVAddRec->isAffine())
7049 return;
7050
7051 // Prevent translation using excessive resources.
7052 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
7053 return;
7054
7055 // The iteration count is required to recover location values.
7056 SCEVDbgValueBuilder IterCountExpr;
7057 IterCountExpr.pushLocation(LSRInductionVar);
7058 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
7059 return;
7060
7061 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
7062 << '\n');
7063
7064 for (auto &DVIRec : DVIToUpdate) {
7065 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
7066 IterCountExpr);
7067 }
7068 }
7069}
7070
7071/// Identify and cache salvageable DVI locations and expressions along with the
7072/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
7073/// cacheing and salvaging.
7075 Loop *L, ScalarEvolution &SE,
7076 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
7077 for (const auto &B : L->getBlocks()) {
7078 for (auto &I : *B) {
7079 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
7080 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
7081 continue;
7082
7083 // Ensure that if any location op is undef that the dbg.vlue is not
7084 // cached.
7085 if (DbgVal.isKillLocation())
7086 continue;
7087
7088 // Check that the location op SCEVs are suitable for translation to
7089 // DIExpression.
7090 const auto &HasTranslatableLocationOps =
7091 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
7092 for (const auto LocOp : DbgValToTranslate.location_ops()) {
7093 if (!LocOp)
7094 return false;
7095
7096 if (!SE.isSCEVable(LocOp->getType()))
7097 return false;
7098
7099 const SCEV *S = SE.getSCEV(LocOp);
7100 if (SE.containsUndefs(S))
7101 return false;
7102 }
7103 return true;
7104 };
7105
7106 if (!HasTranslatableLocationOps(DbgVal))
7107 continue;
7108
7109 std::unique_ptr<DVIRecoveryRec> NewRec =
7110 std::make_unique<DVIRecoveryRec>(&DbgVal);
7111 // Each location Op may need a SCEVDbgValueBuilder in order to recover
7112 // it. Pre-allocating a vector will enable quick lookups of the builder
7113 // later during the salvage.
7114 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
7115 for (const auto LocOp : DbgVal.location_ops()) {
7116 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7117 NewRec->LocationOps.push_back(LocOp);
7118 NewRec->HadLocationArgList = DbgVal.hasArgList();
7119 }
7120 SalvageableDVISCEVs.push_back(std::move(NewRec));
7121 }
7122 }
7123 }
7124}
7125
7126/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7127/// any PHi from the loop header is usable, but may have less chance of
7128/// surviving subsequent transforms.
7130 const LSRInstance &LSR) {
7131
7132 auto IsSuitableIV = [&](PHINode *P) {
7133 if (!SE.isSCEVable(P->getType()))
7134 return false;
7135 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7136 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7137 return false;
7138 };
7139
7140 // For now, just pick the first IV that was generated and inserted by
7141 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7142 // by subsequent transforms.
7143 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7144 if (!IV)
7145 continue;
7146
7147 // There should only be PHI node IVs.
7148 PHINode *P = cast<PHINode>(&*IV);
7149
7150 if (IsSuitableIV(P))
7151 return P;
7152 }
7153
7154 for (PHINode &P : L.getHeader()->phis()) {
7155 if (IsSuitableIV(&P))
7156 return &P;
7157 }
7158 return nullptr;
7159}
7160
7162 DominatorTree &DT, LoopInfo &LI,
7163 const TargetTransformInfo &TTI,
7165 MemorySSA *MSSA) {
7166
7167 // Debug preservation - before we start removing anything identify which DVI
7168 // meet the salvageable criteria and store their DIExpression and SCEVs.
7169 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7170 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7171
7172 bool Changed = false;
7173 std::unique_ptr<MemorySSAUpdater> MSSAU;
7174 if (MSSA)
7175 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7176
7177 // Run the main LSR transformation.
7178 const LSRInstance &Reducer =
7179 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7180 Changed |= Reducer.getChanged();
7181
7182 // Remove any extra phis created by processing inner loops.
7183 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7184 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7186 SCEVExpander Rewriter(SE, "lsr", false);
7187#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7188 Rewriter.setDebugType(DEBUG_TYPE);
7189#endif
7190 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7191 Rewriter.clear();
7192 if (numFolded) {
7193 Changed = true;
7195 MSSAU.get());
7196 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7197 }
7198 }
7199 // LSR may at times remove all uses of an induction variable from a loop.
7200 // The only remaining use is the PHI in the exit block.
7201 // When this is the case, if the exit value of the IV can be calculated using
7202 // SCEV, we can replace the exit block PHI with the final value of the IV and
7203 // skip the updates in each loop iteration.
7204 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7206 SCEVExpander Rewriter(SE, "lsr", true);
7207 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7208 UnusedIndVarInLoop, DeadInsts);
7209 Rewriter.clear();
7210 if (Rewrites) {
7211 Changed = true;
7213 MSSAU.get());
7214 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7215 }
7216 }
7217
7218 if (SalvageableDVIRecords.empty())
7219 return Changed;
7220
7221 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7222 // expressions composed using the derived iteration count.
7223 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7224 for (const auto &L : LI) {
7225 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7226 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7227 else {
7228 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7229 "could not be identified.\n");
7230 }
7231 }
7232
7233 for (auto &Rec : SalvageableDVIRecords)
7234 Rec->clear();
7235 SalvageableDVIRecords.clear();
7236 return Changed;
7237}
7238
7239bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7240 if (skipLoop(L))
7241 return false;
7242
7243 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7244 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7245 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7246 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7247 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7248 *L->getHeader()->getParent());
7249 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7250 *L->getHeader()->getParent());
7251 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7252 *L->getHeader()->getParent());
7253 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7254 MemorySSA *MSSA = nullptr;
7255 if (MSSAAnalysis)
7256 MSSA = &MSSAAnalysis->getMSSA();
7257 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7258}
7259
7262 LPMUpdater &) {
7263 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7264 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7265 return PreservedAnalyses::all();
7266
7267 auto PA = getLoopPassPreservedAnalyses();
7268 if (AR.MSSA)
7269 PA.preserve<MemorySSAAnalysis>();
7270 return PA;
7271}
7272
7273char LoopStrengthReduce::ID = 0;
7274
7275INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7276 "Loop Strength Reduction", false, false)
7282INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7283INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7284 "Loop Strength Reduction", false, false)
7285
7286Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:661
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< SCEVUse > &Good, SmallVectorImpl< SCEVUse > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE, bool PreferScalable=false)
If S involves the addition of a constant integer value, return that integer value,...
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static Immediate ExtractImmediateOperand(MutableArrayRef< SCEVUse > Ops, ScalarEvolution &SE, bool PreferScalable)
Extracts an immediate operand from Ops and replaces the operand with zero.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static GlobalValue * ExtractSymbol(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1670
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1554
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1771
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:289
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
Value * getCondition() const
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:36
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:55
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:49
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:187
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:143
iterator end()
Definition IVUsers.h:145
iterator begin()
Definition IVUsers.h:144
bool empty() const
Definition IVUsers.h:148
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that InsertPos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
LLVM_ABI PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< SCEVUse > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
static constexpr auto FlagAnyWrap
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAddRecExpr(SCEVUse Start, SCEVUse Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and return the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:237
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Use * op_iterator
Definition User.h:254
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
iterator_range< use_iterator > uses()
Definition Value.h:380
A Value handle that is nullable.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
match_bind< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1690
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most, stopping at the first 1.
Definition bit.h:204
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2011
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:550
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
@ UnusedIndVarInLoop
Definition LoopUtils.h:600
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
SCEVUseT< const SCEV * > SCEVUse
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.