LLVM 23.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
60#include "llvm/ADT/STLExtras.h"
61#include "llvm/ADT/SetVector.h"
64#include "llvm/ADT/SmallSet.h"
66#include "llvm/ADT/Statistic.h"
84#include "llvm/IR/BasicBlock.h"
85#include "llvm/IR/Constant.h"
86#include "llvm/IR/Constants.h"
89#include "llvm/IR/Dominators.h"
90#include "llvm/IR/GlobalValue.h"
91#include "llvm/IR/IRBuilder.h"
92#include "llvm/IR/InstrTypes.h"
93#include "llvm/IR/Instruction.h"
96#include "llvm/IR/Module.h"
97#include "llvm/IR/Operator.h"
98#include "llvm/IR/Type.h"
99#include "llvm/IR/Use.h"
100#include "llvm/IR/User.h"
101#include "llvm/IR/Value.h"
102#include "llvm/IR/ValueHandle.h"
104#include "llvm/Pass.h"
105#include "llvm/Support/Casting.h"
108#include "llvm/Support/Debug.h"
118#include <algorithm>
119#include <cassert>
120#include <cstddef>
121#include <cstdint>
122#include <iterator>
123#include <limits>
124#include <map>
125#include <numeric>
126#include <optional>
127#include <utility>
128
129using namespace llvm;
130using namespace SCEVPatternMatch;
131
132#define DEBUG_TYPE "loop-reduce"
133
/// Upper bound on the number of IV users LSR will consider before bailing out
/// early. The value is arbitrary and far beyond the number of users LSR can
/// conceivably solve, so it should not affect generated code; it only catches
/// the worst cases before LSR burns too much compile time and stack space.
static constexpr unsigned MaxIVUsers = 200;

/// Cap on the size of an expression that SCEV-based salvaging will attempt to
/// translate into a DIExpression. Chosen so that debug info is not excessively
/// increased and salvaging stays affordable for the compiler.
static constexpr unsigned MaxSCEVSalvageExpressionSize = 64;
145
// NOTE(review): in this listing the opening line of every option definition
// below (the line that declares the variable and its option type) is not
// visible; only the argument lists are shown. Check the complete file before
// editing any of these definitions.

146// Cleanup congruent phis after LSR phi expansion.
148 "enable-lsr-phielim", cl::Hidden, cl::init(true),
149 cl::desc("Enable LSR phi elimination"));
150
151// The flag adds instruction count to the comparison of solution costs.
153 "lsr-insns-cost", cl::Hidden, cl::init(true),
154 cl::desc("Add instruction count to a LSR cost model"));
155
156// Flag to choose how to narrow a complex LSR solution.
158 "lsr-exp-narrow", cl::Hidden, cl::init(false),
159 cl::desc("Narrow LSR complex solution using"
160 " expectation of registers number"));
161
162// Flag to narrow search space by filtering non-optimal formulae with
163// the same ScaledReg and Scale.
165 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
166 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
167 " with the same ScaledReg and Scale"));
168
// Override for the target's preferred addressing mode; defaults to
// TTI::AMK_None.
170 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
171 cl::desc("A flag that overrides the target's preferred addressing mode."),
173 clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
174 clEnumValN(TTI::AMK_PreIndexed, "preindexed",
175 "Prefer pre-indexed addressing mode"),
176 clEnumValN(TTI::AMK_PostIndexed, "postindexed",
177 "Prefer post-indexed addressing mode"),
178 clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
179
// Search-space complexity limit; defaults to the maximum uint16_t value.
181 "lsr-complexity-limit", cl::Hidden,
182 cl::init(std::numeric_limits<uint16_t>::max()),
183 cl::desc("LSR search space complexity limit"));
184
// Recursion depth limit for the setup-cost computation; defaults to 7.
186 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
187 cl::desc("The limit on recursion depth for LSRs setup cost"));
188
// No cl::init is given for this option in the visible lines.
190 "lsr-drop-solution", cl::Hidden,
191 cl::desc("Attempt to drop solution if it is less profitable"));
192
194 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
195 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
196
198 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
199 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
200
201#ifndef NDEBUG
202// Stress test IV chain generation.
204 "stress-ivchain", cl::Hidden, cl::init(false),
205 cl::desc("Stress test LSR IV chains"));
206#else
// When NDEBUG is defined the stress mode is a plain constant-false flag
// instead of a command-line option.
207static bool StressIVChain = false;
208#endif
209
210namespace {
211
212struct MemAccessTy {
213 /// Used in situations where the accessed memory type is unknown.
214 static const unsigned UnknownAddressSpace =
215 std::numeric_limits<unsigned>::max();
216
217 Type *MemTy = nullptr;
218 unsigned AddrSpace = UnknownAddressSpace;
219
220 MemAccessTy() = default;
221 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
222
223 bool operator==(MemAccessTy Other) const {
224 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
225 }
226
227 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
228
229 static MemAccessTy getUnknown(LLVMContext &Ctx,
230 unsigned AS = UnknownAddressSpace) {
231 return MemAccessTy(Type::getVoidTy(Ctx), AS);
232 }
233
234 Type *getType() { return MemTy; }
235};
236
237/// This class holds data which is used to order reuse candidates.
238class RegSortData {
239public:
240 /// This represents the set of LSRUse indices which reference
241 /// a particular register.
242 SmallBitVector UsedByIndices;
243
244 void print(raw_ostream &OS) const;
245 void dump() const;
246};
247
248// An offset from an address that is either scalable or fixed. Used for
249// per-target optimizations of addressing modes.
250class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
251 constexpr Immediate(ScalarTy MinVal, bool Scalable)
252 : FixedOrScalableQuantity(MinVal, Scalable) {}
253
254 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
255 : FixedOrScalableQuantity(V) {}
256
257public:
258 constexpr Immediate() = delete;
259
260 static constexpr Immediate getFixed(ScalarTy MinVal) {
261 return {MinVal, false};
262 }
263 static constexpr Immediate getScalable(ScalarTy MinVal) {
264 return {MinVal, true};
265 }
266 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
267 return {MinVal, Scalable};
268 }
269 static constexpr Immediate getZero() { return {0, false}; }
270 static constexpr Immediate getFixedMin() {
271 return {std::numeric_limits<int64_t>::min(), false};
272 }
273 static constexpr Immediate getFixedMax() {
274 return {std::numeric_limits<int64_t>::max(), false};
275 }
276 static constexpr Immediate getScalableMin() {
277 return {std::numeric_limits<int64_t>::min(), true};
278 }
279 static constexpr Immediate getScalableMax() {
280 return {std::numeric_limits<int64_t>::max(), true};
281 }
282
283 constexpr bool isLessThanZero() const { return Quantity < 0; }
284
285 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
286
287 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
288 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
289 }
290
291 constexpr bool isMin() const {
292 return Quantity == std::numeric_limits<ScalarTy>::min();
293 }
294
295 constexpr bool isMax() const {
296 return Quantity == std::numeric_limits<ScalarTy>::max();
297 }
298
299 // Arithmetic 'operators' that cast to unsigned types first.
300 constexpr Immediate addUnsigned(const Immediate &RHS) const {
301 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
302 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
303 return {Value, Scalable || RHS.isScalable()};
304 }
305
306 constexpr Immediate subUnsigned(const Immediate &RHS) const {
307 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
308 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
309 return {Value, Scalable || RHS.isScalable()};
310 }
311
312 // Scale the quantity by a constant without caring about runtime scalability.
313 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
314 ScalarTy Value = (uint64_t)Quantity * RHS;
315 return {Value, Scalable};
316 }
317
318 // Helpers for generating SCEVs with vscale terms where needed.
319 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
320 const SCEV *S = SE.getConstant(Ty, Quantity);
321 if (Scalable)
322 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
323 return S;
324 }
325
326 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
327 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
328 if (Scalable)
329 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
330 return NegS;
331 }
332
333 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
334 // TODO: Avoid implicit trunc?
335 // See https://github.com/llvm/llvm-project/issues/112510.
336 const SCEV *SU = SE.getUnknown(
337 ConstantInt::getSigned(Ty, Quantity, /*ImplicitTrunc=*/true));
338 if (Scalable)
339 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
340 return SU;
341 }
342};
343
344// This is needed for the Compare type of std::map when Immediate is used
345// as a key. We don't need it to be fully correct against any value of vscale,
346// just to make sure that vscale-related terms in the map are considered against
347// each other rather than being mixed up and potentially missing opportunities.
348struct KeyOrderTargetImmediate {
349 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
350 if (LHS.isScalable() && !RHS.isScalable())
351 return false;
352 if (!LHS.isScalable() && RHS.isScalable())
353 return true;
354 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
355 }
356};
357
358// This would be nicer if we could be generic instead of directly using size_t,
359// but there doesn't seem to be a type trait for is_orderable or
360// is_lessthan_comparable or similar.
361struct KeyOrderSizeTAndImmediate {
362 bool operator()(const std::pair<size_t, Immediate> &LHS,
363 const std::pair<size_t, Immediate> &RHS) const {
364 size_t LSize = LHS.first;
365 size_t RSize = RHS.first;
366 if (LSize != RSize)
367 return LSize < RSize;
368 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
369 }
370};
371} // end anonymous namespace
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374void RegSortData::print(raw_ostream &OS) const {
375 OS << "[NumUses=" << UsedByIndices.count() << ']';
376}
377
378LLVM_DUMP_METHOD void RegSortData::dump() const {
379 print(errs()); errs() << '\n';
380}
381#endif
382
383namespace {
384
// NOTE(review): this listing does not show the declaration of RegSequence or
// of the iterator / const_iterator aliases used below; they are presumably on
// lines missing from this view. Confirm against the complete file.
385/// Map register candidates to information about how they are used.
386class RegUseTracker {
387 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
388
 // Per-register use information, keyed by the register's SCEV.
389 RegUsesTy RegUsesMap;
391
392public:
 // Record that the use with index LUIdx references Reg.
393 void countRegister(const SCEV *Reg, size_t LUIdx);
 // Clear the record that the use with index LUIdx references Reg. Reg must
 // already be tracked.
394 void dropRegister(const SCEV *Reg, size_t LUIdx);
 // For every tracked register, move the bit of use LastLUIdx into slot LUIdx
 // and shrink the bit vector to at most LastLUIdx bits.
395 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
396
 // Return true if Reg is referenced by at least one use whose index is not
 // LUIdx.
397 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
398
 // Return the set of use indices that reference Reg. Reg must be tracked.
399 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
400
 // Forget all tracked registers.
401 void clear();
402
405
 // Iteration over RegSequence, to which countRegister appends each register
 // the first time it is seen.
406 iterator begin() { return RegSequence.begin(); }
407 iterator end() { return RegSequence.end(); }
408 const_iterator begin() const { return RegSequence.begin(); }
409 const_iterator end() const { return RegSequence.end(); }
410};
411
412} // end anonymous namespace
413
414void
415RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
416 std::pair<RegUsesTy::iterator, bool> Pair = RegUsesMap.try_emplace(Reg);
417 RegSortData &RSD = Pair.first->second;
418 if (Pair.second)
419 RegSequence.push_back(Reg);
420 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
421 RSD.UsedByIndices.set(LUIdx);
422}
423
424void
425RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
426 RegUsesTy::iterator It = RegUsesMap.find(Reg);
427 assert(It != RegUsesMap.end());
428 RegSortData &RSD = It->second;
429 assert(RSD.UsedByIndices.size() > LUIdx);
430 RSD.UsedByIndices.reset(LUIdx);
431}
432
433void
434RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
435 assert(LUIdx <= LastLUIdx);
436
437 // Update RegUses. The data structure is not optimized for this purpose;
438 // we must iterate through it and update each of the bit vectors.
439 for (auto &Pair : RegUsesMap) {
440 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
441 if (LUIdx < UsedByIndices.size())
442 UsedByIndices[LUIdx] =
443 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
444 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
445 }
446}
447
448bool
449RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
450 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
451 if (I == RegUsesMap.end())
452 return false;
453 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
454 int i = UsedByIndices.find_first();
455 if (i == -1) return false;
456 if ((size_t)i != LUIdx) return true;
457 return UsedByIndices.find_next(i) != -1;
458}
459
460const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
461 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
462 assert(I != RegUsesMap.end() && "Unknown register!");
463 return I->second.UsedByIndices;
464}
465
466void RegUseTracker::clear() {
467 RegUsesMap.clear();
468 RegSequence.clear();
469}
470
471namespace {
472
473/// This class holds information that describes a formula for computing a
474/// value that satisfies a use. It may include broken-out immediates and
/// scaled registers.
475struct Formula {
476 /// Global base address used for complex addressing.
477 GlobalValue *BaseGV = nullptr;
478
479 /// Base offset for complex addressing.
480 Immediate BaseOffset = Immediate::getZero();
481
482 /// Whether any complex addressing has a base register.
483 bool HasBaseReg = false;
484
485 /// The scale of any complex addressing.
486 int64_t Scale = 0;
487
488 /// The list of "base" registers for this use. When this is non-empty. The
489 /// canonical representation of a formula is
490 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
491 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
492 /// 3. The reg containing recurrent expr related with current loop in the
493 /// formula should be put in the ScaledReg.
494 /// #1 enforces that the scaled register is always used when at least two
495 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
496 /// #2 enforces that 1 * reg is reg.
497 /// #3 ensures invariant regs with respect to current loop can be combined
498 /// together in LSR codegen.
499 /// This invariant can be temporarily broken while building a formula.
500 /// However, every formula inserted into the LSRInstance must be in canonical
501 /// form.
 // NOTE(review): the BaseRegs declaration that the comment above documents is
 // not visible in this listing; confirm against the complete file.
503
504 /// The 'scaled' register for this use. This should be non-null when Scale is
505 /// not zero.
506 const SCEV *ScaledReg = nullptr;
507
508 /// An additional constant offset which is added near the use. This requires
509 /// a temporary register, but the offset itself can live in an add immediate
510 /// field rather than a register.
511 Immediate UnfoldedOffset = Immediate::getZero();
512
513 Formula() = default;
514
515 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
516
517 bool isCanonical(const Loop &L) const;
518
519 void canonicalize(const Loop &L);
520
521 bool unscale();
522
523 bool hasZeroEnd() const;
524
525 bool countsDownToZero() const;
526
527 size_t getNumRegs() const;
528 Type *getType() const;
529
530 void deleteBaseReg(const SCEV *&S);
531
532 bool referencesReg(const SCEV *S) const;
533 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
534 const RegUseTracker &RegUses) const;
535
536 void print(raw_ostream &OS) const;
537 void dump() const;
538};
539
540} // end anonymous namespace
541
542/// Recursion helper for initialMatch. Splits \p S into pieces and appends
/// each piece to either Good (expressions that properly dominate the header
/// of \p L) or Bad (everything that could not be decomposed further).
// NOTE(review): part of the parameter list (the Good, Bad and SE parameters)
// and the declarations of the locals Ops, MyGood and MyBad are not visible in
// this listing; confirm against the complete file.
543static void DoInitialMatch(const SCEV *S, Loop *L,
546 // Collect expressions which properly dominate the loop header.
547 if (SE.properlyDominates(S, L->getHeader())) {
548 Good.push_back(S);
549 return;
550 }
551
552 // Look at add operands: each operand is classified on its own.
553 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
554 for (const SCEV *S : Add->operands())
555 DoInitialMatch(S, L, Good, Bad, SE);
556 return;
557 }
558
559 // Look at addrec operands: an affine addrec with a non-zero start is split
 // into its start and a zero-based addrec with the same step and loop.
560 const SCEV *Start, *Step;
561 const Loop *ARLoop;
562 if (match(S,
563 m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step), m_Loop(ARLoop))) &&
564 !Start->isZero()) {
565 DoInitialMatch(Start, L, Good, Bad, SE);
566 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(S->getType(), 0), Step,
567 // FIXME: AR->getNoWrapFlags()
568 ARLoop, SCEV::FlagAnyWrap),
569 L, Good, Bad, SE);
570 return;
571 }
572
573 // Handle a multiplication by -1 (negation) if it didn't fold.
574 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
575 if (Mul->getOperand(0)->isAllOnesValue()) {
577 const SCEV *NewMul = SE.getMulExpr(Ops);
578
 // Classify NewMul into local lists, then re-apply the negation to every
 // piece before handing it to the caller's lists.
581 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
582 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
583 SE.getEffectiveSCEVType(NewMul->getType())));
584 for (const SCEV *S : MyGood)
585 Good.push_back(SE.getMulExpr(NegOne, S));
586 for (const SCEV *S : MyBad)
587 Bad.push_back(SE.getMulExpr(NegOne, S));
588 return;
589 }
590
591 // Ok, we can't do anything interesting. Just stuff the whole thing into a
592 // register and hope for the best.
593 Bad.push_back(S);
594}
595
596/// Incorporate loop-variant parts of S into this Formula, attempting to keep
597/// all loop-invariant and loop-computable values in a single base register.
// NOTE(review): the declarations of the locals Good and Bad are not visible in
// this listing; confirm against the complete file.
598void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
601 DoInitialMatch(S, L, Good, Bad, SE);
 // The Good pieces are summed into one base register. HasBaseReg is set
 // whenever the list is non-empty, even if the sum folds to zero and no
 // register is pushed.
602 if (!Good.empty()) {
603 const SCEV *Sum = SE.getAddExpr(Good);
604 if (!Sum->isZero())
605 BaseRegs.push_back(Sum);
606 HasBaseReg = true;
607 }
 // The Bad pieces are handled the same way, as a second base register.
608 if (!Bad.empty()) {
609 const SCEV *Sum = SE.getAddExpr(Bad);
610 if (!Sum->isZero())
611 BaseRegs.push_back(Sum);
612 HasBaseReg = true;
613 }
614 canonicalize(*L);
615}
616
617static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
618 return SCEVExprContains(S, [&L](const SCEV *S) {
619 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
620 });
621}
622
623/// Check whether or not this formula satisfies the canonical
624/// representation.
625/// \see Formula::BaseRegs.
// NOTE(review): the body of the lambda passed to none_of at the end of this
// function is not visible in this listing; confirm against the complete file.
626bool Formula::isCanonical(const Loop &L) const {
627 assert((Scale == 0 || ScaledReg) &&
628 "ScaledReg must be non-null if Scale is non-zero");
629
 // Without a scaled register at most one base register is allowed.
630 if (!ScaledReg)
631 return BaseRegs.size() <= 1;
632
633 if (Scale != 1)
634 return true;
635
 // From here on Scale is 1: a lone 1*reg must be written as a base register.
636 if (Scale == 1 && BaseRegs.empty())
637 return false;
638
639 if (containsAddRecDependentOnLoop(ScaledReg, L))
640 return true;
641
642 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
643 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
644 // loop, we want to swap the reg in BaseRegs with ScaledReg.
645 return none_of(BaseRegs, [&L](const SCEV *S) {
647 });
648}
649
650/// Helper method to morph a formula into its canonical representation.
651/// \see Formula::BaseRegs.
652/// Every formula having more than one base register, must use the ScaledReg
653/// field. Otherwise, we would have to do special cases everywhere in LSR
654/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
655/// On the other hand, 1*reg should be canonicalized into reg.
// NOTE(review): the body of the lambda passed to find_if below is not visible
// in this listing; confirm against the complete file.
656void Formula::canonicalize(const Loop &L) {
657 if (isCanonical(L))
658 return;
659
660 if (BaseRegs.empty()) {
661 // No base reg? Use scale reg with scale = 1 as such.
662 assert(ScaledReg && "Expected 1*reg => reg");
663 assert(Scale == 1 && "Expected 1*reg => reg");
664 BaseRegs.push_back(ScaledReg);
665 Scale = 0;
666 ScaledReg = nullptr;
667 return;
668 }
669
670 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
 // With no scaled register yet, the last base register becomes 1*reg.
671 if (!ScaledReg) {
672 ScaledReg = BaseRegs.pop_back_val();
673 Scale = 1;
674 }
675
676 // If ScaledReg is an invariant with respect to L, find the reg from
677 // BaseRegs containing the recurrent expr related with Loop L. Swap the
678 // reg with ScaledReg.
679 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
680 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
682 });
683 if (I != BaseRegs.end())
684 std::swap(ScaledReg, *I);
685 }
686 assert(isCanonical(L) && "Failed to canonicalize?");
687}
688
689/// Get rid of the scale in the formula.
690/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
691/// \return true if it was possible to get rid of the scale, false otherwise.
692/// \note After this operation the formula may not be in the canonical form.
693bool Formula::unscale() {
694 if (Scale != 1)
695 return false;
696 Scale = 0;
697 BaseRegs.push_back(ScaledReg);
698 ScaledReg = nullptr;
699 return true;
700}
701
702bool Formula::hasZeroEnd() const {
703 if (UnfoldedOffset || BaseOffset)
704 return false;
705 if (BaseRegs.size() != 1 || ScaledReg)
706 return false;
707 return true;
708}
709
710bool Formula::countsDownToZero() const {
711 if (!hasZeroEnd())
712 return false;
713 assert(BaseRegs.size() == 1 && "hasZeroEnd should mean one BaseReg");
714 const APInt *StepInt;
715 if (!match(BaseRegs[0], m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt))))
716 return false;
717 return StepInt->isNegative();
718}
719
720/// Return the total number of register operands used by this formula. This does
721/// not include register uses implied by non-constant addrec strides.
722size_t Formula::getNumRegs() const {
723 return !!ScaledReg + BaseRegs.size();
724}
725
726/// Return the type of this formula, if it has one, or null otherwise. This type
727/// is meaningless except for the bit size.
728Type *Formula::getType() const {
729 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
730 ScaledReg ? ScaledReg->getType() :
731 BaseGV ? BaseGV->getType() :
732 nullptr;
733}
734
735/// Delete the given base reg from the BaseRegs list.
736void Formula::deleteBaseReg(const SCEV *&S) {
737 if (&S != &BaseRegs.back())
738 std::swap(S, BaseRegs.back());
739 BaseRegs.pop_back();
740}
741
742/// Test if this formula references the given register.
743bool Formula::referencesReg(const SCEV *S) const {
744 return S == ScaledReg || is_contained(BaseRegs, S);
745}
746
747/// Test whether this formula uses registers which are used by uses other than
748/// the use with the given index.
749bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
750 const RegUseTracker &RegUses) const {
751 if (ScaledReg)
752 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
753 return true;
754 for (const SCEV *BaseReg : BaseRegs)
755 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
756 return true;
757 return false;
758}
759
760#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
761void Formula::print(raw_ostream &OS) const {
762 ListSeparator Plus(" + ");
763 if (BaseGV) {
764 OS << Plus;
765 BaseGV->printAsOperand(OS, /*PrintType=*/false);
766 }
767 if (BaseOffset.isNonZero())
768 OS << Plus << BaseOffset;
769
770 for (const SCEV *BaseReg : BaseRegs)
771 OS << Plus << "reg(" << *BaseReg << ')';
772
773 if (HasBaseReg && BaseRegs.empty())
774 OS << Plus << "**error: HasBaseReg**";
775 else if (!HasBaseReg && !BaseRegs.empty())
776 OS << Plus << "**error: !HasBaseReg**";
777
778 if (Scale != 0) {
779 OS << Plus << Scale << "*reg(";
780 if (ScaledReg)
781 OS << *ScaledReg;
782 else
783 OS << "<unknown>";
784 OS << ')';
785 }
786 if (UnfoldedOffset.isNonZero())
787 OS << Plus << "imm(" << UnfoldedOffset << ')';
788}
789
790LLVM_DUMP_METHOD void Formula::dump() const {
791 print(errs()); errs() << '\n';
792}
793#endif
794
795/// Return true if the given addrec can be sign-extended without changing its
796/// value. As written, this holds when the sign-extension of AR to WideTy is
/// still an SCEVAddRecExpr.
// NOTE(review): the function signature and part of the WideTy initializer are
// not visible in this listing; confirm against the complete file.
798 Type *WideTy =
800 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
801}
802
803/// Return true if the given add can be sign-extended without changing its
804/// value.
805static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
806 Type *WideTy =
807 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
808 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
809}
810
811/// Return true if the given mul can be sign-extended without changing its
812/// value. As written, this holds when the sign-extension of M to WideTy is
/// still an SCEVMulExpr.
// NOTE(review): the first part of the WideTy initializer is not visible in
// this listing; the visible operand is the bit size of M's type multiplied by
// its number of operands. Confirm against the complete file.
813static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
814 Type *WideTy =
816 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
817 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
818}
819
820/// Return an expression for LHS /s RHS, if it can be determined and if the
821/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
822/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
823/// the multiplication may overflow, which is useful when the result will be
824/// used in a context where the most significant bits are ignored.
// NOTE(review): several lines of this function are not visible in this
// listing, including the declarations of RC, C, AR, Add, Mul, Ops and LOps and
// the conditions that open the constant, addrec, add and mul cases. The
// comments below describe only what the visible lines show; confirm against
// the complete file.
825static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
826 ScalarEvolution &SE,
827 bool IgnoreSignificantBits = false) {
828 // Handle the trivial case, which works for any SCEV type.
829 if (LHS == RHS)
830 return SE.getConstant(LHS->getType(), 1);
831
832 // Handle a few RHS special cases.
834 if (RC) {
835 const APInt &RA = RC->getAPInt();
836 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
837 // some folding.
838 if (RA.isAllOnes()) {
 // No result is produced for a pointer-typed LHS.
839 if (LHS->getType()->isPointerTy())
840 return nullptr;
841 return SE.getMulExpr(LHS, RC);
842 }
843 // Handle x /s 1 as x.
844 if (RA == 1)
845 return LHS;
846 }
847
848 // Check for a division of a constant by a constant.
850 if (!RC)
851 return nullptr;
852 const APInt &LA = C->getAPInt();
853 const APInt &RA = RC->getAPInt();
 // Only exact divisions are accepted.
854 if (LA.srem(RA) != 0)
855 return nullptr;
856 return SE.getConstant(LA.sdiv(RA));
857 }
858
859 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
861 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
862 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
863 IgnoreSignificantBits);
864 if (!Step) return nullptr;
865 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
866 IgnoreSignificantBits);
867 if (!Start) return nullptr;
868 // FlagNW is independent of the start value, step direction, and is
869 // preserved with smaller magnitude steps.
870 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
871 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
872 }
873 return nullptr;
874 }
875
876 // Distribute the sdiv over add operands, if the add doesn't overflow.
878 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
 // Every operand must divide exactly, otherwise the whole division fails.
880 for (const SCEV *S : Add->operands()) {
881 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
882 if (!Op) return nullptr;
883 Ops.push_back(Op);
884 }
885 return SE.getAddExpr(Ops);
886 }
887 return nullptr;
888 }
889
890 // Check for a multiply operand that we can pull RHS out of.
892 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
893 // Handle special case C1*X*Y /s C2*X*Y.
894 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
895 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
896 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
897 const SCEVConstant *RC =
898 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
899 if (LC && RC) {
901 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
 // With identical remaining operands only the leading constants are divided.
902 if (LOps == ROps)
903 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
904 }
905 }
906 }
907
 // Otherwise divide the first operand that divides exactly and keep the
 // remaining operands unchanged.
909 bool Found = false;
910 for (const SCEV *S : Mul->operands()) {
911 if (!Found)
912 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
913 IgnoreSignificantBits)) {
914 S = Q;
915 Found = true;
916 }
917 Ops.push_back(S);
918 }
919 return Found ? SE.getMulExpr(Ops) : nullptr;
920 }
921 return nullptr;
922 }
923
924 // Otherwise we don't know.
925 return nullptr;
926}
927
928/// Extracts an immediate operand from \p Ops and replaces the operand with
929/// zero. If \p PreferScalable is true and \p Ops contains both a scalable and
930/// a non-scalable offset, the scalable offset will be extracted.
// NOTE(review): the first line of the signature (function name and the Ops
// parameter) and the condition that recognizes a scalable immediate inside
// the loop are not visible in this listing; confirm against the complete file.
932 ScalarEvolution &SE,
933 bool PreferScalable) {
934 const APInt *C;
 // Op points at the operand to be zeroed once an immediate has been found.
935 SCEVUse *Op = nullptr;
936 Immediate Result = Immediate::getZero();
937
938 // Ops are sorted by their SCEVType (the order of SCEVTypes enum). So, for an
939 // AddExpr the possible order of operands is:
940 // Constant < VScale < Truncate < ZeroExtend < SignExtend < MulExpr < ...
941
942 // This means fixed-size immediates will always appear on the LHS:
943 SCEVUse &S = Ops.front();
 // Only non-zero constants that fit in 64 significant bits are taken.
944 if (match(S, m_scev_APInt(C)) && !C->isZero() &&
945 C->getSignificantBits() <= 64) {
946 Op = &S;
947 Result = Immediate::getFixed(C->getSExtValue());
948 }
949
950 // But scalable immediates, which are MulExpr(Vscale, Constant), can appear
951 // later in the operand list:
952 if (EnableVScaleImmediates && (Result.isZero() || PreferScalable)) {
953 for (SCEVUse &S : Ops) {
954 // We know anything past scMulExpr will not be a vscale immediate.
955 if (S->getSCEVType() > scMulExpr)
956 break;
958 Op = &S;
959 Result = Immediate::getScalable(C->getSExtValue());
960 break;
961 }
962 }
963 }
964
 // Replace the extracted operand with a zero constant of the same type.
965 if (Result.isNonZero()) {
966 SCEVUse &S = *Op;
967 S = SE.getConstant(S->getType(), 0);
968 }
969
970 return Result;
971}
972
973/// If S involves the addition of a constant integer value, return that integer
974/// value, and mutate S to point to a new SCEV with that value excluded.
/// A zero Immediate is returned, and S is left unchanged, when nothing was
/// extracted.
// NOTE(review): the last argument line of the getAddRecExpr call below is not
// visible in this listing; confirm against the complete file.
975static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE,
976 bool PreferScalable = false) {
977 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
 // Work on a copy of the operands: try a direct immediate operand first,
 // then recurse into the first operand.
978 SmallVector<SCEVUse, 8> NewOps(Add->operands());
979 Immediate Result = ExtractImmediateOperand(NewOps, SE, PreferScalable);
980 if (Result.isZero())
981 Result = ExtractImmediate(NewOps.front(), SE, PreferScalable);
982 if (Result.isNonZero())
983 S = SE.getAddExpr(NewOps);
984 return Result;
985 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
 // For an addrec only the first operand is searched.
986 SmallVector<SCEVUse, 8> NewOps(AR->operands());
987 Immediate Result = ExtractImmediate(NewOps.front(), SE, PreferScalable);
988 if (Result.isNonZero())
989 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
990 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
992 return Result;
993 }
 // Any other expression is treated as a single-element operand list.
994 return ExtractImmediateOperand({S}, SE, PreferScalable);
995}
996
997/// If S involves the addition of a GlobalValue address, return that symbol, and
998/// mutate S to point to a new SCEV with that value excluded.
1000 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
1001 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
1002 S = SE.getConstant(GV->getType(), 0);
1003 return GV;
1004 }
1005 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1006 SmallVector<SCEVUse, 8> NewOps(Add->operands());
1007 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
1008 if (Result)
1009 S = SE.getAddExpr(NewOps);
1010 return Result;
1011 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1012 SmallVector<SCEVUse, 8> NewOps(AR->operands());
1013 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
1014 if (Result)
1015 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
1016 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
1018 return Result;
1019 }
1020 return nullptr;
1021}
1022
1023/// Returns true if the specified instruction is using the specified value as an
1024/// address.
1026 Instruction *Inst, Value *OperandVal) {
1027 bool isAddress = isa<LoadInst>(Inst);
1028 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1029 if (SI->getPointerOperand() == OperandVal)
1030 isAddress = true;
1031 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1032 // Addressing modes can also be folded into prefetches and a variety
1033 // of intrinsics.
1034 switch (II->getIntrinsicID()) {
1035 case Intrinsic::memset:
1036 case Intrinsic::prefetch:
1037 case Intrinsic::masked_load:
1038 if (II->getArgOperand(0) == OperandVal)
1039 isAddress = true;
1040 break;
1041 case Intrinsic::masked_store:
1042 if (II->getArgOperand(1) == OperandVal)
1043 isAddress = true;
1044 break;
1045 case Intrinsic::memmove:
1046 case Intrinsic::memcpy:
1047 if (II->getArgOperand(0) == OperandVal ||
1048 II->getArgOperand(1) == OperandVal)
1049 isAddress = true;
1050 break;
1051 default: {
1052 MemIntrinsicInfo IntrInfo;
1053 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1054 if (IntrInfo.PtrVal == OperandVal)
1055 isAddress = true;
1056 }
1057 }
1058 }
1059 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1060 if (RMW->getPointerOperand() == OperandVal)
1061 isAddress = true;
1062 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1063 if (CmpX->getPointerOperand() == OperandVal)
1064 isAddress = true;
1065 }
1066 return isAddress;
1067}
1068
1069/// Return the type of the memory being accessed.
1070static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1071 Instruction *Inst, Value *OperandVal) {
1072 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1073
1074 // First get the type of memory being accessed.
1075 if (Type *Ty = Inst->getAccessType())
1076 AccessTy.MemTy = Ty;
1077
1078 // Then get the pointer address space.
1079 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1080 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1081 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1082 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1083 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1084 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1085 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1086 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1087 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1088 switch (II->getIntrinsicID()) {
1089 case Intrinsic::prefetch:
1090 case Intrinsic::memset:
1091 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1092 AccessTy.MemTy = OperandVal->getType();
1093 break;
1094 case Intrinsic::memmove:
1095 case Intrinsic::memcpy:
1096 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1097 AccessTy.MemTy = OperandVal->getType();
1098 break;
1099 case Intrinsic::masked_load:
1100 AccessTy.AddrSpace =
1101 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1102 break;
1103 case Intrinsic::masked_store:
1104 AccessTy.AddrSpace =
1105 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1106 break;
1107 default: {
1108 MemIntrinsicInfo IntrInfo;
1109 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1110 AccessTy.AddrSpace
1111 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1112 }
1113
1114 break;
1115 }
1116 }
1117 }
1118
1119 return AccessTy;
1120}
1121
1122/// Return true if this AddRec is already a phi in its loop.
1123static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1124 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1125 if (SE.isSCEVable(PN.getType()) &&
1126 (SE.getEffectiveSCEVType(PN.getType()) ==
1127 SE.getEffectiveSCEVType(AR->getType())) &&
1128 SE.getSCEV(&PN) == AR)
1129 return true;
1130 }
1131 return false;
1132}
1133
1134/// Check if expanding this expression is likely to incur significant cost. This
1135/// is tricky because SCEV doesn't track which expressions are actually computed
1136/// by the current IR.
1137///
1138/// We currently allow expansion of IV increments that involve adds,
1139/// multiplication by constants, and AddRecs from existing phis.
1140///
1141/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1142/// obvious multiple of the UDivExpr.
1143static bool isHighCostExpansion(const SCEV *S,
1145 ScalarEvolution &SE) {
1146 // Zero/One operand expressions
1147 switch (S->getSCEVType()) {
1148 case scUnknown:
1149 case scConstant:
1150 case scVScale:
1151 return false;
1152 case scTruncate:
1153 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1154 Processed, SE);
1155 case scZeroExtend:
1156 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1157 Processed, SE);
1158 case scSignExtend:
1159 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1160 Processed, SE);
1161 default:
1162 break;
1163 }
1164
1165 if (!Processed.insert(S).second)
1166 return false;
1167
1168 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1169 for (const SCEV *S : Add->operands()) {
1170 if (isHighCostExpansion(S, Processed, SE))
1171 return true;
1172 }
1173 return false;
1174 }
1175
1176 const SCEV *Op0, *Op1;
1177 if (match(S, m_scev_Mul(m_SCEV(Op0), m_SCEV(Op1)))) {
1178 // Multiplication by a constant is ok
1179 if (isa<SCEVConstant>(Op0))
1180 return isHighCostExpansion(Op1, Processed, SE);
1181
1182 // If we have the value of one operand, check if an existing
1183 // multiplication already generates this expression.
1184 if (const auto *U = dyn_cast<SCEVUnknown>(Op1)) {
1185 Value *UVal = U->getValue();
1186 for (User *UR : UVal->users()) {
1187 // If U is a constant, it may be used by a ConstantExpr.
1189 if (UI && UI->getOpcode() == Instruction::Mul &&
1190 SE.isSCEVable(UI->getType())) {
1191 return SE.getSCEV(UI) == S;
1192 }
1193 }
1194 }
1195 }
1196
1197 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1198 if (isExistingPhi(AR, SE))
1199 return false;
1200 }
1201
1202 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1203 return true;
1204}
1205
1206namespace {
1207
1208class LSRUse;
1209
1210} // end anonymous namespace
1211
1212/// Check if the addressing mode defined by \p F is completely
1213/// folded in \p LU at isel time.
1214/// This includes address-mode folding and special icmp tricks.
1215/// This function returns true if \p LU can accommodate what \p F
1216/// defines and up to 1 base + 1 scaled + offset.
1217/// In other words, if \p F has several base registers, this function may
1218/// still return true. Therefore, users still need to account for
1219/// additional base registers and/or unfolded offsets to derive an
1220/// accurate cost model.
1221static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1222 const LSRUse &LU, const Formula &F);
1223
1224// Get the cost of the scaling factor used in F for LU.
1225static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1226 const LSRUse &LU, const Formula &F,
1227 const Loop &L);
1228
1229namespace {
1230
1231/// This class is used to measure and compare candidate formulae.
1232class Cost {
1233 const Loop *L = nullptr;
1234 ScalarEvolution *SE = nullptr;
1235 const TargetTransformInfo *TTI = nullptr;
1236 TargetTransformInfo::LSRCost C;
1238
1239public:
1240 Cost() = delete;
1241 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1243 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1244 C.Insns = 0;
1245 C.NumRegs = 0;
1246 C.AddRecCost = 0;
1247 C.NumIVMuls = 0;
1248 C.NumBaseAdds = 0;
1249 C.ImmCost = 0;
1250 C.SetupCost = 0;
1251 C.ScaleCost = 0;
1252 }
1253
1254 bool isLess(const Cost &Other) const;
1255
1256 void Lose();
1257
1258#ifndef NDEBUG
1259 // Once any of the metrics loses, they must all remain losers.
1260 bool isValid() {
1261 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1262 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1263 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1264 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1265 }
1266#endif
1267
1268 bool isLoser() {
1269 assert(isValid() && "invalid cost");
1270 return C.NumRegs == ~0u;
1271 }
1272
1273 void RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1274 const DenseSet<const SCEV *> &VisitedRegs, const LSRUse &LU,
1275 bool HardwareLoopProfitable,
1276 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1277
1278 void print(raw_ostream &OS) const;
1279 void dump() const;
1280
1281private:
1282 void RateRegister(const Formula &F, const SCEV *Reg,
1283 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1284 bool HardwareLoopProfitable);
1285 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1286 SmallPtrSetImpl<const SCEV *> &Regs,
1287 const LSRUse &LU, bool HardwareLoopProfitable,
1288 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1289};
1290
1291/// An operand value in an instruction which is to be replaced with some
1292/// equivalent, possibly strength-reduced, replacement.
1293struct LSRFixup {
1294 /// The instruction which will be updated.
1295 Instruction *UserInst = nullptr;
1296
1297 /// The operand of the instruction which will be replaced. The operand may be
1298 /// used more than once; every instance will be replaced.
1299 Value *OperandValToReplace = nullptr;
1300
1301 /// If this user is to use the post-incremented value of an induction
1302 /// variable, this set is non-empty and holds the loops associated with the
1303 /// induction variable.
1304 PostIncLoopSet PostIncLoops;
1305
1306 /// A constant offset to be added to the LSRUse expression. This allows
1307 /// multiple fixups to share the same LSRUse with different offsets, for
1308 /// example in an unrolled loop.
1309 Immediate Offset = Immediate::getZero();
1310
1311 LSRFixup() = default;
1312
1313 bool isUseFullyOutsideLoop(const Loop *L) const;
1314
1315 void print(raw_ostream &OS) const;
1316 void dump() const;
1317};
1318
1319/// This class holds the state that LSR keeps for each use in IVUsers, as well
1320/// as uses invented by LSR itself. It includes information about what kinds of
1321/// things can be folded into the user, information about the user itself, and
1322/// information about how the use may be satisfied. TODO: Represent multiple
1323/// users of the same expression in common?
1324class LSRUse {
1325 DenseSet<SmallVector<const SCEV *, 4>> Uniquifier;
1326
1327public:
1328 /// An enum for a kind of use, indicating what types of scaled and immediate
1329 /// operands it might support.
1330 enum KindType {
1331 Basic, ///< A normal use, with no folding.
1332 Special, ///< A special case of basic, allowing -1 scales.
1333 Address, ///< An address use; folding according to TargetLowering
1334 ICmpZero ///< An equality icmp with both operands folded into one.
1335 // TODO: Add a generic icmp too?
1336 };
1337
1338 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1339
1340 KindType Kind;
1341 MemAccessTy AccessTy;
1342
1343 /// The list of operands which are to be replaced.
1345
1346 /// Keep track of the min and max offsets of the fixups.
1347 Immediate MinOffset = Immediate::getFixedMax();
1348 Immediate MaxOffset = Immediate::getFixedMin();
1349
1350 /// This records whether all of the fixups using this LSRUse are outside of
1351 /// the loop, in which case some special-case heuristics may be used.
1352 bool AllFixupsOutsideLoop = true;
1353
1354 /// This records whether all of the fixups using this LSRUse are unconditional
1355 /// within the loop, meaning they will be executed on every path to the loop
1356 /// latch. This includes fixups before early exits.
1357 bool AllFixupsUnconditional = true;
1358
1359 /// RigidFormula is set to true to guarantee that this use will be associated
1360 /// with a single formula--the one that initially matched. Some SCEV
1361 /// expressions cannot be expanded. This allows LSR to consider the registers
1362 /// used by those expressions without the need to expand them later after
1363 /// changing the formula.
1364 bool RigidFormula = false;
1365
1366 /// A list of ways to build a value that can satisfy this user. After the
1367 /// list is populated, one of these is selected heuristically and used to
1368 /// formulate a replacement for OperandValToReplace in UserInst.
1369 SmallVector<Formula, 12> Formulae;
1370
1371 /// The set of register candidates used by all formulae in this LSRUse.
1372 SmallPtrSet<const SCEV *, 4> Regs;
1373
1374 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1375
1376 LSRFixup &getNewFixup() {
1377 Fixups.push_back(LSRFixup());
1378 return Fixups.back();
1379 }
1380
1381 void pushFixup(LSRFixup &f) {
1382 Fixups.push_back(f);
1383 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1384 MaxOffset = f.Offset;
1385 if (Immediate::isKnownLT(f.Offset, MinOffset))
1386 MinOffset = f.Offset;
1387 }
1388
1389 bool HasFormulaWithSameRegs(const Formula &F) const;
1390 float getNotSelectedProbability(const SCEV *Reg) const;
1391 bool InsertFormula(const Formula &F, const Loop &L);
1392 void DeleteFormula(Formula &F);
1393 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1394
1395 void print(raw_ostream &OS) const;
1396 void dump() const;
1397};
1398
1399} // end anonymous namespace
1400
1401static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1402 LSRUse::KindType Kind, MemAccessTy AccessTy,
1403 GlobalValue *BaseGV, Immediate BaseOffset,
1404 bool HasBaseReg, int64_t Scale,
1405 Instruction *Fixup = nullptr);
1406
/// Estimate how much setup work the register expression \p Reg needs by
/// walking its operands recursively.
///
/// \param Reg Expression to cost.
/// \param Depth Remaining recursion budget; once it reaches zero, anything
///        other than an unknown or a constant contributes 0.
/// \param TTI Queried for the cost of materializing integer constants.
/// \returns the accumulated setup cost.
1407static unsigned getSetupCost(const SCEV *Reg, unsigned Depth,
1408 const TargetTransformInfo &TTI) {
// An opaque value always counts as one unit, independent of Depth.
1409 if (isa<SCEVUnknown>(Reg))
1410 return 1;
1411 if (const auto *C = dyn_cast<SCEVConstant>(Reg)) {
// NOTE(review): the remaining arguments of getIntImmCost and the comparison
// that selects the "return 0" path are missing from this copy. Presumably a
// constant the target reports as free costs 0 and any other costs 1 --
// confirm against upstream.
1412 if (TTI.getIntImmCost(C->getAPInt(), C->getType(),
1415 return 0;
1416 return 1;
1417 }
// Recursion budget exhausted: stop descending.
1418 if (Depth == 0)
1419 return 0;
// AddRecs are checked before the generic n-ary case; only the start value is
// costed for them.
1420 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1421 return getSetupCost(S->getStart(), Depth - 1, TTI);
1422 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1423 return getSetupCost(S->getOperand(), Depth - 1, TTI);
// N-ary expressions cost the sum of their operands' costs.
1424 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1425 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1426 [&](unsigned i, const SCEV *Reg) {
1427 return i + getSetupCost(Reg, Depth - 1, TTI);
1428 });
1429 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1430 return getSetupCost(S->getLHS(), Depth - 1, TTI) +
1431 getSetupCost(S->getRHS(), Depth - 1, TTI);
// Every other expression kind contributes nothing.
1432 return 0;
1433}
1434
1435/// Tally up interesting quantities from the given register.
1436void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1437 SmallPtrSetImpl<const SCEV *> &Regs, const LSRUse &LU,
1438 bool HardwareLoopProfitable) {
1439 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1440 // If this is an addrec for another loop, it should be an invariant
1441 // with respect to L since L is the innermost loop (at least
1442 // for now LSR only handles innermost loops).
1443 if (AR->getLoop() != L) {
1444 // If the AddRec exists, consider it's register free and leave it alone.
1445 if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
1446 return;
1447
1448 // It is bad to allow LSR for current loop to add induction variables
1449 // for its sibling loops.
1450 if (!AR->getLoop()->contains(L)) {
1451 Lose();
1452 return;
1453 }
1454
1455 // Otherwise, it will be an invariant with respect to Loop L.
1456 ++C.NumRegs;
1457 return;
1458 }
1459
1460 unsigned LoopCost = 1;
1461 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1462 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1463 const SCEV *Start;
1464 const APInt *Step;
1465 if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_scev_APInt(Step)))) {
1466 // If the step size matches the base offset, we could use pre-indexed
1467 // addressing.
1468 bool CanPreIndex = (AMK & TTI::AMK_PreIndexed) &&
1469 F.BaseOffset.isFixed() &&
1470 *Step == F.BaseOffset.getFixedValue();
1471 bool CanPostIndex = (AMK & TTI::AMK_PostIndexed) &&
1472 !isa<SCEVConstant>(Start) &&
1473 SE->isLoopInvariant(Start, L);
1474 // We can only pre or post index when the load/store is unconditional.
1475 if ((CanPreIndex || CanPostIndex) && LU.AllFixupsUnconditional)
1476 LoopCost = 0;
1477 }
1478 }
1479
1480 // If the loop counts down to zero and we'll be using a hardware loop then
1481 // the addrec will be combined into the hardware loop instruction.
1482 if (LU.Kind == LSRUse::ICmpZero && F.countsDownToZero() &&
1483 HardwareLoopProfitable)
1484 LoopCost = 0;
1485 C.AddRecCost += LoopCost;
1486
1487 // Add the step value register, if it needs one.
1488 // TODO: The non-affine case isn't precisely modeled here.
1489 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1490 if (!Regs.count(AR->getOperand(1))) {
1491 RateRegister(F, AR->getOperand(1), Regs, LU, HardwareLoopProfitable);
1492 if (isLoser())
1493 return;
1494 }
1495 }
1496 }
1497 ++C.NumRegs;
1498
1499 // Rough heuristic; favor registers which don't require extra setup
1500 // instructions in the preheader.
1501 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit, *TTI);
1502 // Ensure we don't, even with the recusion limit, produce invalid costs.
1503 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1504
1505 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1507}
1508
1509/// Record this register in the set. If we haven't seen it before, rate
1510/// it. Optional LoserRegs provides a way to declare any formula that refers to
1511/// one of those regs an instant loser.
1512void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1513 SmallPtrSetImpl<const SCEV *> &Regs,
1514 const LSRUse &LU, bool HardwareLoopProfitable,
1515 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1516 if (LoserRegs && LoserRegs->count(Reg)) {
1517 Lose();
1518 return;
1519 }
1520 if (Regs.insert(Reg).second) {
1521 RateRegister(F, Reg, Regs, LU, HardwareLoopProfitable);
1522 if (LoserRegs && isLoser())
1523 LoserRegs->insert(Reg);
1524 }
1525}
1526
1527void Cost::RateFormula(const Formula &F, SmallPtrSetImpl<const SCEV *> &Regs,
1528 const DenseSet<const SCEV *> &VisitedRegs,
1529 const LSRUse &LU, bool HardwareLoopProfitable,
1530 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1531 if (isLoser())
1532 return;
1533 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1534 // Tally up the registers.
1535 unsigned PrevAddRecCost = C.AddRecCost;
1536 unsigned PrevNumRegs = C.NumRegs;
1537 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1538 if (const SCEV *ScaledReg = F.ScaledReg) {
1539 if (VisitedRegs.count(ScaledReg)) {
1540 Lose();
1541 return;
1542 }
1543 RatePrimaryRegister(F, ScaledReg, Regs, LU, HardwareLoopProfitable,
1544 LoserRegs);
1545 if (isLoser())
1546 return;
1547 }
1548 for (const SCEV *BaseReg : F.BaseRegs) {
1549 if (VisitedRegs.count(BaseReg)) {
1550 Lose();
1551 return;
1552 }
1553 RatePrimaryRegister(F, BaseReg, Regs, LU, HardwareLoopProfitable,
1554 LoserRegs);
1555 if (isLoser())
1556 return;
1557 }
1558
1559 // Determine how many (unfolded) adds we'll need inside the loop.
1560 size_t NumBaseParts = F.getNumRegs();
1561 if (NumBaseParts > 1)
1562 // Do not count the base and a possible second register if the target
1563 // allows to fold 2 registers.
1564 C.NumBaseAdds +=
1565 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1566 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1567
1568 // Accumulate non-free scaling amounts.
1569 C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L).getValue();
1570
1571 // Tally up the non-zero immediates.
1572 for (const LSRFixup &Fixup : LU.Fixups) {
1573 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1574 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1575 if (F.BaseGV)
1576 C.ImmCost += 64; // Handle symbolic values conservatively.
1577 // TODO: This should probably be the pointer size.
1578 else if (Offset.isNonZero())
1579 C.ImmCost +=
1580 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1581
1582 // Check with target if this offset with this instruction is
1583 // specifically not supported.
1584 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1585 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1586 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1587 C.NumBaseAdds++;
1588 } else {
1589 // Incompatible immediate type, increase cost to avoid using
1590 C.ImmCost += 2048;
1591 }
1592 }
1593
1594 // If we don't count instruction cost exit here.
1595 if (!InsnsCost) {
1596 assert(isValid() && "invalid cost");
1597 return;
1598 }
1599
1600 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1601 // additional instruction (at least fill).
1602 // TODO: Need distinguish register class?
1603 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1604 TTI->getRegisterClassForType(false, F.getType())) - 1;
1605 if (C.NumRegs > TTIRegNum) {
1606 // Cost already exceeded TTIRegNum, then only newly added register can add
1607 // new instructions.
1608 if (PrevNumRegs > TTIRegNum)
1609 C.Insns += (C.NumRegs - PrevNumRegs);
1610 else
1611 C.Insns += (C.NumRegs - TTIRegNum);
1612 }
1613
1614 // If ICmpZero formula ends with not 0, it could not be replaced by
1615 // just add or sub. We'll need to compare final result of AddRec.
1616 // That means we'll need an additional instruction. But if the target can
1617 // macro-fuse a compare with a branch, don't count this extra instruction.
1618 // For -10 + {0, +, 1}:
1619 // i = i + 1;
1620 // cmp i, 10
1621 //
1622 // For {-10, +, 1}:
1623 // i = i + 1;
1624 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1625 !TTI->canMacroFuseCmp())
1626 C.Insns++;
1627 // Each new AddRec adds 1 instruction to calculation.
1628 C.Insns += (C.AddRecCost - PrevAddRecCost);
1629
1630 // BaseAdds adds instructions for unfolded registers.
1631 if (LU.Kind != LSRUse::ICmpZero)
1632 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1633 assert(isValid() && "invalid cost");
1634}
1635
1636/// Set this cost to a losing value.
1637void Cost::Lose() {
1638 C.Insns = std::numeric_limits<unsigned>::max();
1639 C.NumRegs = std::numeric_limits<unsigned>::max();
1640 C.AddRecCost = std::numeric_limits<unsigned>::max();
1641 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1642 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1643 C.ImmCost = std::numeric_limits<unsigned>::max();
1644 C.SetupCost = std::numeric_limits<unsigned>::max();
1645 C.ScaleCost = std::numeric_limits<unsigned>::max();
1646}
1647
1648/// Choose the lower cost.
1649bool Cost::isLess(const Cost &Other) const {
1650 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1651 C.Insns != Other.C.Insns)
1652 return C.Insns < Other.C.Insns;
1653 return TTI->isLSRCostLess(C, Other.C);
1654}
1655
1656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1657void Cost::print(raw_ostream &OS) const {
1658 if (InsnsCost)
1659 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1660 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1661 if (C.AddRecCost != 0)
1662 OS << ", with addrec cost " << C.AddRecCost;
1663 if (C.NumIVMuls != 0)
1664 OS << ", plus " << C.NumIVMuls << " IV mul"
1665 << (C.NumIVMuls == 1 ? "" : "s");
1666 if (C.NumBaseAdds != 0)
1667 OS << ", plus " << C.NumBaseAdds << " base add"
1668 << (C.NumBaseAdds == 1 ? "" : "s");
1669 if (C.ScaleCost != 0)
1670 OS << ", plus " << C.ScaleCost << " scale cost";
1671 if (C.ImmCost != 0)
1672 OS << ", plus " << C.ImmCost << " imm cost";
1673 if (C.SetupCost != 0)
1674 OS << ", plus " << C.SetupCost << " setup cost";
1675}
1676
1677LLVM_DUMP_METHOD void Cost::dump() const {
1678 print(errs()); errs() << '\n';
1679}
1680#endif
1681
1682/// Test whether this fixup always uses its value outside of the given loop.
1683bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1684 // PHI nodes use their value in their incoming blocks.
1685 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1686 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1687 if (PN->getIncomingValue(i) == OperandValToReplace &&
1688 L->contains(PN->getIncomingBlock(i)))
1689 return false;
1690 return true;
1691 }
1692
1693 return !L->contains(UserInst);
1694}
1695
1696#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1697void LSRFixup::print(raw_ostream &OS) const {
1698 OS << "UserInst=";
1699 // Store is common and interesting enough to be worth special-casing.
1700 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1701 OS << "store ";
1702 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1703 } else if (UserInst->getType()->isVoidTy())
1704 OS << UserInst->getOpcodeName();
1705 else
1706 UserInst->printAsOperand(OS, /*PrintType=*/false);
1707
1708 OS << ", OperandValToReplace=";
1709 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1710
1711 for (const Loop *PIL : PostIncLoops) {
1712 OS << ", PostIncLoop=";
1713 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1714 }
1715
1716 if (Offset.isNonZero())
1717 OS << ", Offset=" << Offset;
1718}
1719
1720LLVM_DUMP_METHOD void LSRFixup::dump() const {
1721 print(errs()); errs() << '\n';
1722}
1723#endif
1724
1725/// Test whether this use as a formula which has the same registers as the given
1726/// formula.
1727bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1729 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1730 // Unstable sort by host order ok, because this is only used for uniquifying.
1731 llvm::sort(Key);
1732 return Uniquifier.count(Key);
1733}
1734
1735/// The function returns a probability of selecting formula without Reg.
1736float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1737 unsigned FNum = 0;
1738 for (const Formula &F : Formulae)
1739 if (F.referencesReg(Reg))
1740 FNum++;
1741 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1742}
1743
1744/// If the given formula has not yet been inserted, add it to the list, and
1745/// return true. Return false otherwise. The formula must be in canonical form.
1746bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1747 assert(F.isCanonical(L) && "Invalid canonical representation");
1748
1749 if (!Formulae.empty() && RigidFormula)
1750 return false;
1751
1753 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1754 // Unstable sort by host order ok, because this is only used for uniquifying.
1755 llvm::sort(Key);
1756
1757 if (!Uniquifier.insert(Key).second)
1758 return false;
1759
1760 // Using a register to hold the value of 0 is not profitable.
1761 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1762 "Zero allocated in a scaled register!");
1763#ifndef NDEBUG
1764 for (const SCEV *BaseReg : F.BaseRegs)
1765 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1766#endif
1767
1768 // Add the formula to the list.
1769 Formulae.push_back(F);
1770
1771 // Record registers now being used by this use.
1772 Regs.insert_range(F.BaseRegs);
1773 if (F.ScaledReg)
1774 Regs.insert(F.ScaledReg);
1775
1776 return true;
1777}
1778
1779/// Remove the given formula from this use's list.
1780void LSRUse::DeleteFormula(Formula &F) {
1781 if (&F != &Formulae.back())
1782 std::swap(F, Formulae.back());
1783 Formulae.pop_back();
1784}
1785
1786/// Recompute the Regs field, and update RegUses.
1787void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1788 // Now that we've filtered out some formulae, recompute the Regs set.
1789 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1790 Regs.clear();
1791 for (const Formula &F : Formulae) {
1792 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1793 Regs.insert_range(F.BaseRegs);
1794 }
1795
1796 // Update the RegTracker.
1797 for (const SCEV *S : OldRegs)
1798 if (!Regs.count(S))
1799 RegUses.dropRegister(S, LUIdx);
1800}
1801
1802#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1803void LSRUse::print(raw_ostream &OS) const {
1804 OS << "LSR Use: Kind=";
1805 switch (Kind) {
1806 case Basic: OS << "Basic"; break;
1807 case Special: OS << "Special"; break;
1808 case ICmpZero: OS << "ICmpZero"; break;
1809 case Address:
1810 OS << "Address of ";
1811 if (AccessTy.MemTy->isPointerTy())
1812 OS << "pointer"; // the full pointer type could be really verbose
1813 else {
1814 OS << *AccessTy.MemTy;
1815 }
1816
1817 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1818 }
1819
1820 OS << ", Offsets={";
1821 bool NeedComma = false;
1822 for (const LSRFixup &Fixup : Fixups) {
1823 if (NeedComma) OS << ',';
1824 OS << Fixup.Offset;
1825 NeedComma = true;
1826 }
1827 OS << '}';
1828
1829 if (AllFixupsOutsideLoop)
1830 OS << ", all-fixups-outside-loop";
1831
1832 if (AllFixupsUnconditional)
1833 OS << ", all-fixups-unconditional";
1834}
1835
1836LLVM_DUMP_METHOD void LSRUse::dump() const {
1837 print(errs()); errs() << '\n';
1838}
1839#endif
1840
1842 LSRUse::KindType Kind, MemAccessTy AccessTy,
1843 GlobalValue *BaseGV, Immediate BaseOffset,
1844 bool HasBaseReg, int64_t Scale,
1845 Instruction *Fixup /* = nullptr */) {
1846 switch (Kind) {
1847 case LSRUse::Address: {
1848 int64_t FixedOffset =
1849 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1850 int64_t ScalableOffset =
1851 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1852 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1853 HasBaseReg, Scale, AccessTy.AddrSpace,
1854 Fixup, ScalableOffset);
1855 }
1856 case LSRUse::ICmpZero:
1857 // There's not even a target hook for querying whether it would be legal to
1858 // fold a GV into an ICmp.
1859 if (BaseGV)
1860 return false;
1861
1862 // ICmp only has two operands; don't allow more than two non-trivial parts.
1863 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1864 return false;
1865
1866 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1867 // putting the scaled register in the other operand of the icmp.
1868 if (Scale != 0 && Scale != -1)
1869 return false;
1870
1871 // If we have low-level target information, ask the target if it can fold an
1872 // integer immediate on an icmp.
1873 if (BaseOffset.isNonZero()) {
1874 // We don't have an interface to query whether the target supports
1875 // icmpzero against scalable quantities yet.
1876 if (BaseOffset.isScalable())
1877 return false;
1878
1879 // We have one of:
1880 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1881 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1882 // Offs is the ICmp immediate.
1883 if (Scale == 0)
1884 // The cast does the right thing with
1885 // std::numeric_limits<int64_t>::min().
1886 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1887 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1888 }
1889
1890 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1891 return true;
1892
1893 case LSRUse::Basic:
1894 // Only handle single-register values.
1895 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1896
1897 case LSRUse::Special:
1898 // Special case Basic to handle -1 scales.
1899 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1900 }
1901
1902 llvm_unreachable("Invalid LSRUse Kind!");
1903}
1904
1906 Immediate MinOffset, Immediate MaxOffset,
1907 LSRUse::KindType Kind, MemAccessTy AccessTy,
1908 GlobalValue *BaseGV, Immediate BaseOffset,
1909 bool HasBaseReg, int64_t Scale) {
1910 if (BaseOffset.isNonZero() &&
1911 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1912 BaseOffset.isScalable() != MaxOffset.isScalable()))
1913 return false;
1914 // Check for overflow.
1915 int64_t Base = BaseOffset.getKnownMinValue();
1916 int64_t Min = MinOffset.getKnownMinValue();
1917 int64_t Max = MaxOffset.getKnownMinValue();
1918 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1919 return false;
1920 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1921 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1922 return false;
1923 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1924
1925 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1926 HasBaseReg, Scale) &&
1927 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1928 HasBaseReg, Scale);
1929}
1930
1932 Immediate MinOffset, Immediate MaxOffset,
1933 LSRUse::KindType Kind, MemAccessTy AccessTy,
1934 const Formula &F, const Loop &L) {
1935 // For the purpose of isAMCompletelyFolded either having a canonical formula
1936 // or a scale not equal to zero is correct.
1937 // Problems may arise from non canonical formulae having a scale == 0.
1938 // Strictly speaking it would best to just rely on canonical formulae.
1939 // However, when we generate the scaled formulae, we first check that the
1940 // scaling factor is profitable before computing the actual ScaledReg for
1941 // compile time sake.
1942 assert((F.isCanonical(L) || F.Scale != 0));
1943 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1944 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1945}
1946
1947/// Test whether we know how to expand the current formula.
1948static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1949 Immediate MaxOffset, LSRUse::KindType Kind,
1950 MemAccessTy AccessTy, GlobalValue *BaseGV,
1951 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1952 // We know how to expand completely foldable formulae.
1953 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1954 BaseOffset, HasBaseReg, Scale) ||
1955 // Or formulae that use a base register produced by a sum of base
1956 // registers.
1957 (Scale == 1 &&
1958 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1959 BaseGV, BaseOffset, true, 0));
1960}
1961
1962static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1963 Immediate MaxOffset, LSRUse::KindType Kind,
1964 MemAccessTy AccessTy, const Formula &F) {
1965 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1966 F.BaseOffset, F.HasBaseReg, F.Scale);
1967}
1968
// Asks the target whether Offset is usable as an add immediate. Scalable
// offsets are checked through TTI.isLegalAddScalableImmediate using their
// known-minimum value; fixed offsets go through TTI.isLegalAddImmediate.
// NOTE(review): the opening line of this definition (linkage, return type,
// name and the TTI parameter) is absent from this listing; only the tail of
// the parameter list is visible. Restore it before compiling.
1970 Immediate Offset) {
1971 if (Offset.isScalable())
1972 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1973
1974 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1975}
1976
// Checks whether formula F folds completely into use LU.
//
// For address uses on targets where TTI.LSRWithInstrQueries() is true, every
// fixup is checked individually: its own offset is added to F.BaseOffset and
// its user instruction is passed along, and all fixups must fold. Otherwise
// the check is done once over the use's [MinOffset, MaxOffset] range.
// NOTE(review): the opening line of this definition (linkage, return type,
// name and the TTI parameter) is absent from this listing; only the tail of
// the parameter list is visible. Restore it before compiling.
1978 const LSRUse &LU, const Formula &F) {
1979 // Target may want to look at the user instructions.
1980 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1981 for (const LSRFixup &Fixup : LU.Fixups)
1982 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1983 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1984 F.Scale, Fixup.UserInst))
1985 return false;
1986 return true;
1987 }
1988
1989 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1990 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1991 F.Scale);
1992}
1993
// Computes the cost attributed to the scale of formula F when used by LU.
//
// - A zero scale costs nothing.
// - If F does not fold completely into the use, the result is 1 for any scale
//   other than 1 and 0 for a scale of 1.
// - For a completely folded address use, the target's getScalingFactorCost is
//   queried at both the minimum and maximum offset and the larger is returned.
// - Completely folded ICmpZero, Basic and Special uses cost nothing.
// NOTE(review): the opening line of this definition (linkage, return type,
// name and the TTI parameter) is absent from this listing; only the tail of
// the parameter list is visible. Restore it before compiling.
1995 const LSRUse &LU, const Formula &F,
1996 const Loop &L) {
1997 if (!F.Scale)
1998 return 0;
1999
2000 // If the use is not completely folded in that instruction, we will have to
2001 // pay an extra cost only for scale != 1.
2002 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
2003 LU.AccessTy, F, L))
2004 return F.Scale != 1;
2005
2006 switch (LU.Kind) {
2007 case LSRUse::Address: {
2008 // Check the scaling factor cost with both the min and max offsets.
// Only one pair (scalable or fixed) is filled in; the other stays zero.
2009 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
2010 if (F.BaseOffset.isScalable()) {
2011 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
2012 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
2013 } else {
2014 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
2015 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
2016 }
2017 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
2018 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
2019 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
2020 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
2021 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
2022 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
2023
// The formula was just shown to fold completely, so both costs are expected
// to be valid.
2024 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
2025 "Legal addressing mode has an illegal cost!");
2026 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
2027 }
2028 case LSRUse::ICmpZero:
2029 case LSRUse::Basic:
2030 case LSRUse::Special:
2031 // The use is completely folded, i.e., everything is folded into the
2032 // instruction.
2033 return 0;
2034 }
2035
2036 llvm_unreachable("Invalid LSRUse Kind!");
2037}
2038
// Conservative test of whether BaseGV + BaseOffset can be folded into a use
// of the given kind.
//
// A zero offset with no global is trivially foldable. Otherwise a scale is
// assumed (-1 for ICmpZero, 1 for everything else, with a lone scale of 1
// rewritten as a base register) and the result of isAMCompletelyFolded is
// returned.
// NOTE(review): the opening line of this definition (linkage, return type,
// name and the TTI parameter) is absent from this listing; only the tail of
// the parameter list is visible. Restore it before compiling.
2040 LSRUse::KindType Kind, MemAccessTy AccessTy,
2041 GlobalValue *BaseGV, Immediate BaseOffset,
2042 bool HasBaseReg) {
2043 // Fast-path: zero is always foldable.
2044 if (BaseOffset.isZero() && !BaseGV)
2045 return true;
2046
2047 // Conservatively, create an address with an immediate and a
2048 // base and a scale.
2049 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2050
2051 // Canonicalize a scale of 1 to a base register if the formula doesn't
2052 // already have a base register.
2053 if (!HasBaseReg && Scale == 1) {
2054 Scale = 0;
2055 HasBaseReg = true;
2056 }
2057
2058 // FIXME: Try with + without a scale? Maybe based on TTI?
2059 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2060 // default for many architectures, not just AArch64 SVE. More investigation
2061 // needed later to determine if this should be used more widely than just
2062 // on scalable types.
// For scalable memory types (when DropScaledForVScale is set) the scaled
// register is dropped from the query, leaving base register + offset.
2063 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2064 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2065 Scale = 0;
2066
2067 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2068 HasBaseReg, Scale);
2069}
2070
// Conservative test of whether the SCEV expression S can be folded into a use
// of the given kind over the offset range [MinOffset, MaxOffset].
//
// S is decomposed into an immediate and a symbol (global); if anything else
// remains, or the immediate is scalable, the answer is false. A zero
// expression, or one with neither immediate nor symbol, is trivially foldable.
// Otherwise the decision is delegated to isAMCompletelyFolded with a scale of
// -1 for ICmpZero uses and 1 for all others.
// NOTE(review): the opening line of this definition (linkage, return type,
// name and the TTI parameter) is absent from this listing; only the tail of
// the parameter list is visible. Restore it before compiling.
2072 ScalarEvolution &SE, Immediate MinOffset,
2073 Immediate MaxOffset, LSRUse::KindType Kind,
2074 MemAccessTy AccessTy, const SCEV *S,
2075 bool HasBaseReg) {
2076 // Fast-path: zero is always foldable.
2077 if (S->isZero()) return true;
2078
2079 // Conservatively, create an address with an immediate and a
2080 // base and a scale.
// SCopy is passed to the extraction helpers below and is expected to hold
// only what they could not extract.
2081 SCEVUse SCopy = S;
2082 Immediate BaseOffset = ExtractImmediate(SCopy, SE);
2083 GlobalValue *BaseGV = ExtractSymbol(SCopy, SE);
2084
2085 // If there's anything else involved, it's not foldable.
2086 if (!SCopy->isZero())
2087 return false;
2088
2089 // Fast-path: zero is always foldable.
2090 if (BaseOffset.isZero() && !BaseGV)
2091 return true;
2092
// Scalable immediates are conservatively treated as not foldable here.
2093 if (BaseOffset.isScalable())
2094 return false;
2095
2096 // Conservatively, create an address with an immediate and a
2097 // base and a scale.
2098 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2099
2100 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2101 BaseOffset, HasBaseReg, Scale);
2102}
2103
2104namespace {
2105
2106/// An individual increment in a Chain of IV increments. Relate an IV user to
2107/// an expression that computes the IV it uses from the IV used by the previous
2108/// link in the Chain.
2109///
2110/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2111/// original IVOperand. The head of the chain's IVOperand is only valid during
2112/// chain collection, before LSR replaces IV users. During chain generation,
2113/// IncExpr can be used to find the new IVOperand that computes the same
2114/// expression.
2115struct IVInc {
2116 Instruction *UserInst;
2117 Value* IVOperand;
2118 const SCEV *IncExpr;
2119
2120 IVInc(Instruction *U, Value *O, const SCEV *E)
2121 : UserInst(U), IVOperand(O), IncExpr(E) {}
2122};
2123
2124// The list of IV increments in program order. We typically add the head of a
2125// chain without finding subsequent links.
2126struct IVChain {
2128 const SCEV *ExprBase = nullptr;
2129
2130 IVChain() = default;
2131 IVChain(const IVInc &Head, const SCEV *Base)
2132 : Incs(1, Head), ExprBase(Base) {}
2133
2134 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2135
2136 // Return the first increment in the chain.
2137 const_iterator begin() const {
2138 assert(!Incs.empty());
2139 return std::next(Incs.begin());
2140 }
2141 const_iterator end() const {
2142 return Incs.end();
2143 }
2144
2145 // Returns true if this chain contains any increments.
2146 bool hasIncs() const { return Incs.size() >= 2; }
2147
2148 // Add an IVInc to the end of this chain.
2149 void add(const IVInc &X) { Incs.push_back(X); }
2150
2151 // Returns the last UserInst in the chain.
2152 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2153
2154 // Returns true if IncExpr can be profitably added to this chain.
2155 bool isProfitableIncrement(const SCEV *OperExpr,
2156 const SCEV *IncExpr,
2157 ScalarEvolution&);
2158};
2159
2160/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2161/// between FarUsers that definitely cross IV increments and NearUsers that may
2162/// be used between IV increments.
2163struct ChainUsers {
2164 SmallPtrSet<Instruction*, 4> FarUsers;
2165 SmallPtrSet<Instruction*, 4> NearUsers;
2166};
2167
2168/// This class holds state for the main loop strength reduction logic.
// NOTE(review): several declaration lines of this class are absent from this
// listing (each gap is marked below). Restore them from the original file
// before compiling.
2169class LSRInstance {
// Analyses and target information the pass works with; all are supplied
// through the constructor's parameters.
2170 IVUsers &IU;
2171 ScalarEvolution &SE;
2172 DominatorTree &DT;
2173 LoopInfo &LI;
2174 AssumptionCache &AC;
2175 TargetLibraryInfo &TLI;
2176 const TargetTransformInfo &TTI;
2177 Loop *const L;
2178 MemorySSAUpdater *MSSAU;
// NOTE(review): one member declaration is missing from the listing here.
2180 mutable SCEVExpander Rewriter;
// Set when the pass modified the IR; reported through getChanged().
2181 bool Changed = false;
2182 bool HardwareLoopProfitable = false;
2183 bool ShouldPreserveLCSSA = false;
2184
2185 /// This is the insert position that the current loop's induction variable
2186 /// increment should be placed. In simple loops, this is the latch block's
2187 /// terminator. But in more complicated cases, this is a position which will
2188 /// dominate all the in-loop post-increment users.
2189 Instruction *IVIncInsertPos = nullptr;
2190
2191 /// Interesting factors between use strides.
2192 ///
2193 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2194 /// default, a SmallDenseSet, because we need to use the full range of
2195 /// int64_ts, and there's currently no good way of doing that with
2196 /// SmallDenseSet.
2197 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2198
2199 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2200 /// the solution is not profitable.
2201 Cost BaselineCost;
2202
2203 /// Interesting use types, to facilitate truncation reuse.
2204 SmallSetVector<Type *, 4> Types;
2205
2206 /// The list of interesting uses.
// NOTE(review): the member declaration this comment documents is missing from
// the listing.
2208
2209 /// Track which uses use which register candidates.
2210 RegUseTracker RegUses;
2211
2212 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2213 // have more than a few IV increment chains in a loop. Missing a Chain falls
2214 // back to normal LSR behavior for those uses.
2215 static const unsigned MaxChains = 8;
2216
2217 /// IV users can form a chain of IV increments.
// NOTE(review): the member declaration this comment documents is missing from
// the listing.
2219
2220 /// IV users that belong to profitable IVChains.
2221 SmallPtrSet<Use*, MaxChains> IVIncSet;
2222
2223 /// Induction variables that were generated and inserted by the SCEV Expander.
2224 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2225
2226 // Inserting instructions in the loop and using them as PHI's input could
2227 // break LCSSA in case if PHI's parent block is not a loop exit (i.e. the
2228 // corresponding incoming block is not loop exiting). So collect all such
2229 // instructions to form LCSSA for them later.
2230 SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
2231
// Optimizations applied to the loop's IVs and its terminating condition.
2232 void OptimizeShadowIV();
2233 bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2234 Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
2235 void OptimizeLoopTermCond();
2236
// IV increment chain collection and generation.
2237 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2238 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2239 void FinalizeChain(IVChain &Chain);
2240 void CollectChains();
2241 void GenerateIVChain(const IVChain &Chain,
2242 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2243
2244 void CollectInterestingTypesAndFactors();
2245 void CollectFixupsAndInitialFormulae();
2246
2247 // Support for sharing of LSRUses between LSRFixups.
2248 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2249 UseMapTy UseMap;
2250
2251 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2252 LSRUse::KindType Kind, MemAccessTy AccessTy);
2253
2254 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2255 MemAccessTy AccessTy);
2256
2257 void DeleteUse(LSRUse &LU, size_t LUIdx);
2258
2259 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2260
2261 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2262 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2263 void CountRegisters(const Formula &F, size_t LUIdx);
2264 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2265 bool IsFixupExecutedEachIncrement(const LSRFixup &LF) const;
2266
2267 void CollectLoopInvariantFixupsAndFormulae();
2268
// Formula generators.
2269 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2270 unsigned Depth = 0);
2271
2272 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2273 const Formula &Base, unsigned Depth,
2274 size_t Idx, bool IsScaledReg = false);
2275 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2276 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2277 const Formula &Base, size_t Idx,
2278 bool IsScaledReg = false);
2279 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2280 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2281 const Formula &Base,
2282 const SmallVectorImpl<Immediate> &Worklist,
2283 size_t Idx, bool IsScaledReg = false);
2284 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2285 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2286 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2287 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2288 void GenerateCrossUseConstantOffsets();
2289 void GenerateAllReuseFormulae();
2290
2291 void FilterOutUndesirableDedicatedRegisters();
2292
// Search-space estimation and pruning.
2293 size_t EstimateSearchSpaceComplexity() const;
2294 void NarrowSearchSpaceByDetectingSupersets();
2295 void NarrowSearchSpaceByCollapsingUnrolledCode();
2296 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2297 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2298 void NarrowSearchSpaceByFilterPostInc();
2299 void NarrowSearchSpaceByMergingUsesOutsideLoop();
2300 void NarrowSearchSpaceByDeletingCostlyFormulas();
2301 void NarrowSearchSpaceByPickingWinnerRegs();
2302 void NarrowSearchSpaceUsingHeuristics();
2303
// Solver: picks one formula per use.
2304 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2305 Cost &SolutionCost,
2306 SmallVectorImpl<const Formula *> &Workspace,
2307 const Cost &CurCost,
2308 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2309 DenseSet<const SCEV *> &VisitedRegs) const;
2310 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2311
// NOTE(review): the line carrying the return type of HoistInsertPosition is
// missing from the listing.
2313 HoistInsertPosition(BasicBlock::iterator IP,
2314 const SmallVectorImpl<Instruction *> &Inputs) const;
2315 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2316 const LSRFixup &LF,
2317 const LSRUse &LU) const;
2318
2319 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
// NOTE(review): one parameter line of Expand is missing from the listing here.
2321 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2322 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2323 const Formula &F,
2324 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2325 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2326 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2327 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2328
2329public:
2330 // TODO(boomanaiden154): The PreserveLCSSA flag is a hack to allow
2331 // experimentation with the NewPM which requires LCSSA preservation while
2332 // some of the details are worked out in LSR. Eventually it should be set
2333 // to true and removed.
2334 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2335 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2336 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU,
2337 bool PreserveLCSSA);
2338
// Returns the value of the Changed flag.
2339 bool getChanged() const { return Changed; }
// Read-only access to the IVs recorded in ScalarEvolutionIVs.
2340 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2341 return ScalarEvolutionIVs;
2342 }
2343
// Debug printing helpers.
2344 void print_factors_and_types(raw_ostream &OS) const;
2345 void print_fixups(raw_ostream &OS) const;
2346 void print_uses(raw_ostream &OS) const;
2347 void print(raw_ostream &OS) const;
2348 void dump() const;
2349};
2350
2351} // end anonymous namespace
2352
2353/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2354/// the cast operation.
2355void LSRInstance::OptimizeShadowIV() {
2356 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2357 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2358 return;
2359
2360 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2361 UI != E; /* empty */) {
2362 IVUsers::const_iterator CandidateUI = UI;
2363 ++UI;
2364 Instruction *ShadowUse = CandidateUI->getUser();
2365 Type *DestTy = nullptr;
2366 bool IsSigned = false;
2367
2368 /* If shadow use is a int->float cast then insert a second IV
2369 to eliminate this cast.
2370
2371 for (unsigned i = 0; i < n; ++i)
2372 foo((double)i);
2373
2374 is transformed into
2375
2376 double d = 0.0;
2377 for (unsigned i = 0; i < n; ++i, ++d)
2378 foo(d);
2379 */
2380 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2381 IsSigned = false;
2382 DestTy = UCast->getDestTy();
2383 }
2384 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2385 IsSigned = true;
2386 DestTy = SCast->getDestTy();
2387 }
2388 if (!DestTy) continue;
2389
2390 // If target does not support DestTy natively then do not apply
2391 // this transformation.
2392 if (!TTI.isTypeLegal(DestTy)) continue;
2393
2394 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2395 if (!PH) continue;
2396 if (PH->getNumIncomingValues() != 2) continue;
2397
2398 // If the calculation in integers overflows, the result in FP type will
2399 // differ. So we only can do this transformation if we are guaranteed to not
2400 // deal with overflowing values
2401 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2402 if (!AR) continue;
2403 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2404 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2405
2406 Type *SrcTy = PH->getType();
2407 int Mantissa = DestTy->getFPMantissaWidth();
2408 if (Mantissa == -1) continue;
2409 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2410 continue;
2411
2412 unsigned Entry, Latch;
2413 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2414 Entry = 0;
2415 Latch = 1;
2416 } else {
2417 Entry = 1;
2418 Latch = 0;
2419 }
2420
2421 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2422 if (!Init) continue;
2423 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2424 (double)Init->getSExtValue() :
2425 (double)Init->getZExtValue());
2426
2427 BinaryOperator *Incr =
2429 if (!Incr) continue;
2430 if (Incr->getOpcode() != Instruction::Add
2431 && Incr->getOpcode() != Instruction::Sub)
2432 continue;
2433
2434 /* Initialize new IV, double d = 0.0 in above example. */
2435 ConstantInt *C = nullptr;
2436 if (Incr->getOperand(0) == PH)
2438 else if (Incr->getOperand(1) == PH)
2440 else
2441 continue;
2442
2443 if (!C) continue;
2444
2445 // Ignore negative constants, as the code below doesn't handle them
2446 // correctly. TODO: Remove this restriction.
2447 if (!C->getValue().isStrictlyPositive())
2448 continue;
2449
2450 /* Add new PHINode. */
2451 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2452 NewPH->setDebugLoc(PH->getDebugLoc());
2453
2454 /* create new increment. '++d' in above example. */
2455 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2456 BinaryOperator *NewIncr = BinaryOperator::Create(
2457 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2458 : Instruction::FSub,
2459 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2460 NewIncr->setDebugLoc(Incr->getDebugLoc());
2461
2462 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2463 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2464
2465 /* Remove cast operation */
2466 ShadowUse->replaceAllUsesWith(NewPH);
2467 ShadowUse->eraseFromParent();
2468 Changed = true;
2469 break;
2470 }
2471}
2472
2473/// If Cond has an operand that is an expression of an IV, set the IV user and
2474/// stride information and return true, otherwise return false.
2475bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
2476 for (IVStrideUse &U : IU)
2477 if (U.getUser() == Cond) {
2478 // NOTE: we could handle setcc instructions with multiple uses here, but
2479 // InstCombine does it as well for simple uses, it's not clear that it
2480 // occurs enough in real life to handle.
2481 CondUse = &U;
2482 return true;
2483 }
2484 return false;
2485}
2486
2487/// Rewrite the loop's terminating condition if it uses a max computation.
2488///
2489/// This is a narrow solution to a specific, but acute, problem. For loops
2490/// like this:
2491///
2492/// i = 0;
2493/// do {
2494/// p[i] = 0.0;
2495/// } while (++i < n);
2496///
2497/// the trip count isn't just 'n', because 'n' might not be positive. And
2498/// unfortunately this can come up even for loops where the user didn't use
2499/// a C do-while loop. For example, seemingly well-behaved top-test loops
2500/// will commonly be lowered like this:
2501///
2502/// if (n > 0) {
2503/// i = 0;
2504/// do {
2505/// p[i] = 0.0;
2506/// } while (++i < n);
2507/// }
2508///
2509/// and then it's possible for subsequent optimization to obscure the if
2510/// test in such a way that indvars can't find it.
2511///
2512/// When indvars can't find the if test in loops like this, it creates a
2513/// max expression, which allows it to give the loop a canonical
2514/// induction variable:
2515///
2516/// i = 0;
2517/// max = n < 1 ? 1 : n;
2518/// do {
2519/// p[i] = 0.0;
2520/// } while (++i != max);
2521///
2522/// Canonical induction variables are necessary because the loop passes
2523/// are designed around them. The most obvious example of this is the
2524/// LoopInfo analysis, which doesn't remember trip count values. It
2525/// expects to be able to rediscover the trip count each time it is
2526/// needed, and it does this using a simple analysis that only succeeds if
2527/// the loop has a canonical induction variable.
2528///
2529/// However, when it comes time to generate code, the maximum operation
2530/// can be quite costly, especially if it's inside of an outer loop.
2531///
2532/// This function solves this problem by detecting this type of loop and
2533/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2534/// the instructions for the maximum computation.
///
/// Returns \p Cond unchanged when the pattern does not match. On success the
/// new compare is returned, \p CondUse is redirected to it, and the old
/// compare and the select are erased.
// NOTE(review): two lines of this function are absent from this listing (the
// remainder of the match(...) pattern and the declaration of Cmp); each gap
// is marked below.
2535Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
2536 // Check that the loop matches the pattern we're looking for.
2537 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2538 Cond->getPredicate() != CmpInst::ICMP_NE)
2539 return Cond;
2540
// The right-hand side of the compare must be a select with no other users.
2541 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2542 if (!Sel || !Sel->hasOneUse()) return Cond;
2543
2544 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2545 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2546 return Cond;
2547 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2548
2549 // Add one to the backedge-taken count to get the trip count.
2550 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2551 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2552
2553 // Check for a max calculation that matches the pattern. There's no check
2554 // for ICMP_ULE here because the comparison would be with zero, which
2555 // isn't interesting.
2556 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2557 const SCEVNAryExpr *Max = nullptr;
2558 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2559 Pred = ICmpInst::ICMP_SLE;
2560 Max = S;
2561 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2562 Pred = ICmpInst::ICMP_SLT;
2563 Max = S;
2564 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2565 Pred = ICmpInst::ICMP_ULT;
2566 Max = U;
2567 } else {
2568 // No match; bail.
2569 return Cond;
2570 }
2571
2572 // To handle a max with more than two operands, this optimization would
2573 // require additional checking and setup.
2574 if (Max->getNumOperands() != 2)
2575 return Cond;
2576
2577 const SCEV *MaxLHS = Max->getOperand(0);
2578 const SCEV *MaxRHS = Max->getOperand(1);
2579
2580 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2581 // for a comparison with 1. For <= and >=, a comparison with zero.
2582 if (!MaxLHS ||
2583 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2584 return Cond;
2585
2586 // Check the relevant induction variable for conformance to
2587 // the pattern.
2588 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2589 if (!match(IV,
// NOTE(review): the line completing this match(...) pattern is missing from
// the listing.
2591 return Cond;
2592
2593 assert(cast<SCEVAddRecExpr>(IV)->getLoop() == L &&
2594 "Loop condition operand is an addrec in a different loop!");
2595
2596 // Check the right operand of the select, and remember it, as it will
2597 // be used in the new comparison instruction.
2598 Value *NewRHS = nullptr;
2599 if (ICmpInst::isTrueWhenEqual(Pred)) {
2600 // Look for n+1, and grab n.
// Both select arms are inspected; if both match, the second one wins.
2601 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2602 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2603 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2604 NewRHS = BO->getOperand(0);
2605 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2606 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2607 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2608 NewRHS = BO->getOperand(0);
2609 if (!NewRHS)
2610 return Cond;
2611 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2612 NewRHS = Sel->getOperand(1);
2613 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2614 NewRHS = Sel->getOperand(2);
2615 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2616 NewRHS = SU->getValue();
2617 else
2618 // Max doesn't match expected pattern.
2619 return Cond;
2620
2621 // Determine the new comparison opcode. It may be signed or unsigned,
2622 // and the original comparison may be either equality or inequality.
2623 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2624 Pred = CmpInst::getInversePredicate(Pred);
2625
2626 // Ok, everything looks ok to change the condition into an SLT or SGE and
2627 // delete the max calculation.
2628 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2629 Cond->getOperand(0), NewRHS, "scmp");
2630
2631 // Delete the max calculation instructions.
2632 NewCond->setDebugLoc(Cond->getDebugLoc());
2633 Cond->replaceAllUsesWith(NewCond);
2634 CondUse->setUser(NewCond);
// NOTE(review): the declaration of Cmp (used below) is missing from the
// listing here.
2636 Cond->eraseFromParent();
2637 Sel->eraseFromParent();
2638 if (Cmp->use_empty()) {
2639 salvageDebugInfo(*Cmp);
2640 Cmp->eraseFromParent();
2641 }
2642 return NewCond;
2643}
2644
2645/// Change loop terminating condition to use the postinc iv when possible.
2646void
2647LSRInstance::OptimizeLoopTermCond() {
2648 SmallPtrSet<Instruction *, 4> PostIncs;
2649
2650 // We need a different set of heuristics for rotated and non-rotated loops.
2651 // If a loop is rotated then the latch is also the backedge, so inserting
2652 // post-inc expressions just before the latch is ideal. To reduce live ranges
2653 // it also makes sense to rewrite terminating conditions to use post-inc
2654 // expressions.
2655 //
2656 // If the loop is not rotated then the latch is not a backedge; the latch
2657 // check is done in the loop head. Adding post-inc expressions before the
2658 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2659 // in the loop body. In this case we do *not* want to use post-inc expressions
2660 // in the latch check, and we want to insert post-inc expressions before
2661 // the backedge.
2662 BasicBlock *LatchBlock = L->getLoopLatch();
2663 SmallVector<BasicBlock*, 8> ExitingBlocks;
2664 L->getExitingBlocks(ExitingBlocks);
2665 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2666 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2667 IVIncInsertPos = LatchBlock->getTerminator();
2668 return;
2669 }
2670
2671 // Otherwise treat this as a rotated loop.
2672 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2673 // Get the terminating condition for the loop if possible. If we
2674 // can, we want to change it to use a post-incremented version of its
2675 // induction variable, to allow coalescing the live ranges for the IV into
2676 // one register value.
2677
2678 CondBrInst *TermBr = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
2679 if (!TermBr)
2680 continue;
2681
2683 // If the argument to TermBr is an extractelement, then the source of that
2684 // instruction is what's generated the condition.
2686 if (Extract)
2687 Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2688 // FIXME: We could do more here, like handling logical operations where one
2689 // side is a cmp that uses an induction variable.
2690 if (!Cond)
2691 continue;
2692
2693 // Search IVUsesByStride to find Cond's IVUse if there is one.
2694 IVStrideUse *CondUse = nullptr;
2695 if (!FindIVUserForCond(Cond, CondUse))
2696 continue;
2697
2698 // If the trip count is computed in terms of a max (due to ScalarEvolution
2699 // being unable to find a sufficient guard, for example), change the loop
2700 // comparison to use SLT or ULT instead of NE.
2701 // One consequence of doing this now is that it disrupts the count-down
2702 // optimization. That's not always a bad thing though, because in such
2703 // cases it may still be worthwhile to avoid a max.
2704 if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2705 Cond = OptimizeMax(Cmp, CondUse);
2706
2707 // If this exiting block dominates the latch block, it may also use
2708 // the post-inc value if it won't be shared with other uses.
2709 // Check for dominance.
2710 if (!DT.dominates(ExitingBlock, LatchBlock))
2711 continue;
2712
2713 // Conservatively avoid trying to use the post-inc value in non-latch
2714 // exits if there may be pre-inc users in intervening blocks.
2715 if (LatchBlock != ExitingBlock)
2716 for (const IVStrideUse &UI : IU)
2717 // Test if the use is reachable from the exiting block. This dominator
2718 // query is a conservative approximation of reachability.
2719 if (&UI != CondUse &&
2720 !DT.properlyDominates(UI.getUser()->getParent(), ExitingBlock)) {
2721 // Conservatively assume there may be reuse if the quotient of their
2722 // strides could be a legal scale.
2723 const SCEV *A = IU.getStride(*CondUse, L);
2724 const SCEV *B = IU.getStride(UI, L);
2725 if (!A || !B) continue;
2726 if (SE.getTypeSizeInBits(A->getType()) !=
2727 SE.getTypeSizeInBits(B->getType())) {
2728 if (SE.getTypeSizeInBits(A->getType()) >
2729 SE.getTypeSizeInBits(B->getType()))
2730 B = SE.getSignExtendExpr(B, A->getType());
2731 else
2732 A = SE.getSignExtendExpr(A, B->getType());
2733 }
2734 if (const SCEVConstant *D =
2736 const ConstantInt *C = D->getValue();
2737 // Stride of one or negative one can have reuse with non-addresses.
2738 if (C->isOne() || C->isMinusOne())
2739 goto decline_post_inc;
2740 // Avoid weird situations.
2741 if (C->getValue().getSignificantBits() >= 64 ||
2742 C->getValue().isMinSignedValue())
2743 goto decline_post_inc;
2744 // Check for possible scaled-address reuse.
2745 if (isAddressUse(TTI, UI.getUser(), UI.getOperandValToReplace())) {
2746 MemAccessTy AccessTy =
2747 getAccessType(TTI, UI.getUser(), UI.getOperandValToReplace());
2748 int64_t Scale = C->getSExtValue();
2749 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2750 /*BaseOffset=*/0,
2751 /*HasBaseReg=*/true, Scale,
2752 AccessTy.AddrSpace))
2753 goto decline_post_inc;
2754 Scale = -Scale;
2755 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2756 /*BaseOffset=*/0,
2757 /*HasBaseReg=*/true, Scale,
2758 AccessTy.AddrSpace))
2759 goto decline_post_inc;
2760 }
2761 }
2762 }
2763
2764 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2765 << *Cond << '\n');
2766
2767 // It's possible for the setcc instruction to be anywhere in the loop, and
2768 // possible for it to have multiple users. If it is not immediately before
2769 // the exiting block branch, move it.
2770 if (isa_and_nonnull<CmpInst>(Cond) && Cond->getNextNode() != TermBr &&
2771 !Extract) {
2772 if (Cond->hasOneUse()) {
2773 Cond->moveBefore(TermBr->getIterator());
2774 } else {
2775 // Clone the terminating condition and insert into the loopend.
2776 Instruction *OldCond = Cond;
2777 Cond = Cond->clone();
2778 Cond->setName(L->getHeader()->getName() + ".termcond");
2779 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2780
2781 // Clone the IVUse, as the old use still exists!
2782 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2783 TermBr->replaceUsesOfWith(OldCond, Cond);
2784 }
2785 }
2786
2787 // If we get to here, we know that we can transform the setcc instruction to
2788 // use the post-incremented version of the IV, allowing us to coalesce the
2789 // live ranges for the IV correctly.
2790 CondUse->transformToPostInc(L);
2791 Changed = true;
2792
2793 PostIncs.insert(Cond);
2794 decline_post_inc:;
2795 }
2796
2797 // Determine an insertion point for the loop induction variable increment. It
2798 // must dominate all the post-inc comparisons we just set up, and it must
2799 // dominate the loop latch edge.
2800 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2801 for (Instruction *Inst : PostIncs)
2802 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2803}
2804
2805/// Determine if the given use can accommodate a fixup at the given offset and
2806/// other details. If so, update the use and return true.
2807bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2808 bool HasBaseReg, LSRUse::KindType Kind,
2809 MemAccessTy AccessTy) {
2810 Immediate NewMinOffset = LU.MinOffset;
2811 Immediate NewMaxOffset = LU.MaxOffset;
2812 MemAccessTy NewAccessTy = AccessTy;
2813
2814 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2815 // something conservative, however this can pessimize in the case that one of
2816 // the uses will have all its uses outside the loop, for example.
2817 if (LU.Kind != Kind)
2818 return false;
2819
2820 // Check for a mismatched access type, and fall back conservatively as needed.
2821 // TODO: Be less conservative when the type is similar and can use the same
2822 // addressing modes.
2823 if (Kind == LSRUse::Address) {
2824 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2825 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2826 AccessTy.AddrSpace);
2827 }
2828 }
2829
2830 // Conservatively assume HasBaseReg is true for now.
2831 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2832 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2833 LU.MaxOffset - NewOffset, HasBaseReg))
2834 return false;
2835 NewMinOffset = NewOffset;
2836 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2837 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2838 NewOffset - LU.MinOffset, HasBaseReg))
2839 return false;
2840 NewMaxOffset = NewOffset;
2841 }
2842
2843 // FIXME: We should be able to handle some level of scalable offset support
2844 // for 'void', but in order to get basic support up and running this is
2845 // being left out.
2846 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2847 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2848 return false;
2849
2850 // Update the use.
2851 LU.MinOffset = NewMinOffset;
2852 LU.MaxOffset = NewMaxOffset;
2853 LU.AccessTy = NewAccessTy;
2854 return true;
2855}
2856
2857/// Return an LSRUse index and an offset value for a fixup which needs the given
2858/// expression, with the given kind and optional access type. Either reuse an
2859/// existing use or create a new one, as needed.
2860std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2861 LSRUse::KindType Kind,
2862 MemAccessTy AccessTy) {
2863 const SCEV *Copy = Expr;
2864 SCEVUse ExprUse = Expr;
2865 Immediate Offset = ExtractImmediate(
2866 ExprUse, SE, AccessTy.MemTy && AccessTy.MemTy->isScalableTy());
2867 Expr = ExprUse;
2868
2869 // Basic uses can't accept any offset, for example.
2870 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2871 Offset, /*HasBaseReg=*/ true)) {
2872 Expr = Copy;
2873 Offset = Immediate::getFixed(0);
2874 }
2875
2876 std::pair<UseMapTy::iterator, bool> P =
2877 UseMap.try_emplace(LSRUse::SCEVUseKindPair(Expr, Kind));
2878 if (!P.second) {
2879 // A use already existed with this base.
2880 size_t LUIdx = P.first->second;
2881 LSRUse &LU = Uses[LUIdx];
2882 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2883 // Reuse this use.
2884 return std::make_pair(LUIdx, Offset);
2885 }
2886
2887 // Create a new use.
2888 size_t LUIdx = Uses.size();
2889 P.first->second = LUIdx;
2890 Uses.push_back(LSRUse(Kind, AccessTy));
2891 LSRUse &LU = Uses[LUIdx];
2892
2893 LU.MinOffset = Offset;
2894 LU.MaxOffset = Offset;
2895 return std::make_pair(LUIdx, Offset);
2896}
2897
2898/// Delete the given use from the Uses list.
2899void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2900 if (&LU != &Uses.back())
2901 std::swap(LU, Uses.back());
2902 Uses.pop_back();
2903
2904 // Update RegUses.
2905 RegUses.swapAndDropUse(LUIdx, Uses.size());
2906}
2907
2908/// Look for a use distinct from OrigLU which is has a formula that has the same
2909/// registers as the given formula.
2910LSRUse *
2911LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2912 const LSRUse &OrigLU) {
2913 // Search all uses for the formula. This could be more clever.
2914 for (LSRUse &LU : Uses) {
2915 // Check whether this use is close enough to OrigLU, to see whether it's
2916 // worthwhile looking through its formulae.
2917 // Ignore ICmpZero uses because they may contain formulae generated by
2918 // GenerateICmpZeroScales, in which case adding fixup offsets may
2919 // be invalid.
2920 if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero &&
2921 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2922 LU.HasFormulaWithSameRegs(OrigF)) {
2923 // Scan through this use's formulae.
2924 for (const Formula &F : LU.Formulae) {
2925 // Check to see if this formula has the same registers and symbols
2926 // as OrigF.
2927 if (F.BaseRegs == OrigF.BaseRegs &&
2928 F.ScaledReg == OrigF.ScaledReg &&
2929 F.BaseGV == OrigF.BaseGV &&
2930 F.Scale == OrigF.Scale &&
2931 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2932 if (F.BaseOffset.isZero())
2933 return &LU;
2934 // This is the formula where all the registers and symbols matched;
2935 // there aren't going to be any others. Since we declined it, we
2936 // can skip the rest of the formulae and proceed to the next LSRUse.
2937 break;
2938 }
2939 }
2940 }
2941 }
2942
2943 // Nothing looked good.
2944 return nullptr;
2945}
2946
2947void LSRInstance::CollectInterestingTypesAndFactors() {
2948 SmallSetVector<const SCEV *, 4> Strides;
2949
2950 // Collect interesting types and strides.
2952 for (const IVStrideUse &U : IU) {
2953 const SCEV *Expr = IU.getExpr(U);
2954 if (!Expr)
2955 continue;
2956
2957 // Collect interesting types.
2958 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2959
2960 // Add strides for mentioned loops.
2961 Worklist.push_back(Expr);
2962 do {
2963 const SCEV *S = Worklist.pop_back_val();
2964 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2965 if (AR->getLoop() == L)
2966 Strides.insert(AR->getStepRecurrence(SE));
2967 Worklist.push_back(AR->getStart());
2968 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2969 append_range(Worklist, Add->operands());
2970 }
2971 } while (!Worklist.empty());
2972 }
2973
2974 // Compute interesting factors from the set of interesting strides.
2975 for (SmallSetVector<const SCEV *, 4>::const_iterator
2976 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2977 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2978 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2979 const SCEV *OldStride = *I;
2980 const SCEV *NewStride = *NewStrideIter;
2981
2982 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2983 SE.getTypeSizeInBits(NewStride->getType())) {
2984 if (SE.getTypeSizeInBits(OldStride->getType()) >
2985 SE.getTypeSizeInBits(NewStride->getType()))
2986 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2987 else
2988 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2989 }
2990 if (const SCEVConstant *Factor =
2991 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2992 SE, true))) {
2993 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2994 Factors.insert(Factor->getAPInt().getSExtValue());
2995 } else if (const SCEVConstant *Factor =
2997 NewStride,
2998 SE, true))) {
2999 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
3000 Factors.insert(Factor->getAPInt().getSExtValue());
3001 }
3002 }
3003
3004 // If all uses use the same type, don't bother looking for truncation-based
3005 // reuse.
3006 if (Types.size() == 1)
3007 Types.clear();
3008
3009 LLVM_DEBUG(print_factors_and_types(dbgs()));
3010}
3011
3012/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
3013/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
3014/// IVStrideUses, we could partially skip this.
3015static User::op_iterator
3017 Loop *L, ScalarEvolution &SE) {
3018 for(; OI != OE; ++OI) {
3019 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
3020 if (!SE.isSCEVable(Oper->getType()))
3021 continue;
3022
3023 if (const SCEVAddRecExpr *AR =
3025 if (AR->getLoop() == L)
3026 break;
3027 }
3028 }
3029 }
3030 return OI;
3031}
3032
3033/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
3034/// a convenient helper.
3036 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
3037 return Trunc->getOperand(0);
3038 return Oper;
3039}
3040
3041/// Return an approximation of this SCEV expression's "base", or NULL for any
3042/// constant. Returning the expression itself is conservative. Returning a
3043/// deeper subexpression is more precise and valid as long as it isn't less
3044/// complex than another subexpression. For expressions involving multiple
3045/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
3046/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
3047/// IVInc==b-a.
3048///
3049/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
3050/// SCEVUnknown, we simply return the rightmost SCEV operand.
3051static const SCEV *getExprBase(const SCEV *S) {
3052 switch (S->getSCEVType()) {
3053 default: // including scUnknown.
3054 return S;
3055 case scConstant:
3056 case scVScale:
3057 return nullptr;
3058 case scTruncate:
3059 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3060 case scZeroExtend:
3061 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3062 case scSignExtend:
3063 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3064 case scAddExpr: {
3065 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3066 // there's nothing more complex.
3067 // FIXME: not sure if we want to recognize negation.
3068 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3069 for (const SCEV *SubExpr : reverse(Add->operands())) {
3070 if (SubExpr->getSCEVType() == scAddExpr)
3071 return getExprBase(SubExpr);
3072
3073 if (SubExpr->getSCEVType() != scMulExpr)
3074 return SubExpr;
3075 }
3076 return S; // all operands are scaled, be conservative.
3077 }
3078 case scAddRecExpr:
3079 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3080 }
3081 llvm_unreachable("Unknown SCEV kind!");
3082}
3083
3084/// Return true if the chain increment is profitable to expand into a loop
3085/// invariant value, which may require its own register. A profitable chain
3086/// increment will be an offset relative to the same base. We allow such offsets
3087/// to potentially be used as chain increment as long as it's not obviously
3088/// expensive to expand using real instructions.
3089bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3090 const SCEV *IncExpr,
3091 ScalarEvolution &SE) {
3092 // Aggressively form chains when -stress-ivchain.
3093 if (StressIVChain)
3094 return true;
3095
3096 // Do not replace a constant offset from IV head with a nonconstant IV
3097 // increment.
3098 if (!isa<SCEVConstant>(IncExpr)) {
3099 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3100 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3101 return false;
3102 }
3103
3104 SmallPtrSet<const SCEV*, 8> Processed;
3105 return !isHighCostExpansion(IncExpr, Processed, SE);
3106}
3107
3108/// Return true if the number of registers needed for the chain is estimated to
3109/// be less than the number required for the individual IV users. First prohibit
3110/// any IV users that keep the IV live across increments (the Users set should
3111/// be empty). Next count the number and type of increments in the chain.
3112///
3113/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3114/// effectively use postinc addressing modes. Only consider it profitable it the
3115/// increments can be computed in fewer registers when chained.
3116///
3117/// TODO: Consider IVInc free if it's already used in another chains.
3118static bool isProfitableChain(IVChain &Chain,
3120 ScalarEvolution &SE,
3121 const TargetTransformInfo &TTI) {
3122 if (StressIVChain)
3123 return true;
3124
3125 if (!Chain.hasIncs())
3126 return false;
3127
3128 if (!Users.empty()) {
3129 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3130 for (Instruction *Inst
3131 : Users) { dbgs() << " " << *Inst << "\n"; });
3132 return false;
3133 }
3134 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3135
3136 // The chain itself may require a register, so initialize cost to 1.
3137 int cost = 1;
3138
3139 // A complete chain likely eliminates the need for keeping the original IV in
3140 // a register. LSR does not currently know how to form a complete chain unless
3141 // the header phi already exists.
3142 if (isa<PHINode>(Chain.tailUserInst())
3143 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3144 --cost;
3145 }
3146 const SCEV *LastIncExpr = nullptr;
3147 unsigned NumConstIncrements = 0;
3148 unsigned NumVarIncrements = 0;
3149 unsigned NumReusedIncrements = 0;
3150
3151 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3152 return true;
3153
3154 for (const IVInc &Inc : Chain) {
3155 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3156 return true;
3157 if (Inc.IncExpr->isZero())
3158 continue;
3159
3160 // Incrementing by zero or some constant is neutral. We assume constants can
3161 // be folded into an addressing mode or an add's immediate operand.
3162 if (isa<SCEVConstant>(Inc.IncExpr)) {
3163 ++NumConstIncrements;
3164 continue;
3165 }
3166
3167 if (Inc.IncExpr == LastIncExpr)
3168 ++NumReusedIncrements;
3169 else
3170 ++NumVarIncrements;
3171
3172 LastIncExpr = Inc.IncExpr;
3173 }
3174 // An IV chain with a single increment is handled by LSR's postinc
3175 // uses. However, a chain with multiple increments requires keeping the IV's
3176 // value live longer than it needs to be if chained.
3177 if (NumConstIncrements > 1)
3178 --cost;
3179
3180 // Materializing increment expressions in the preheader that didn't exist in
3181 // the original code may cost a register. For example, sign-extended array
3182 // indices can produce ridiculous increments like this:
3183 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3184 cost += NumVarIncrements;
3185
3186 // Reusing variable increments likely saves a register to hold the multiple of
3187 // the stride.
3188 cost -= NumReusedIncrements;
3189
3190 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3191 << "\n");
3192
3193 return cost < 0;
3194}
3195
3196/// Add this IV user to an existing chain or make it the head of a new chain.
3197void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3198 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3199 // When IVs are used as types of varying widths, they are generally converted
3200 // to a wider type with some uses remaining narrow under a (free) trunc.
3201 Value *const NextIV = getWideOperand(IVOper);
3202 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3203 const SCEV *const OperExprBase = getExprBase(OperExpr);
3204
3205 // Visit all existing chains. Check if its IVOper can be computed as a
3206 // profitable loop invariant increment from the last link in the Chain.
3207 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3208 const SCEV *LastIncExpr = nullptr;
3209 for (; ChainIdx < NChains; ++ChainIdx) {
3210 IVChain &Chain = IVChainVec[ChainIdx];
3211
3212 // Prune the solution space aggressively by checking that both IV operands
3213 // are expressions that operate on the same unscaled SCEVUnknown. This
3214 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3215 // first avoids creating extra SCEV expressions.
3216 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3217 continue;
3218
3219 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3220 if (PrevIV->getType() != NextIV->getType())
3221 continue;
3222
3223 // A phi node terminates a chain.
3224 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3225 continue;
3226
3227 // The increment must be loop-invariant so it can be kept in a register.
3228 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3229 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3230 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3231 continue;
3232
3233 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3234 LastIncExpr = IncExpr;
3235 break;
3236 }
3237 }
3238 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3239 // bother for phi nodes, because they must be last in the chain.
3240 if (ChainIdx == NChains) {
3241 if (isa<PHINode>(UserInst))
3242 return;
3243 if (NChains >= MaxChains && !StressIVChain) {
3244 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3245 return;
3246 }
3247 LastIncExpr = OperExpr;
3248 // IVUsers may have skipped over sign/zero extensions. We don't currently
3249 // attempt to form chains involving extensions unless they can be hoisted
3250 // into this loop's AddRec.
3251 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3252 return;
3253 ++NChains;
3254 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3255 OperExprBase));
3256 ChainUsersVec.resize(NChains);
3257 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3258 << ") IV=" << *LastIncExpr << "\n");
3259 } else {
3260 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3261 << ") IV+" << *LastIncExpr << "\n");
3262 // Add this IV user to the end of the chain.
3263 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3264 }
3265 IVChain &Chain = IVChainVec[ChainIdx];
3266
3267 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3268 // This chain's NearUsers become FarUsers.
3269 if (!LastIncExpr->isZero()) {
3270 ChainUsersVec[ChainIdx].FarUsers.insert_range(NearUsers);
3271 NearUsers.clear();
3272 }
3273
3274 // All other uses of IVOperand become near uses of the chain.
3275 // We currently ignore intermediate values within SCEV expressions, assuming
3276 // they will eventually be used be the current chain, or can be computed
3277 // from one of the chain increments. To be more precise we could
3278 // transitively follow its user and only add leaf IV users to the set.
3279 for (User *U : IVOper->users()) {
3280 Instruction *OtherUse = dyn_cast<Instruction>(U);
3281 if (!OtherUse)
3282 continue;
3283 // Uses in the chain will no longer be uses if the chain is formed.
3284 // Include the head of the chain in this iteration (not Chain.begin()).
3285 IVChain::const_iterator IncIter = Chain.Incs.begin();
3286 IVChain::const_iterator IncEnd = Chain.Incs.end();
3287 for( ; IncIter != IncEnd; ++IncIter) {
3288 if (IncIter->UserInst == OtherUse)
3289 break;
3290 }
3291 if (IncIter != IncEnd)
3292 continue;
3293
3294 if (SE.isSCEVable(OtherUse->getType())
3295 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3296 && IU.isIVUserOrOperand(OtherUse)) {
3297 continue;
3298 }
3299 NearUsers.insert(OtherUse);
3300 }
3301
3302 // Since this user is part of the chain, it's no longer considered a use
3303 // of the chain.
3304 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3305}
3306
3307/// Populate the vector of Chains.
3308///
3309/// This decreases ILP at the architecture level. Targets with ample registers,
3310/// multiple memory ports, and no register renaming probably don't want
3311/// this. However, such targets should probably disable LSR altogether.
3312///
3313/// The job of LSR is to make a reasonable choice of induction variables across
3314/// the loop. Subsequent passes can easily "unchain" computation exposing more
3315/// ILP *within the loop* if the target wants it.
3316///
3317/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3318/// will not reorder memory operations, it will recognize this as a chain, but
3319/// will generate redundant IV increments. Ideally this would be corrected later
3320/// by a smart scheduler:
3321/// = A[i]
3322/// = A[i+x]
3323/// A[i] =
3324/// A[i+x] =
3325///
3326/// TODO: Walk the entire domtree within this loop, not just the path to the
3327/// loop latch. This will discover chains on side paths, but requires
3328/// maintaining multiple copies of the Chains state.
3329void LSRInstance::CollectChains() {
3330 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3331 SmallVector<ChainUsers, 8> ChainUsersVec;
3332
3333 SmallVector<BasicBlock *,8> LatchPath;
3334 BasicBlock *LoopHeader = L->getHeader();
3335 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3336 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3337 LatchPath.push_back(Rung->getBlock());
3338 }
3339 LatchPath.push_back(LoopHeader);
3340
3341 // Walk the instruction stream from the loop header to the loop latch.
3342 for (BasicBlock *BB : reverse(LatchPath)) {
3343 for (Instruction &I : *BB) {
3344 // Skip instructions that weren't seen by IVUsers analysis.
3345 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3346 continue;
3347
3348 // Skip ephemeral values, as they don't produce real code.
3349 if (IU.isEphemeral(&I))
3350 continue;
3351
3352 // Ignore users that are part of a SCEV expression. This way we only
3353 // consider leaf IV Users. This effectively rediscovers a portion of
3354 // IVUsers analysis but in program order this time.
3355 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3356 continue;
3357
3358 // Remove this instruction from any NearUsers set it may be in.
3359 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3360 ChainIdx < NChains; ++ChainIdx) {
3361 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3362 }
3363 // Search for operands that can be chained.
3364 SmallPtrSet<Instruction*, 4> UniqueOperands;
3365 User::op_iterator IVOpEnd = I.op_end();
3366 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3367 while (IVOpIter != IVOpEnd) {
3368 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3369 if (UniqueOperands.insert(IVOpInst).second)
3370 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3371 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3372 }
3373 } // Continue walking down the instructions.
3374 } // Continue walking down the domtree.
3375 // Visit phi backedges to determine if the chain can generate the IV postinc.
3376 for (PHINode &PN : L->getHeader()->phis()) {
3377 if (!SE.isSCEVable(PN.getType()))
3378 continue;
3379
3380 Instruction *IncV =
3381 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3382 if (IncV)
3383 ChainInstruction(&PN, IncV, ChainUsersVec);
3384 }
3385 // Remove any unprofitable chains.
3386 unsigned ChainIdx = 0;
3387 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3388 UsersIdx < NChains; ++UsersIdx) {
3389 if (!isProfitableChain(IVChainVec[UsersIdx],
3390 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3391 continue;
3392 // Preserve the chain at UsesIdx.
3393 if (ChainIdx != UsersIdx)
3394 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3395 FinalizeChain(IVChainVec[ChainIdx]);
3396 ++ChainIdx;
3397 }
3398 IVChainVec.resize(ChainIdx);
3399}
3400
3401void LSRInstance::FinalizeChain(IVChain &Chain) {
3402 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3403 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3404
3405 for (const IVInc &Inc : Chain) {
3406 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3407 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3408 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3409 IVIncSet.insert(UseI);
3410 }
3411}
3412
3413/// Return true if the IVInc can be folded into an addressing mode.
3414static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3415 Value *Operand, const TargetTransformInfo &TTI) {
3416 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3417 Immediate IncOffset = Immediate::getZero();
3418 if (IncConst) {
3419 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3420 return false;
3421 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3422 } else {
3423 // Look for mul(vscale, constant), to detect a scalable offset.
3424 const APInt *C;
3425 if (!match(IncExpr, m_scev_Mul(m_scev_APInt(C), m_SCEVVScale())) ||
3426 C->getSignificantBits() > 64)
3427 return false;
3428 IncOffset = Immediate::getScalable(C->getSExtValue());
3429 }
3430
3431 if (!isAddressUse(TTI, UserInst, Operand))
3432 return false;
3433
3434 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3435 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3436 IncOffset, /*HasBaseReg=*/false))
3437 return false;
3438
3439 return true;
3440}
3441
3442/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3443/// user's operand from the previous IV user's operand.
3444void LSRInstance::GenerateIVChain(const IVChain &Chain,
3445 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3446 // Find the new IVOperand for the head of the chain. It may have been replaced
3447 // by LSR.
3448 const IVInc &Head = Chain.Incs[0];
3449 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3450 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3451 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3452 IVOpEnd, L, SE);
3453 Value *IVSrc = nullptr;
3454 while (IVOpIter != IVOpEnd) {
3455 IVSrc = getWideOperand(*IVOpIter);
3456
3457 // If this operand computes the expression that the chain needs, we may use
3458 // it. (Check this after setting IVSrc which is used below.)
3459 //
3460 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3461 // narrow for the chain, so we can no longer use it. We do allow using a
3462 // wider phi, assuming the LSR checked for free truncation. In that case we
3463 // should already have a truncate on this operand such that
3464 // getSCEV(IVSrc) == IncExpr.
3465 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3466 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3467 break;
3468 }
3469 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3470 }
3471 if (IVOpIter == IVOpEnd) {
3472 // Gracefully give up on this chain.
3473 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3474 return;
3475 }
3476 assert(IVSrc && "Failed to find IV chain source");
3477
3478 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3479 Type *IVTy = IVSrc->getType();
3480 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3481 const SCEV *LeftOverExpr = nullptr;
3482 const SCEV *Accum = SE.getZero(IntTy);
3484 Bases.emplace_back(Accum, IVSrc);
3485
3486 for (const IVInc &Inc : Chain) {
3487 Instruction *InsertPt = Inc.UserInst;
3488 if (isa<PHINode>(InsertPt))
3489 InsertPt = L->getLoopLatch()->getTerminator();
3490
3491 // IVOper will replace the current IV User's operand. IVSrc is the IV
3492 // value currently held in a register.
3493 Value *IVOper = IVSrc;
3494 if (!Inc.IncExpr->isZero()) {
3495 // IncExpr was the result of subtraction of two narrow values, so must
3496 // be signed.
3497 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3498 Accum = SE.getAddExpr(Accum, IncExpr);
3499 LeftOverExpr = LeftOverExpr ?
3500 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3501 }
3502
3503 // Look through each base to see if any can produce a nice addressing mode.
3504 bool FoundBase = false;
3505 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3506 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3507 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3508 if (!Remainder->isZero()) {
3509 Rewriter.clearPostInc();
3510 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3511 const SCEV *IVOperExpr =
3512 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3513 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3514 } else {
3515 IVOper = MapIVOper;
3516 }
3517
3518 FoundBase = true;
3519 break;
3520 }
3521 }
3522 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3523 // Expand the IV increment.
3524 Rewriter.clearPostInc();
3525 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3526 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3527 SE.getUnknown(IncV));
3528 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3529
3530 // If an IV increment can't be folded, use it as the next IV value.
3531 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3532 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3533 Bases.emplace_back(Accum, IVOper);
3534 IVSrc = IVOper;
3535 LeftOverExpr = nullptr;
3536 }
3537 }
3538 Type *OperTy = Inc.IVOperand->getType();
3539 if (IVTy != OperTy) {
3540 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3541 "cannot extend a chained IV");
3542 IRBuilder<> Builder(InsertPt);
3543 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3544 }
3545 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3546 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3547 DeadInsts.emplace_back(OperandIsInstr);
3548 }
3549 // If LSR created a new, wider phi, we may also replace its postinc. We only
3550 // do this if we also found a wide value for the head of the chain.
3551 if (isa<PHINode>(Chain.tailUserInst())) {
3552 for (PHINode &Phi : L->getHeader()->phis()) {
3553 if (Phi.getType() != IVSrc->getType())
3554 continue;
3556 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3557 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3558 continue;
3559 Value *IVOper = IVSrc;
3560 Type *PostIncTy = PostIncV->getType();
3561 if (IVTy != PostIncTy) {
3562 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3563 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3564 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3565 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3566 }
3567 Phi.replaceUsesOfWith(PostIncV, IVOper);
3568 DeadInsts.emplace_back(PostIncV);
3569 }
3570 }
3571}
3572
/// Walk every IV use in IU and record it as an LSRFixup on an LSRUse obtained
/// from getUse(), giving each LSRUse that has no formula yet an initial one.
/// Uses whose operand is in IVIncSet (profitable IV chains) are skipped.
/// Equality icmps whose other operand is loop-invariant are turned into
/// ICmpZero uses of (N - S). BaselineCost is rated once per LSRUse that has a
/// fixup not fully outside the loop.
3573void LSRInstance::CollectFixupsAndInitialFormulae() {
3574 CondBrInst *ExitBranch = nullptr;
3575 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3576
3577 // For calculating baseline cost
3578 SmallPtrSet<const SCEV *, 16> Regs;
3579 DenseSet<const SCEV *> VisitedRegs;
3580 DenseSet<size_t> VisitedLSRUse;
3581
3582 for (const IVStrideUse &U : IU) {
3583 Instruction *UserInst = U.getUser();
3584 // Skip IV users that are part of profitable IV Chains.
3585 User::op_iterator UseI =
3586 find(UserInst->operands(), U.getOperandValToReplace());
3587 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3588 if (IVIncSet.count(UseI)) {
3589 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3590 continue;
3591 }
3592
// Default to a Basic use; upgrade to Address (and record the access type)
// when the operand is used as an address by UserInst.
3593 LSRUse::KindType Kind = LSRUse::Basic;
3594 MemAccessTy AccessTy;
3595 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3596 Kind = LSRUse::Address;
3597 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3598 }
3599
3600 const SCEV *S = IU.getExpr(U);
// IU.getExpr returned null for this use; skip it.
3601 if (!S)
3602 continue;
3603 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3604
3605 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3606 // (N - i == 0), and this allows (N - i) to be the expression that we work
3607 // with rather than just N or i, so we can consider the register
3608 // requirements for both N and i at the same time. Limiting this code to
3609 // equality icmps is not a problem because all interesting loops use
3610 // equality icmps, thanks to IndVarSimplify.
3611 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3612 // If CI can be saved in some target, like replaced inside hardware loop
3613 // in PowerPC, no need to generate initial formulae for it.
3614 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3615 continue;
3616 if (CI->isEquality()) {
3617 // Swap the operands if needed to put the OperandValToReplace on the
3618 // left, for consistency.
3619 Value *NV = CI->getOperand(1);
3620 if (NV == U.getOperandValToReplace()) {
3621 CI->setOperand(1, CI->getOperand(0));
3622 CI->setOperand(0, NV);
3623 NV = CI->getOperand(1);
// The IR was modified by the operand swap.
3624 Changed = true;
3625 }
3626
3627 // x == y --> x - y == 0
3628 const SCEV *N = SE.getSCEV(NV);
3629 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3630 (!NV->getType()->isPointerTy() ||
3631 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3632 // S is normalized, so normalize N before folding it into S
3633 // to keep the result normalized.
3634 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3635 if (!N)
3636 continue;
3637 Kind = LSRUse::ICmpZero;
3638 S = SE.getMinusSCEV(N, S);
3639 } else if (L->isLoopInvariant(NV) &&
3640 (!isa<Instruction>(NV) ||
3641 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3642 !NV->getType()->isPointerTy()) {
3643 // If we can't generally expand the expression (e.g. it contains
3644 // a divide), but it is already at a loop invariant point before the
3645 // loop, wrap it in an unknown (to prevent the expander from trying
3646 // to re-expand in a potentially unsafe way.) The restriction to
3647 // integer types is required because the unknown hides the base, and
3648 // SCEV can't compute the difference of two unknown pointers.
3649 N = SE.getUnknown(NV);
3650 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3651 if (!N)
3652 continue;
3653 Kind = LSRUse::ICmpZero;
3654 S = SE.getMinusSCEV(N, S);
3656 }
3657
3658 // -1 and the negations of all interesting strides (except the negation
3659 // of -1) are now also interesting.
// The bound `e` is captured before the loop, so entries inserted by the loop
// itself are not visited again.
3660 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3661 if (Factors[i] != -1)
3662 Factors.insert(-(uint64_t)Factors[i]);
3663 Factors.insert(-1);
3664 }
3665 }
3666
3667 // Get or create an LSRUse.
// P.first is the index into Uses; P.second is the immediate stored as the
// fixup's Offset below.
3668 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3669 size_t LUIdx = P.first;
3670 Immediate Offset = P.second;
3671 LSRUse &LU = Uses[LUIdx];
3672
3673 // Record the fixup.
3674 LSRFixup &LF = LU.getNewFixup();
3675 LF.UserInst = UserInst;
3676 LF.OperandValToReplace = U.getOperandValToReplace();
3677 LF.PostIncLoops = TmpPostIncLoops;
3678 LF.Offset = Offset;
// Both flags are AND-accumulated: one fixup that fails the test clears the
// flag for the whole use.
3679 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3680 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3681
3682 // Create SCEV as Formula for calculating baseline cost
3683 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3684 Formula F;
3685 F.initialMatch(S, L, SE);
3686 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU,
3687 HardwareLoopProfitable);
3688 VisitedLSRUse.insert(LUIdx);
3689 }
3690
3691 // If this is the first use of this LSRUse, give it a formula.
3692 if (LU.Formulae.empty()) {
3693 InsertInitialFormula(S, LU, LUIdx);
3694 CountRegisters(LU.Formulae.back(), LUIdx);
3695 }
3696 }
3697
3698 LLVM_DEBUG(print_fixups(dbgs()));
3699}
3700
3701/// Insert a formula for the given expression into the given use, separating out
3702/// loop-variant portions from loop-invariant and loop-computable portions.
3703void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3704 size_t LUIdx) {
3705 // Mark uses whose expressions cannot be expanded.
3706 if (!Rewriter.isSafeToExpand(S))
3707 LU.RigidFormula = true;
3708
3709 Formula F;
3710 F.initialMatch(S, L, SE);
3711 bool Inserted = InsertFormula(LU, LUIdx, F);
3712 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3713}
3714
3715/// Insert a simple single-register formula for the given expression into the
3716/// given use.
3717void
3718LSRInstance::InsertSupplementalFormula(const SCEV *S,
3719 LSRUse &LU, size_t LUIdx) {
3720 Formula F;
3721 F.BaseRegs.push_back(S);
3722 F.HasBaseReg = true;
3723 bool Inserted = InsertFormula(LU, LUIdx, F);
3724 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3725}
3726
3727/// Note which registers are used by the given formula, updating RegUses.
3728void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3729 if (F.ScaledReg)
3730 RegUses.countRegister(F.ScaledReg, LUIdx);
3731 for (const SCEV *BaseReg : F.BaseRegs)
3732 RegUses.countRegister(BaseReg, LUIdx);
3733}
3734
3735/// If the given formula has not yet been inserted, add it to the list, and
3736/// return true. Return false otherwise.
3737bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3738 // Do not insert formula that we will not be able to expand.
3739 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3740 "Formula is illegal");
3741
3742 if (!LU.InsertFormula(F, *L))
3743 return false;
3744
3745 CountRegisters(F, LUIdx);
3746 return true;
3747}
3748
3749/// Test whether this fixup will be executed each time the corresponding IV
3750/// increment instruction is executed.
3751bool LSRInstance::IsFixupExecutedEachIncrement(const LSRFixup &LF) const {
3752 // If the fixup block dominates the IV increment block then there is no path
3753 // through the loop to the increment that doesn't pass through the fixup.
3754 return DT.dominates(LF.UserInst->getParent(), IVIncInsertPos->getParent());
3755}
3756
3757/// Check for other uses of loop-invariant values which we're tracking. These
3758/// other uses will pin these values in registers, making them less profitable
3759/// for elimination.
3760/// TODO: This currently misses non-constant addrec step registers.
3761/// TODO: Should this give more weight to users inside the loop?
3762void
3763LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Seed the worklist with every register currently tracked in RegUses.
3764 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3765 SmallPtrSet<const SCEV *, 32> Visited;
3766
3767 // Don't collect outside uses if we are favoring postinc - the instructions in
3768 // the loop are more important than the ones outside of it.
3769 if (AMK == TTI::AMK_PostIndexed)
3770 return;
3771
3772 while (!Worklist.empty()) {
3773 const SCEV *S = Worklist.pop_back_val();
3774
3775 // Don't process the same SCEV twice
3776 if (!Visited.insert(S).second)
3777 continue;
3778
// Compound expressions are decomposed into their operands; only SCEVUnknown
// leaves are examined for additional users.
3779 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3780 append_range(Worklist, N->operands());
3781 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3782 Worklist.push_back(C->getOperand());
3783 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3784 Worklist.push_back(D->getLHS());
3785 Worklist.push_back(D->getRHS());
3786 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3787 const Value *V = US->getValue();
3788 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3789 // Look for instructions defined outside the loop.
3790 if (L->contains(Inst)) continue;
3791 } else if (isa<Constant>(V))
3792 // Constants can be re-materialized.
3793 continue;
3794 for (const Use &U : V->uses()) {
3795 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3796 // Ignore non-instructions.
3797 if (!UserInst)
3798 continue;
3799 // Don't bother if the instruction is an EHPad.
3800 if (UserInst->isEHPad())
3801 continue;
3802 // Ignore instructions in other functions (as can happen with
3803 // Constants).
3804 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3805 continue;
3806 // Ignore instructions not dominated by the loop.
// NOTE(review): the argument of the getIncomingBlock( call below is not
// present in this copy of the file (the call is never closed); restore it
// from the upstream source before building.
3807 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3808 UserInst->getParent() :
3809 cast<PHINode>(UserInst)->getIncomingBlock(
3811 if (!DT.dominates(L->getHeader(), UseBB))
3812 continue;
3813 // Don't bother if the instruction is in a BB which ends in an EHPad.
3814 if (UseBB->getTerminator()->isEHPad())
3815 continue;
3816
3817 // Ignore cases in which the currently-examined value could come from
3818 // a basic block terminated with an EHPad. This checks all incoming
3819 // blocks of the phi node since it is possible that the same incoming
3820 // value comes from multiple basic blocks, only some of which may end
3821 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3822 // pass would try to insert instructions into an EHPad, hitting an
3823 // assertion.
3824 if (isa<PHINode>(UserInst)) {
3825 const auto *PhiNode = cast<PHINode>(UserInst);
3826 bool HasIncompatibleEHPTerminatedBlock = false;
3827 llvm::Value *ExpectedValue = U;
3828 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3829 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3830 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3831 HasIncompatibleEHPTerminatedBlock = true;
3832 break;
3833 }
3834 }
3835 }
3836 if (HasIncompatibleEHPTerminatedBlock) {
3837 continue;
3838 }
3839 }
3840
3841 // Don't bother rewriting PHIs in catchswitch blocks.
3842 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3843 continue;
3844 // Ignore uses which are part of other SCEV expressions, to avoid
3845 // analyzing them multiple times.
3846 if (SE.isSCEVable(UserInst->getType())) {
3847 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3848 // If the user is a no-op, look through to its uses.
3849 if (!isa<SCEVUnknown>(UserS))
3850 continue;
3851 if (UserS == US) {
3852 Worklist.push_back(
3853 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3854 continue;
3855 }
3856 }
3857 // Ignore icmp instructions which are already being analyzed.
3858 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
// An icmp has two operands, so negating this use's operand number selects
// the other one.
3859 unsigned OtherIdx = !U.getOperandNo();
3860 Value *OtherOp = ICI->getOperand(OtherIdx);
3861 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3862 continue;
3863 }
3864
3865 // Do not consider uses inside lifetime intrinsics. These are not
3866 // actually materialized.
3867 if (UserInst->isLifetimeStartOrEnd())
3868 continue;
3869
// Record this user as a Basic-kind fixup of S and add a single-register
// supplemental formula for it.
3870 std::pair<size_t, Immediate> P =
3871 getUse(S, LSRUse::Basic, MemAccessTy());
3872 size_t LUIdx = P.first;
3873 Immediate Offset = P.second;
3874 LSRUse &LU = Uses[LUIdx];
3875 LSRFixup &LF = LU.getNewFixup();
3876 LF.UserInst = const_cast<Instruction *>(UserInst);
3877 LF.OperandValToReplace = U;
3878 LF.Offset = Offset;
3879 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3880 LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
3881 InsertSupplementalFormula(US, LU, LUIdx);
// NOTE(review): this passes Uses.size() - 1 rather than LUIdx; the two differ
// if getUse() returned an already existing use. InsertSupplementalFormula also
// reaches CountRegisters through InsertFormula, so this call may count the
// same registers again - confirm both are intended.
3882 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
// Only the first qualifying user of V is recorded; stop scanning its uses.
3883 break;
3884 }
3885 }
3886 }
3887}
3888
3889/// Split S into subexpressions which can be pulled out into separate
3890/// registers. If C is non-null, multiply each subexpression by C.
3891///
3892/// Return remainder expression after factoring the subexpressions captured by
3893/// Ops. If Ops is complete, return NULL.
///
/// Recursion is limited by Depth; once the limit is reached S is returned
/// unsplit.
// NOTE(review): `Ops` is used throughout the body but does not appear in the
// visible parameter list; a parameter line seems to be missing from this copy
// of the file - confirm against the upstream source.
3894static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3896 const Loop *L,
3897 ScalarEvolution &SE,
3898 unsigned Depth = 0) {
3899 // Arbitrarily cap recursion to protect compile time.
3900 if (Depth >= 3)
3901 return S;
3902
3903 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3904 // Break out add operands.
3905 for (const SCEV *S : Add->operands()) {
3906 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3907 if (Remainder)
3908 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3909 }
// Every operand was either split further or pushed onto Ops, so nothing is
// left over.
3910 return nullptr;
3911 }
3912 const SCEV *Start, *Step;
3913 const SCEVConstant *Op0;
3914 const SCEV *Op1;
3915 if (match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEV(Step)))) {
3916 // Split a non-zero base out of an addrec.
3917 if (Start->isZero())
3918 return S;
3919
3920 const SCEV *Remainder = CollectSubexprs(Start, C, Ops, L, SE, Depth + 1);
3921 // Split the non-zero AddRec unless it is part of a nested recurrence that
3922 // does not pertain to this loop.
3923 if (Remainder && (cast<SCEVAddRecExpr>(S)->getLoop() == L ||
3924 !isa<SCEVAddRecExpr>(Remainder))) {
3925 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3926 Remainder = nullptr;
3927 }
// If the start was changed (fully or partly split off), rebuild the addrec
// with the leftover start, using zero when nothing is left.
3928 if (Remainder != Start) {
3929 if (!Remainder)
3930 Remainder = SE.getConstant(S->getType(), 0);
// NOTE(review): the final argument and closing parenthesis of this
// getAddRecExpr call are not present in this copy of the file; restore them
// from the upstream source.
3931 return SE.getAddRecExpr(Remainder, Step,
3932 cast<SCEVAddRecExpr>(S)->getLoop(),
3933 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3935 }
3936 } else if (match(S, m_scev_Mul(m_SCEVConstant(Op0), m_SCEV(Op1)))) {
3937 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3938 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3939 const SCEV *Remainder = CollectSubexprs(Op1, C, Ops, L, SE, Depth + 1);
3940 if (Remainder)
3941 Ops.push_back(SE.getMulExpr(C, Remainder));
3942 return nullptr;
3943 }
// Nothing could be split; the whole expression is the remainder.
3944 return S;
3945}
3946
3947/// Return true if the SCEV represents a value that may end up as a
3948/// post-increment operation.
///
/// This holds only when LU is an Address use with an integer or integer-vector
/// access type, S is an affine addrec with a constant step, the target reports
/// a legal post-incremented indexed load or store for S's type, and the
/// addrec's start is loop-invariant in L but not a constant.
// NOTE(review): the first line of this function's signature (return type, name
// and the leading parameter) is not present in this copy of the file. The body
// uses `TTI`, and the visible call site in GenerateReassociationsImpl is
// mayUsePostIncMode(TTI, LU, BaseReg, L, SE) - restore the line from the
// upstream source.
3950 LSRUse &LU, const SCEV *S, const Loop *L,
3951 ScalarEvolution &SE) {
3952 if (LU.Kind != LSRUse::Address ||
3953 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3954 return false;
3955 const SCEV *Start;
3956 if (!match(S, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant())))
3957 return false;
3958 // Check if a post-indexed load/store can be used.
3959 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, S->getType()) ||
3960 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, S->getType())) {
3961 if (!isa<SCEVConstant>(Start) && SE.isLoopInvariant(Start, L))
3962 return true;
3963 }
3964 return false;
3965}
3966
3967/// Helper function for LSRInstance::GenerateReassociations.
///
/// Operates on one register of Base: the scaled register when IsScaledReg is
/// set, otherwise BaseRegs[Idx]. The register is split into addends with
/// CollectSubexprs; for each addend J a new formula is built in which J becomes
/// its own base register (or is folded into UnfoldedOffset) while the sum of
/// the remaining addends stays in the original slot. Each newly inserted
/// formula is fed back into GenerateReassociations with an increased Depth.
3968void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3969 const Formula &Base,
3970 unsigned Depth, size_t Idx,
3971 bool IsScaledReg) {
3972 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3973 // Don't generate reassociations for the base register of a value that
3974 // may generate a post-increment operator. The reason is that the
3975 // reassociations cause extra base+register formula to be created,
3976 // and possibly chosen, but the post-increment is more efficient.
3977 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3978 return;
// NOTE(review): `AddOps` is used below but its declaration is not present in
// this copy of the file; restore it from the upstream source.
3980 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3981 if (Remainder)
3982 AddOps.push_back(Remainder);
3983
// A single addend means there is nothing to reassociate.
3984 if (AddOps.size() == 1)
3985 return;
3986
// NOTE(review): the opening of the `for` statement that declares the iterator
// `J` over AddOps is not present in this copy of the file; only its tail is
// visible below.
3988 JE = AddOps.end();
3989 J != JE; ++J) {
3990 // Loop-variant "unknown" values are uninteresting; we won't be able to
3991 // do anything meaningful with them.
3992 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3993 continue;
3994
3995 // Don't pull a constant into a register if the constant could be folded
3996 // into an immediate field.
3997 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3998 LU.AccessTy, *J, Base.getNumRegs() > 1))
3999 continue;
4000
4001 // Collect all operands except *J.
4002 SmallVector<SCEVUse, 8> InnerAddOps(std::as_const(AddOps).begin(), J);
4003 InnerAddOps.append(std::next(J), std::as_const(AddOps).end());
4004
4005 // Don't leave just a constant behind in a register if the constant could
4006 // be folded into an immediate field.
4007 if (InnerAddOps.size() == 1 &&
4008 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
4009 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
4010 continue;
4011
4012 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
4013 if (InnerSum->isZero())
4014 continue;
4015 Formula F = Base;
4016
// The folding below reads UnfoldedOffset with getFixedValue(), so formulae
// carrying a non-zero scalable unfolded offset are skipped.
4017 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
4018 continue;
4019
4020 // Add the remaining pieces of the add back into the new formula.
4021 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
4022 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
4023 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4024 InnerSumSC->getValue()->getZExtValue())) {
4025 F.UnfoldedOffset =
4026 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4027 InnerSumSC->getValue()->getZExtValue());
// The remaining sum was absorbed into UnfoldedOffset, so the register slot it
// occupied is dropped.
4028 if (IsScaledReg) {
4029 F.ScaledReg = nullptr;
4030 F.Scale = 0;
4031 } else
4032 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
4033 } else if (IsScaledReg)
4034 F.ScaledReg = InnerSum;
4035 else
4036 F.BaseRegs[Idx] = InnerSum;
4037
4038 // Add J as its own register, or an unfolded immediate.
4039 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
4040 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
4041 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
4042 SC->getValue()->getZExtValue()))
4043 F.UnfoldedOffset =
4044 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
4045 SC->getValue()->getZExtValue());
4046 else
4047 F.BaseRegs.push_back(*J);
4048 // We may have changed the number of register in base regs, adjust the
4049 // formula accordingly.
4050 F.canonicalize(*L);
4051
4052 if (InsertFormula(LU, LUIdx, F))
4053 // If that formula hadn't been seen before, recurse to find more like
4054 // it.
4055 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
4056 // Because just Depth is not enough to bound compile time.
4057 // This means that every time AddOps.size() is greater 16^x we will add
4058 // x to Depth.
4059 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4060 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4061 }
4062}
4063
4064/// Split out subexpressions from adds and the bases of addrecs.
4065void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4066 Formula Base, unsigned Depth) {
4067 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4068 // Arbitrarily cap recursion to protect compile time.
4069 if (Depth >= 3)
4070 return;
4071
4072 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4073 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4074
4075 if (Base.Scale == 1)
4076 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4077 /* Idx */ -1, /* IsScaledReg */ true);
4078}
4079
4080/// Generate a formula consisting of all of the loop-dominating registers added
4081/// into a single register.
///
/// Base registers that properly dominate the loop header and have no
/// computable evolution in L are collected and summed; the other base
/// registers are kept unchanged in the new formula. A second variant also
/// folds a non-zero fixed UnfoldedOffset into the combined register.
4082void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4083 Formula Base) {
4084 // This method is only interesting on a plurality of registers.
4085 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4086 (Base.UnfoldedOffset.isNonZero()) <=
4087 1)
4088 return;
4089
4090 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4091 // processing the formula.
4092 Base.unscale();
// NOTE(review): `Ops` is used below but its declaration is not present in this
// copy of the file; restore it from the upstream source.
4094 Formula NewBase = Base;
4095 NewBase.BaseRegs.clear();
4096 Type *CombinedIntegerType = nullptr;
4097 for (const SCEV *BaseReg : Base.BaseRegs) {
4098 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4099 !SE.hasComputableLoopEvolution(BaseReg, L)) {
// The first combinable register determines the integer type used for the
// unfolded-offset constant further down.
4100 if (!CombinedIntegerType)
4101 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4102 Ops.push_back(BaseReg);
4103 }
4104 else
4105 NewBase.BaseRegs.push_back(BaseReg);
4106 }
4107
4108 // If no register is relevant, we're done.
4109 if (Ops.size() == 0)
4110 return;
4111
4112 // Utility function for generating the required variants of the combined
4113 // registers.
// NewBase is captured by reference, so each call sees its value at the time of
// the call (including the cleared UnfoldedOffset in the second variant).
4114 auto GenerateFormula = [&](const SCEV *Sum) {
4115 Formula F = NewBase;
4116
4117 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4118 // opportunity to fold something. For now, just ignore such cases
4119 // rather than proceed with zero in a register.
4120 if (Sum->isZero())
4121 return;
4122
4123 F.BaseRegs.push_back(Sum);
4124 F.canonicalize(*L);
4125 (void)InsertFormula(LU, LUIdx, F);
4126 };
4127
4128 // If we collected at least two registers, generate a formula combining them.
4129 if (Ops.size() > 1) {
4130 SmallVector<SCEVUse, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4131 GenerateFormula(SE.getAddExpr(OpsCopy));
4132 }
4133
4134 // If we have an unfolded offset, generate a formula combining it with the
4135 // registers collected.
4136 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4137 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4138 Ops.push_back(SE.getConstant(CombinedIntegerType,
4139 NewBase.UnfoldedOffset.getFixedValue(), true));
// The offset now lives in the combined register, so clear it in the formula.
4140 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4141 GenerateFormula(SE.getAddExpr(Ops));
4142 }
4143}
4144
4145/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4146void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4147 const Formula &Base, size_t Idx,
4148 bool IsScaledReg) {
4149 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4150 GlobalValue *GV = ExtractSymbol(G, SE);
4151 if (G->isZero() || !GV)
4152 return;
4153 Formula F = Base;
4154 F.BaseGV = GV;
4155 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4156 return;
4157 if (IsScaledReg)
4158 F.ScaledReg = G;
4159 else
4160 F.BaseRegs[Idx] = G;
4161 (void)InsertFormula(LU, LUIdx, F);
4162}
4163
4164/// Generate reuse formulae using symbolic offsets.
4165void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4166 Formula Base) {
4167 // We can't add a symbolic offset if the address already contains one.
4168 if (Base.BaseGV) return;
4169
4170 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4171 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4172 if (Base.Scale == 1)
4173 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4174 /* IsScaledReg */ true);
4175}
4176
4177/// Helper function for LSRInstance::GenerateConstantOffsets.
4178void LSRInstance::GenerateConstantOffsetsImpl(
4179 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4180 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4181
4182 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4183 Formula F = Base;
4184 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4185 return;
4186 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4187
4188 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4189 // Add the offset to the base register.
4190 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4191 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4192 // If it cancelled out, drop the base register, otherwise update it.
4193 if (NewG->isZero()) {
4194 if (IsScaledReg) {
4195 F.Scale = 0;
4196 F.ScaledReg = nullptr;
4197 } else
4198 F.deleteBaseReg(F.BaseRegs[Idx]);
4199 F.canonicalize(*L);
4200 } else if (IsScaledReg)
4201 F.ScaledReg = NewG;
4202 else
4203 F.BaseRegs[Idx] = NewG;
4204
4205 (void)InsertFormula(LU, LUIdx, F);
4206 }
4207 };
4208
4209 SCEVUse G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4210
4211 // With constant offsets and constant steps, we can generate pre-inc
4212 // accesses by having the offset equal the step. So, for access #0 with a
4213 // step of 8, we generate a G - 8 base which would require the first access
4214 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4215 // for itself and hopefully becomes the base for other accesses. This means
4216 // means that a single pre-indexed access can be generated to become the new
4217 // base pointer for each iteration of the loop, resulting in no extra add/sub
4218 // instructions for pointer updating.
4219 if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
4220 const APInt *StepInt;
4221 if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
4222 int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
4223 : StepInt->getZExtValue();
4224
4225 for (Immediate Offset : Worklist) {
4226 if (Offset.isFixed()) {
4227 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4228 GenerateOffset(G, Offset);
4229 }
4230 }
4231 }
4232 }
4233 for (Immediate Offset : Worklist)
4234 GenerateOffset(G, Offset);
4235
4236 // TODO: It likely makes sense to extract the immediate corresponding to the
4237 // access type (i.e., set PreferScalable to AccessTy.MemTy &&
4238 // AccessTy.MemTy->isScalableTy()).
4239 Immediate Imm = ExtractImmediate(G, SE, /*PreferScalable=*/false);
4240 if (G->isZero() || Imm.isZero() ||
4241 !Base.BaseOffset.isCompatibleImmediate(Imm))
4242 return;
4243 Formula F = Base;
4244 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4245 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4246 return;
4247 if (IsScaledReg) {
4248 F.ScaledReg = G;
4249 } else {
4250 F.BaseRegs[Idx] = G;
4251 // We may generate non canonical Formula if G is a recurrent expr reg
4252 // related with current loop while F.ScaledReg is not.
4253 F.canonicalize(*L);
4254 }
4255 (void)InsertFormula(LU, LUIdx, F);
4256}
4257
4258/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
///
/// The candidate offsets are LU.MinOffset and, when different, LU.MaxOffset.
/// Each base register of Base is tried, followed by the scaled register when
/// its scale is exactly 1.
4259void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4260 Formula Base) {
4261 // TODO: For now, just add the min and max offset, because it usually isn't
4262 // worthwhile looking at everything inbetween.
// NOTE(review): `Worklist` is used below but its declaration is not present in
// this copy of the file; restore it from the upstream source.
4264 Worklist.push_back(LU.MinOffset);
4265 if (LU.MaxOffset != LU.MinOffset)
4266 Worklist.push_back(LU.MaxOffset);
4267
4268 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4269 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4270 if (Base.Scale == 1)
4271 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4272 /* IsScaledReg */ true);
4273}
4274
4275/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4276/// == y -> x*c == y*c.
///
/// For every entry of Factors, the offsets and registers of Base are multiplied
/// by that factor; a factor is abandoned as soon as one of the multiplications
/// cannot be shown to round-trip, or the scaled formula is not a legal use.
4277void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4278 Formula Base) {
4279 if (LU.Kind != LSRUse::ICmpZero) return;
4280
4281 // Determine the integer type for the base formula.
4282 Type *IntTy = Base.getType();
4283 if (!IntTy) return;
4284 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4285
4286 // Don't do this if there is more than one offset.
4287 if (LU.MinOffset != LU.MaxOffset) return;
4288
4289 // Check if transformation is valid. It is illegal to multiply pointer.
4290 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4291 return;
4292 for (const SCEV *BaseReg : Base.BaseRegs)
4293 if (BaseReg->getType()->isPointerTy())
4294 return;
4295 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4296
4297 // Check each interesting stride.
4298 for (int64_t Factor : Factors) {
4299 // Check that Factor can be represented by IntTy
4300 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4301 continue;
4302 // Check that the multiplication doesn't overflow.
4303 if (Base.BaseOffset.isMin() && Factor == -1)
4304 continue;
4305 // Not supporting scalable immediates.
4306 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4307 continue;
4308 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4309 assert(Factor != 0 && "Zero factor not expected!");
// Dividing the product by Factor must give back the original offset;
// otherwise the multiplication wrapped.
4310 if (NewBaseOffset.getFixedValue() / Factor !=
4311 Base.BaseOffset.getFixedValue())
4312 continue;
4313 // If the offset will be truncated at this use, check that it is in bounds.
4314 if (!IntTy->isPointerTy() &&
4315 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4316 continue;
4317
4318 // Check that multiplying with the use offset doesn't overflow.
4319 Immediate Offset = LU.MinOffset;
4320 if (Offset.isMin() && Factor == -1)
4321 continue;
4322 Offset = Offset.mulUnsigned(Factor);
4323 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4324 continue;
4325 // If the offset will be truncated at this use, check that it is in bounds.
4326 if (!IntTy->isPointerTy() &&
4327 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4328 continue;
4329
4330 Formula F = Base;
4331 F.BaseOffset = NewBaseOffset;
4332
4333 // Check that this scale is legal.
4334 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4335 continue;
4336
4337 // Compensate for the use having MinOffset built into it.
4338 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4339
4340 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4341
4342 // Check that multiplying with each base register doesn't overflow.
4343 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4344 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4345 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
// A plain `continue` here would only advance this inner loop; the `next`
// label sits at the end of the outer loop body, so this abandons the factor.
4346 goto next;
4347 }
4348
4349 // Check that multiplying with the scaled register doesn't overflow.
4350 if (F.ScaledReg) {
4351 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4352 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4353 continue;
4354 }
4355
4356 // Check that multiplying with the unfolded offset doesn't overflow.
4357 if (F.UnfoldedOffset.isNonZero()) {
4358 if (F.UnfoldedOffset.isMin() && Factor == -1)
4359 continue;
4360 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4361 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4362 Base.UnfoldedOffset.getFixedValue())
4363 continue;
4364 // If the offset will be truncated, check that it is in bounds.
// NOTE(review): the beginning of the `if (` condition that the next line
// completes is not present in this copy of the file; restore it from the
// upstream source.
4366 IntTy, F.UnfoldedOffset.getFixedValue()))
4367 continue;
4368 }
4369
4370 // If we make it here and it's legal, add it.
4371 (void)InsertFormula(LU, LUIdx, F);
4372 next:;
4373 }
4374}
4375
4376/// Generate stride factor reuse formulae by making use of scaled-offset address
4377/// modes, for example.
4378void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4379 // Determine the integer type for the base formula.
4380 Type *IntTy = Base.getType();
4381 if (!IntTy) return;
4382
4383 // If this Formula already has a scaled register, we can't add another one.
4384 // Try to unscale the formula to generate a better scale.
4385 if (Base.Scale != 0 && !Base.unscale())
4386 return;
4387
4388 assert(Base.Scale == 0 && "unscale did not did its job!");
4389
4390 // Check each interesting stride.
4391 for (int64_t Factor : Factors) {
4392 Base.Scale = Factor;
4393 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4394 // Check whether this scale is going to be legal.
4395 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4396 Base)) {
4397 // As a special-case, handle special out-of-loop Basic users specially.
4398 // TODO: Reconsider this special case.
4399 if (LU.Kind == LSRUse::Basic &&
4400 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4401 LU.AccessTy, Base) &&
4402 LU.AllFixupsOutsideLoop)
4403 LU.Kind = LSRUse::Special;
4404 else
4405 continue;
4406 }
4407 // For an ICmpZero, negating a solitary base register won't lead to
4408 // new solutions.
4409 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4410 Base.BaseOffset.isZero() && !Base.BaseGV)
4411 continue;
4412 // For each addrec base reg, if its loop is current loop, apply the scale.
4413 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4414 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4415 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4416 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4417 if (FactorS->isZero())
4418 continue;
4419 // Divide out the factor, ignoring high bits, since we'll be
4420 // scaling the value back up in the end.
4421 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4422 if (!Quotient->isZero()) {
4423 // TODO: This could be optimized to avoid all the copying.
4424 Formula F = Base;
4425 F.ScaledReg = Quotient;
4426 F.deleteBaseReg(F.BaseRegs[i]);
4427 // The canonical representation of 1*reg is reg, which is already in
4428 // Base. In that case, do not try to insert the formula, it will be
4429 // rejected anyway.
4430 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4431 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4432 continue;
4433 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4434 // non canonical Formula with ScaledReg's loop not being L.
4435 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4436 F.canonicalize(*L);
4437 (void)InsertFormula(LU, LUIdx, F);
4438 }
4439 }
4440 }
4441 }
4442}
4443
4444/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4445/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4446/// perform the extension/truncate and normalize again, as the normalized form
4447/// can result in folds that are not valid in the post-inc use contexts. The
4448/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4449static const SCEV *
4451 const SCEV *Expr, Type *ToTy,
4452 ScalarEvolution &SE) {
4453 const SCEV *Result = nullptr;
4454 for (auto &L : Loops) {
4455 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4456 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4457 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4458 if (!New || (Result && New != Result))
4459 return nullptr;
4460 Result = New;
4461 }
4462
4463 assert(Result && "failed to create expression");
4464 return Result;
4465}
4466
4467/// Generate reuse formulae from different IV types.
4468void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4469 // Don't bother truncating symbolic values.
4470 if (Base.BaseGV) return;
4471
4472 // Determine the integer type for the base formula.
4473 Type *DstTy = Base.getType();
4474 if (!DstTy) return;
4475 if (DstTy->isPointerTy())
4476 return;
4477
4478 // It is invalid to extend a pointer type so exit early if ScaledReg or
4479 // any of the BaseRegs are pointers.
4480 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4481 return;
4482 if (any_of(Base.BaseRegs,
4483 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4484 return;
4485
4487 for (auto &LF : LU.Fixups)
4488 Loops.push_back(LF.PostIncLoops);
4489
4490 for (Type *SrcTy : Types) {
4491 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4492 Formula F = Base;
4493
4494 // Sometimes SCEV is able to prove zero during ext transform. It may
4495 // happen if SCEV did not do all possible transforms while creating the
4496 // initial node (maybe due to depth limitations), but it can do them while
4497 // taking ext.
4498 if (F.ScaledReg) {
4499 const SCEV *NewScaledReg =
4500 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4501 if (!NewScaledReg || NewScaledReg->isZero())
4502 continue;
4503 F.ScaledReg = NewScaledReg;
4504 }
4505 bool HasZeroBaseReg = false;
4506 for (const SCEV *&BaseReg : F.BaseRegs) {
4507 const SCEV *NewBaseReg =
4508 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4509 if (!NewBaseReg || NewBaseReg->isZero()) {
4510 HasZeroBaseReg = true;
4511 break;
4512 }
4513 BaseReg = NewBaseReg;
4514 }
4515 if (HasZeroBaseReg)
4516 continue;
4517
4518 // TODO: This assumes we've done basic processing on all uses and
4519 // have an idea what the register usage is.
4520 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4521 continue;
4522
4523 F.canonicalize(*L);
4524 (void)InsertFormula(LU, LUIdx, F);
4525 }
4526 }
4527}
4528
4529namespace {
4530
4531/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4532/// modifications so that the search phase doesn't have to worry about the data
4533/// structures moving underneath it.
4534struct WorkItem {
4535 size_t LUIdx;
4536 Immediate Imm;
4537 const SCEV *OrigReg;
4538
4539 WorkItem(size_t LI, Immediate I, const SCEV *R)
4540 : LUIdx(LI), Imm(I), OrigReg(R) {}
4541
4542 void print(raw_ostream &OS) const;
4543 void dump() const;
4544};
4545
4546} // end anonymous namespace
4547
4548#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4549void WorkItem::print(raw_ostream &OS) const {
4550 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4551 << " , add offset " << Imm;
4552}
4553
4554LLVM_DUMP_METHOD void WorkItem::dump() const {
4555 print(errs()); errs() << '\n';
4556}
4557#endif
4558
4559/// Look for registers which are a constant distance apart and try to form reuse
4560/// opportunities between them.
4561void LSRInstance::GenerateCrossUseConstantOffsets() {
4562 // Group the registers by their value without any added constant offset.
4563 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4564
4565 DenseMap<const SCEV *, ImmMapTy> Map;
4566 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4568 for (const SCEV *Use : RegUses) {
4569 SCEVUse Reg = Use; // Make a copy for ExtractImmediate to modify.
4570 // TODO: Extract both scalable and fixed immediates (if present)?
4571 Immediate Imm = ExtractImmediate(Reg, SE);
4572 auto Pair = Map.try_emplace(Reg);
4573 if (Pair.second)
4574 Sequence.push_back(Reg);
4575 Pair.first->second.insert(std::make_pair(Imm, Use));
4576 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4577 }
4578
4579 // Now examine each set of registers with the same base value. Build up
4580 // a list of work to do and do the work in a separate step so that we're
4581 // not adding formulae and register counts while we're searching.
4582 SmallVector<WorkItem, 32> WorkItems;
4583 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4584 UniqueItems;
4585 for (const SCEV *Reg : Sequence) {
4586 const ImmMapTy &Imms = Map.find(Reg)->second;
4587
4588 // It's not worthwhile looking for reuse if there's only one offset.
4589 if (Imms.size() == 1)
4590 continue;
4591
4592 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4593 for (const auto &Entry
4594 : Imms) dbgs()
4595 << ' ' << Entry.first;
4596 dbgs() << '\n');
4597
4598 // Examine each offset.
4599 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4600 J != JE; ++J) {
4601 const SCEV *OrigReg = J->second;
4602
4603 Immediate JImm = J->first;
4604 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4605
4606 if (!isa<SCEVConstant>(OrigReg) &&
4607 UsedByIndicesMap[Reg].count() == 1) {
4608 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4609 << '\n');
4610 continue;
4611 }
4612
4613 // Conservatively examine offsets between this orig reg a few selected
4614 // other orig regs.
4615 Immediate First = Imms.begin()->first;
4616 Immediate Last = std::prev(Imms.end())->first;
4617 if (!First.isCompatibleImmediate(Last)) {
4618 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4619 << "\n");
4620 continue;
4621 }
4622 // Only scalable if both terms are scalable, or if one is scalable and
4623 // the other is 0.
4624 bool Scalable = First.isScalable() || Last.isScalable();
4625 int64_t FI = First.getKnownMinValue();
4626 int64_t LI = Last.getKnownMinValue();
4627 // Compute (First + Last) / 2 without overflow using the fact that
4628 // First + Last = 2 * (First + Last) + (First ^ Last).
4629 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4630 // If the result is negative and FI is odd and LI even (or vice versa),
4631 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4632 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4633 ImmMapTy::const_iterator OtherImms[] = {
4634 Imms.begin(), std::prev(Imms.end()),
4635 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4636 for (const auto &M : OtherImms) {
4637 if (M == J || M == JE) continue;
4638 if (!JImm.isCompatibleImmediate(M->first))
4639 continue;
4640
4641 // Compute the difference between the two.
4642 Immediate Imm = JImm.subUnsigned(M->first);
4643 for (unsigned LUIdx : UsedByIndices.set_bits())
4644 // Make a memo of this use, offset, and register tuple.
4645 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4646 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4647 }
4648 }
4649 }
4650
4651 Map.clear();
4652 Sequence.clear();
4653 UsedByIndicesMap.clear();
4654 UniqueItems.clear();
4655
4656 // Now iterate through the worklist and add new formulae.
4657 for (const WorkItem &WI : WorkItems) {
4658 size_t LUIdx = WI.LUIdx;
4659 LSRUse &LU = Uses[LUIdx];
4660 Immediate Imm = WI.Imm;
4661 const SCEV *OrigReg = WI.OrigReg;
4662
4663 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4664 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4665 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4666
4667 // TODO: Use a more targeted data structure.
4668 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4669 Formula F = LU.Formulae[L];
4670 // FIXME: The code for the scaled and unscaled registers looks
4671 // very similar but slightly different. Investigate if they
4672 // could be merged. That way, we would not have to unscale the
4673 // Formula.
4674 F.unscale();
4675 // Use the immediate in the scaled register.
4676 if (F.ScaledReg == OrigReg) {
4677 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4678 continue;
4679 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4680 // Don't create 50 + reg(-50).
4681 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4682 if (F.referencesReg(S))
4683 continue;
4684 Formula NewF = F;
4685 NewF.BaseOffset = Offset;
4686 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4687 NewF))
4688 continue;
4689 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4690
4691 // If the new scale is a constant in a register, and adding the constant
4692 // value to the immediate would produce a value closer to zero than the
4693 // immediate itself, then the formula isn't worthwhile.
4694 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4695 // FIXME: Do we need to do something for scalable immediates here?
4696 // A scalable SCEV won't be constant, but we might still have
4697 // something in the offset? Bail out for now to be safe.
4698 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4699 continue;
4700 if (C->getValue()->isNegative() !=
4701 (NewF.BaseOffset.isLessThanZero()) &&
4702 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4703 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4704 continue;
4705 }
4706
4707 // OK, looks good.
4708 NewF.canonicalize(*this->L);
4709 (void)InsertFormula(LU, LUIdx, NewF);
4710 } else {
4711 // Use the immediate in a base register.
4712 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4713 const SCEV *BaseReg = F.BaseRegs[N];
4714 if (BaseReg != OrigReg)
4715 continue;
4716 Formula NewF = F;
4717 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4718 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4719 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4720 continue;
4721 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4722 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4723 LU.Kind, LU.AccessTy, NewF)) {
4724 if (AMK == TTI::AMK_PostIndexed &&
4725 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4726 continue;
4727 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4728 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4729 continue;
4730 NewF = F;
4731 NewF.UnfoldedOffset = NewUnfoldedOffset;
4732 }
4733 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4734
4735 // If the new formula has a constant in a register, and adding the
4736 // constant value to the immediate would produce a value closer to
4737 // zero than the immediate itself, then the formula isn't worthwhile.
4738 for (const SCEV *NewReg : NewF.BaseRegs)
4739 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4740 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4741 goto skip_formula;
4742 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4743 .abs()
4744 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4745 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4746 .countr_zero() >=
4748 NewF.BaseOffset.getFixedValue()))
4749 goto skip_formula;
4750 }
4751
4752 // Ok, looks good.
4753 NewF.canonicalize(*this->L);
4754 (void)InsertFormula(LU, LUIdx, NewF);
4755 break;
4756 skip_formula:;
4757 }
4758 }
4759 }
4760 }
4761}
4762
4763/// Generate formulae for each use.
4764void
4765LSRInstance::GenerateAllReuseFormulae() {
4766 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4767 // queries are more precise.
4768 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4769 LSRUse &LU = Uses[LUIdx];
4770 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4771 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4772 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4773 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4774 }
4775 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4776 LSRUse &LU = Uses[LUIdx];
4777 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4778 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4779 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4780 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4781 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4782 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4783 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4784 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4785 }
4786 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4787 LSRUse &LU = Uses[LUIdx];
4788 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4789 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4790 }
4791
4792 GenerateCrossUseConstantOffsets();
4793
4794 LLVM_DEBUG(dbgs() << "\n"
4795 "After generating reuse formulae:\n";
4796 print_uses(dbgs()));
4797}
4798
4799/// If there are multiple formulae with the same set of registers used
4800/// by other uses, pick the best one and delete the others.
4801void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4802 DenseSet<const SCEV *> VisitedRegs;
4803 SmallPtrSet<const SCEV *, 16> Regs;
4804 SmallPtrSet<const SCEV *, 16> LoserRegs;
4805#ifndef NDEBUG
4806 bool ChangedFormulae = false;
4807#endif
4808
4809 // Collect the best formula for each unique set of shared registers. This
4810 // is reset for each use.
4811 using BestFormulaeTy = DenseMap<SmallVector<const SCEV *, 4>, size_t>;
4812
4813 BestFormulaeTy BestFormulae;
4814
4815 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4816 LSRUse &LU = Uses[LUIdx];
4817 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4818 dbgs() << '\n');
4819
4820 bool Any = false;
4821 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4822 FIdx != NumForms; ++FIdx) {
4823 Formula &F = LU.Formulae[FIdx];
4824
4825 // Some formulas are instant losers. For example, they may depend on
4826 // nonexistent AddRecs from other loops. These need to be filtered
4827 // immediately, otherwise heuristics could choose them over others leading
4828 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4829 // avoids the need to recompute this information across formulae using the
4830 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4831 // the corresponding bad register from the Regs set.
4832 Cost CostF(L, SE, TTI, AMK);
4833 Regs.clear();
4834 CostF.RateFormula(F, Regs, VisitedRegs, LU, HardwareLoopProfitable,
4835 &LoserRegs);
4836 if (CostF.isLoser()) {
4837 // During initial formula generation, undesirable formulae are generated
4838 // by uses within other loops that have some non-trivial address mode or
4839 // use the postinc form of the IV. LSR needs to provide these formulae
4840 // as the basis of rediscovering the desired formula that uses an AddRec
4841 // corresponding to the existing phi. Once all formulae have been
4842 // generated, these initial losers may be pruned.
4843 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4844 dbgs() << "\n");
4845 }
4846 else {
4848 for (const SCEV *Reg : F.BaseRegs) {
4849 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4850 Key.push_back(Reg);
4851 }
4852 if (F.ScaledReg &&
4853 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4854 Key.push_back(F.ScaledReg);
4855 // Unstable sort by host order ok, because this is only used for
4856 // uniquifying.
4857 llvm::sort(Key);
4858
4859 std::pair<BestFormulaeTy::const_iterator, bool> P =
4860 BestFormulae.insert(std::make_pair(Key, FIdx));
4861 if (P.second)
4862 continue;
4863
4864 Formula &Best = LU.Formulae[P.first->second];
4865
4866 Cost CostBest(L, SE, TTI, AMK);
4867 Regs.clear();
4868 CostBest.RateFormula(Best, Regs, VisitedRegs, LU,
4869 HardwareLoopProfitable);
4870 if (CostF.isLess(CostBest))
4871 std::swap(F, Best);
4872 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4873 dbgs() << "\n"
4874 " in favor of formula ";
4875 Best.print(dbgs()); dbgs() << '\n');
4876 }
4877#ifndef NDEBUG
4878 ChangedFormulae = true;
4879#endif
4880 LU.DeleteFormula(F);
4881 --FIdx;
4882 --NumForms;
4883 Any = true;
4884 }
4885
4886 // Now that we've filtered out some formulae, recompute the Regs set.
4887 if (Any)
4888 LU.RecomputeRegs(LUIdx, RegUses);
4889
4890 // Reset this to prepare for the next use.
4891 BestFormulae.clear();
4892 }
4893
4894 LLVM_DEBUG(if (ChangedFormulae) {
4895 dbgs() << "\n"
4896 "After filtering out undesirable candidates:\n";
4897 print_uses(dbgs());
4898 });
4899}
4900
4901/// Estimate the worst-case number of solutions the solver might have to
4902/// consider. It almost never considers this many solutions because it prune the
4903/// search space, but the pruning isn't always sufficient.
4904size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4905 size_t Power = 1;
4906 for (const LSRUse &LU : Uses) {
4907 size_t FSize = LU.Formulae.size();
4908 if (FSize >= ComplexityLimit) {
4909 Power = ComplexityLimit;
4910 break;
4911 }
4912 Power *= FSize;
4913 if (Power >= ComplexityLimit)
4914 break;
4915 }
4916 return Power;
4917}
4918
4919/// When one formula uses a superset of the registers of another formula, it
4920/// won't help reduce register pressure (though it may not necessarily hurt
4921/// register pressure); remove it to simplify the system.
4922void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4923 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4924 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4925
4926 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4927 "which use a superset of registers used by other "
4928 "formulae.\n");
4929
4930 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4931 LSRUse &LU = Uses[LUIdx];
4932 bool Any = false;
4933 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4934 Formula &F = LU.Formulae[i];
4935 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4936 continue;
4937 // Look for a formula with a constant or GV in a register. If the use
4938 // also has a formula with that same value in an immediate field,
4939 // delete the one that uses a register.
4941 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4942 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4943 Formula NewF = F;
4944 //FIXME: Formulas should store bitwidth to do wrapping properly.
4945 // See PR41034.
4946 NewF.BaseOffset =
4947 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4948 (uint64_t)C->getValue()->getSExtValue());
4949 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4950 (I - F.BaseRegs.begin()));
4951 if (LU.HasFormulaWithSameRegs(NewF)) {
4952 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4953 dbgs() << '\n');
4954 LU.DeleteFormula(F);
4955 --i;
4956 --e;
4957 Any = true;
4958 break;
4959 }
4960 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4961 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4962 if (!F.BaseGV) {
4963 Formula NewF = F;
4964 NewF.BaseGV = GV;
4965 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4966 (I - F.BaseRegs.begin()));
4967 if (LU.HasFormulaWithSameRegs(NewF)) {
4968 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4969 dbgs() << '\n');
4970 LU.DeleteFormula(F);
4971 --i;
4972 --e;
4973 Any = true;
4974 break;
4975 }
4976 }
4977 }
4978 }
4979 }
4980 if (Any)
4981 LU.RecomputeRegs(LUIdx, RegUses);
4982 }
4983
4984 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4985 }
4986}
4987
4988/// When there are many registers for expressions like A, A+1, A+2, etc.,
4989/// allocate a single register for them.
4990void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4991 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4992 return;
4993
4994 LLVM_DEBUG(
4995 dbgs() << "The search space is too complex.\n"
4996 "Narrowing the search space by assuming that uses separated "
4997 "by a constant offset will use the same registers.\n");
4998
4999 // This is especially useful for unrolled loops.
5000
5001 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5002 LSRUse &LU = Uses[LUIdx];
5003 for (const Formula &F : LU.Formulae) {
5004 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
5005 continue;
5006 assert((LU.Kind == LSRUse::Address || LU.Kind == LSRUse::ICmpZero) &&
5007 "Only address and cmp uses expected to have nonzero BaseOffset");
5008
5009 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
5010 if (!LUThatHas)
5011 continue;
5012
5013 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
5014 LU.Kind, LU.AccessTy))
5015 continue;
5016
5017 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
5018
5019 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
5020 LUThatHas->AllFixupsUnconditional &= LU.AllFixupsUnconditional;
5021
5022 // Transfer the fixups of LU to LUThatHas.
5023 for (LSRFixup &Fixup : LU.Fixups) {
5024 Fixup.Offset += F.BaseOffset;
5025 LUThatHas->pushFixup(Fixup);
5026 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
5027 }
5028
5029#ifndef NDEBUG
5030 Type *FixupType = LUThatHas->Fixups[0].OperandValToReplace->getType();
5031 for (LSRFixup &Fixup : LUThatHas->Fixups)
5032 assert(Fixup.OperandValToReplace->getType() == FixupType &&
5033 "Expected all fixups to have the same type");
5034#endif
5035
5036 // Delete formulae from the new use which are no longer legal.
5037 bool Any = false;
5038 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
5039 Formula &F = LUThatHas->Formulae[i];
5040 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
5041 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
5042 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5043 LUThatHas->DeleteFormula(F);
5044 --i;
5045 --e;
5046 Any = true;
5047 }
5048 }
5049
5050 if (Any)
5051 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
5052
5053 // Delete the old use.
5054 DeleteUse(LU, LUIdx);
5055 --LUIdx;
5056 --NumUses;
5057 break;
5058 }
5059 }
5060
5061 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5062}
5063
5064/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5065/// we've done more filtering, as it may be able to find more formulae to
5066/// eliminate.
5067void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5068 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5069 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5070
5071 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5072 "undesirable dedicated registers.\n");
5073
5074 FilterOutUndesirableDedicatedRegisters();
5075
5076 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5077 }
5078}
5079
5080/// If a LSRUse has multiple formulae with the same ScaledReg and Scale.
5081/// Pick the best one and delete the others.
5082/// This narrowing heuristic is to keep as many formulae with different
5083/// Scale and ScaledReg pair as possible while narrowing the search space.
5084/// The benefit is that it is more likely to find out a better solution
5085/// from a formulae set with more Scale and ScaledReg variations than
5086/// a formulae set with the same Scale and ScaledReg. The picking winner
5087/// reg heuristic will often keep the formulae with the same Scale and
5088/// ScaledReg and filter others, and we want to avoid that if possible.
5089void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5090 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5091 return;
5092
5093 LLVM_DEBUG(
5094 dbgs() << "The search space is too complex.\n"
5095 "Narrowing the search space by choosing the best Formula "
5096 "from the Formulae with the same Scale and ScaledReg.\n");
5097
5098 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5099 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5100
5101 BestFormulaeTy BestFormulae;
5102#ifndef NDEBUG
5103 bool ChangedFormulae = false;
5104#endif
5105 DenseSet<const SCEV *> VisitedRegs;
5106 SmallPtrSet<const SCEV *, 16> Regs;
5107
5108 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5109 LSRUse &LU = Uses[LUIdx];
5110 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5111 dbgs() << '\n');
5112
5113 // Return true if Formula FA is better than Formula FB.
5114 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5115 // First we will try to choose the Formula with fewer new registers.
5116 // For a register used by current Formula, the more the register is
5117 // shared among LSRUses, the less we increase the register number
5118 // counter of the formula.
5119 size_t FARegNum = 0;
5120 for (const SCEV *Reg : FA.BaseRegs) {
5121 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5122 FARegNum += (NumUses - UsedByIndices.count() + 1);
5123 }
5124 size_t FBRegNum = 0;
5125 for (const SCEV *Reg : FB.BaseRegs) {
5126 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5127 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5128 }
5129 if (FARegNum != FBRegNum)
5130 return FARegNum < FBRegNum;
5131
5132 // If the new register numbers are the same, choose the Formula with
5133 // less Cost.
5134 Cost CostFA(L, SE, TTI, AMK);
5135 Cost CostFB(L, SE, TTI, AMK);
5136 Regs.clear();
5137 CostFA.RateFormula(FA, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5138 Regs.clear();
5139 CostFB.RateFormula(FB, Regs, VisitedRegs, LU, HardwareLoopProfitable);
5140 return CostFA.isLess(CostFB);
5141 };
5142
5143 bool Any = false;
5144 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5145 ++FIdx) {
5146 Formula &F = LU.Formulae[FIdx];
5147 if (!F.ScaledReg)
5148 continue;
5149 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5150 if (P.second)
5151 continue;
5152
5153 Formula &Best = LU.Formulae[P.first->second];
5154 if (IsBetterThan(F, Best))
5155 std::swap(F, Best);
5156 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5157 dbgs() << "\n"
5158 " in favor of formula ";
5159 Best.print(dbgs()); dbgs() << '\n');
5160#ifndef NDEBUG
5161 ChangedFormulae = true;
5162#endif
5163 LU.DeleteFormula(F);
5164 --FIdx;
5165 --NumForms;
5166 Any = true;
5167 }
5168 if (Any)
5169 LU.RecomputeRegs(LUIdx, RegUses);
5170
5171 // Reset this to prepare for the next use.
5172 BestFormulae.clear();
5173 }
5174
5175 LLVM_DEBUG(if (ChangedFormulae) {
5176 dbgs() << "\n"
5177 "After filtering out undesirable candidates:\n";
5178 print_uses(dbgs());
5179 });
5180}
5181
5182/// If we are over the complexity limit, filter out any post-inc prefering
5183/// variables to only post-inc values.
5184void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5185 if (AMK != TTI::AMK_PostIndexed)
5186 return;
5187 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5188 return;
5189
5190 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5191 "Narrowing the search space by choosing the lowest "
5192 "register Formula for PostInc Uses.\n");
5193
5194 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5195 LSRUse &LU = Uses[LUIdx];
5196
5197 if (LU.Kind != LSRUse::Address)
5198 continue;
5199 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5200 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5201 continue;
5202
5203 size_t MinRegs = std::numeric_limits<size_t>::max();
5204 for (const Formula &F : LU.Formulae)
5205 MinRegs = std::min(F.getNumRegs(), MinRegs);
5206
5207 bool Any = false;
5208 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5209 ++FIdx) {
5210 Formula &F = LU.Formulae[FIdx];
5211 if (F.getNumRegs() > MinRegs) {
5212 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5213 dbgs() << "\n");
5214 LU.DeleteFormula(F);
5215 --FIdx;
5216 --NumForms;
5217 Any = true;
5218 }
5219 }
5220 if (Any)
5221 LU.RecomputeRegs(LUIdx, RegUses);
5222
5223 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5224 break;
5225 }
5226
5227 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5228}
5229
5230void LSRInstance::NarrowSearchSpaceByMergingUsesOutsideLoop() {
5231 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5232 return;
5233
5234 LLVM_DEBUG(
5235 dbgs() << "The search space is too complex.\n"
5236 "Narrowing the search space by merging uses with fixups "
5237 "entirely outside the loop with uses inside the loop.\n");
5238
5239 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5240 LSRUse &LU = Uses[LUIdx];
5241 // Don't merge ICmpZero uses outside the loop, as ICmpZero needs to be
5242 // handled specially when expanding.
5243 if (!LU.AllFixupsOutsideLoop || LU.Formulae.empty() ||
5244 LU.Kind == LSRUse::ICmpZero)
5245 continue;
5246
5247 LLVM_DEBUG(dbgs() << " Trying to eliminate use "; LU.print(dbgs());
5248 dbgs() << '\n');
5249
5250 // Find a compatible LSRUse inside the loop that we could merge LU with
5251 LSRUse *LUToMergeWith = nullptr;
5252 const Formula &ThisF = LU.Formulae[0];
5253 for (LSRUse &OtherLU : Uses) {
5254 // Only merge with uses inside the loop
5255 if (OtherLU.AllFixupsOutsideLoop)
5256 continue;
5257 // Can't merge with ICmpZero uses as they're handled specially when
5258 // expanding
5259 if (OtherLU.Kind == LSRUse::ICmpZero)
5260 continue;
5261 // Can't merge with uses without any formulae
5262 if (OtherLU.Formulae.empty())
5263 continue;
5264 // Can't merge if LU's offsets aren't legal for all of OtherLU's formulae
5265 if (any_of(OtherLU.Formulae, [&](const Formula &F) {
5266 return !isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, OtherLU.Kind,
5267 OtherLU.AccessTy, F);
5268 }))
5269 continue;
5270 // We can merge with uses that have the same initial formula. We allow
5271 // merging of uses with different Kind and AccessTy which means that the
5272 // cost may end up being inaccurate, but it's also what we would have
5273 // gotten if we'd ignored uses outside the loop entirely.
5274 const Formula &OtherF = OtherLU.Formulae[0];
5275 if (ThisF.BaseRegs == OtherF.BaseRegs &&
5276 ThisF.ScaledReg == OtherF.ScaledReg &&
5277 ThisF.BaseGV == OtherF.BaseGV && ThisF.Scale == OtherF.Scale &&
5278 ThisF.UnfoldedOffset == OtherF.UnfoldedOffset &&
5279 ThisF.BaseOffset == OtherF.BaseOffset) {
5280 LUToMergeWith = &OtherLU;
5281 break;
5282 }
5283 }
5284 if (!LUToMergeWith)
5285 continue;
5286
5287 LLVM_DEBUG(dbgs() << " Merging with "; LUToMergeWith->print(dbgs());
5288 dbgs() << '\n');
5289
5290 // Copy fixups
5291 for (LSRFixup &Fixup : LU.Fixups) {
5292 LUToMergeWith->pushFixup(Fixup);
5293 }
5294
5295 // Delete the old use.
5296 DeleteUse(LU, LUIdx);
5297 --LUIdx;
5298 --NumUses;
5299 }
5300
5301 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5302}
5303
5304/// The function delete formulas with high registers number expectation.
5305/// Assuming we don't know the value of each formula (already delete
5306/// all inefficient), generate probability of not selecting for each
5307/// register.
5308/// For example,
5309/// Use1:
5310/// reg(a) + reg({0,+,1})
5311/// reg(a) + reg({-1,+,1}) + 1
5312/// reg({a,+,1})
5313/// Use2:
5314/// reg(b) + reg({0,+,1})
5315/// reg(b) + reg({-1,+,1}) + 1
5316/// reg({b,+,1})
5317/// Use3:
5318/// reg(c) + reg(b) + reg({0,+,1})
5319/// reg(c) + reg({b,+,1})
5320///
5321/// Probability of not selecting
5322/// Use1 Use2 Use3
5323/// reg(a) (1/3) * 1 * 1
5324/// reg(b) 1 * (1/3) * (1/2)
5325/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5326/// reg({-1,+,1}) (2/3) * (2/3) * 1
5327/// reg({a,+,1}) (2/3) * 1 * 1
5328/// reg({b,+,1}) 1 * (2/3) * (2/3)
5329/// reg(c) 1 * 1 * 0
5330///
5331/// Now count registers number mathematical expectation for each formula:
5332/// Note that for each use we exclude probability if not selecting for the use.
5333/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
5334/// probabilty 1/3 of not selecting for Use1).
5335/// Use1:
5336/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5337/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5338/// reg({a,+,1}) 1
5339/// Use2:
5340/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5341/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5342/// reg({b,+,1}) 2/3
5343/// Use3:
5344/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5345/// reg(c) + reg({b,+,1}) 1 + 2/3
5346void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5347 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5348 return;
5349 // Ok, we have too many of formulae on our hands to conveniently handle.
5350 // Use a rough heuristic to thin out the list.
5351
5352 // Set of Regs wich will be 100% used in final solution.
5353 // Used in each formula of a solution (in example above this is reg(c)).
5354 // We can skip them in calculations.
5355 SmallPtrSet<const SCEV *, 4> UniqRegs;
5356 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5357
5358 // Map each register to probability of not selecting
5359 DenseMap <const SCEV *, float> RegNumMap;
5360 for (const SCEV *Reg : RegUses) {
5361 if (UniqRegs.count(Reg))
5362 continue;
5363 float PNotSel = 1;
5364 for (const LSRUse &LU : Uses) {
5365 if (!LU.Regs.count(Reg))
5366 continue;
5367 float P = LU.getNotSelectedProbability(Reg);
5368 if (P != 0.0)
5369 PNotSel *= P;
5370 else
5371 UniqRegs.insert(Reg);
5372 }
5373 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5374 }
5375
5376 LLVM_DEBUG(
5377 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5378
5379 // Delete formulas where registers number expectation is high.
5380 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5381 LSRUse &LU = Uses[LUIdx];
5382 // If nothing to delete - continue.
5383 if (LU.Formulae.size() < 2)
5384 continue;
5385 // This is temporary solution to test performance. Float should be
5386 // replaced with round independent type (based on integers) to avoid
5387 // different results for different target builds.
5388 float FMinRegNum = LU.Formulae[0].getNumRegs();
5389 float FMinARegNum = LU.Formulae[0].getNumRegs();
5390 size_t MinIdx = 0;
5391 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5392 Formula &F = LU.Formulae[i];
5393 float FRegNum = 0;
5394 float FARegNum = 0;
5395 for (const SCEV *BaseReg : F.BaseRegs) {
5396 if (UniqRegs.count(BaseReg))
5397 continue;
5398 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5399 if (isa<SCEVAddRecExpr>(BaseReg))
5400 FARegNum +=
5401 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5402 }
5403 if (const SCEV *ScaledReg = F.ScaledReg) {
5404 if (!UniqRegs.count(ScaledReg)) {
5405 FRegNum +=
5406 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5407 if (isa<SCEVAddRecExpr>(ScaledReg))
5408 FARegNum +=
5409 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5410 }
5411 }
5412 if (FMinRegNum > FRegNum ||
5413 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5414 FMinRegNum = FRegNum;
5415 FMinARegNum = FARegNum;
5416 MinIdx = i;
5417 }
5418 }
5419 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5420 dbgs() << " with min reg num " << FMinRegNum << '\n');
5421 if (MinIdx != 0)
5422 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5423 while (LU.Formulae.size() != 1) {
5424 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5425 dbgs() << '\n');
5426 LU.Formulae.pop_back();
5427 }
5428 LU.RecomputeRegs(LUIdx, RegUses);
5429 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5430 Formula &F = LU.Formulae[0];
5431 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5432 // When we choose the formula, the regs become unique.
5433 UniqRegs.insert_range(F.BaseRegs);
5434 if (F.ScaledReg)
5435 UniqRegs.insert(F.ScaledReg);
5436 }
5437 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5438}
5439
5440// Check whether Best and Reg are SCEVs separated by a constant amount C and,
5441// if so, whether the addressing offset +C would be legal where the negative
5442// offset -C is not.
//
// Returns false if the two SCEVs have different types, if the (partially
// visible) add-recurrence loop check fails, or if
// SE.computeConstantDifference(Best, Reg) yields no constant.
//
// NOTE(review): the embedded listing numbers skip 5443 here, so the line with
// the return type, the function name and the first parameter (used below as
// `TTI`) is not visible in this listing -- confirm against the upstream file.
5444 ScalarEvolution &SE, const SCEV *Best,
5445 const SCEV *Reg,
5446 MemAccessTy AccessType) {
5447 if (Best->getType() != Reg->getType() ||
// NOTE(review): listing line 5448 is missing; it presumably opens the
// parenthesized condition that is closed two lines below -- confirm.
5449 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5450 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5451 return false;
5452 std::optional<APInt> Diff = SE.computeConstantDifference(Best, Reg);
5453 if (!Diff)
5454 return false;
5455
// Query a base-register-only addressing mode (no global, Scale 0) with the
// offset +Diff and with the offset -Diff; succeed only if the first is legal
// and the second is not.
5456 return TTI.isLegalAddressingMode(
5457 AccessType.MemTy, /*BaseGV=*/nullptr,
5458 /*BaseOffset=*/Diff->getSExtValue(),
5459 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5460 !TTI.isLegalAddressingMode(
5461 AccessType.MemTy, /*BaseGV=*/nullptr,
5462 /*BaseOffset=*/-Diff->getSExtValue(),
5463 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5464}
5465
5466/// Pick a register which seems likely to be profitable, and then in any use
5467/// which has any reference to that register, delete all formulae which do not
5468/// reference that register.
///
/// This repeats, picking a not-yet-taken register each round, until
/// EstimateSearchSpaceComplexity() drops below ComplexityLimit.
5469void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5470 // With all other options exhausted, loop until the system is simple
5471 // enough to handle.
// Registers already chosen as winners in earlier rounds; they are skipped
// when looking for the next candidate.
5472 SmallPtrSet<const SCEV *, 4> Taken;
5473 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5474 // Ok, we have too many of formulae on our hands to conveniently handle.
5475 // Use a rough heuristic to thin out the list.
5476 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5477
5478 // Pick the register which is used by the most LSRUses, which is likely
5479 // to be a good reuse register candidate.
5480 const SCEV *Best = nullptr;
5481 unsigned BestNum = 0;
5482 for (const SCEV *Reg : RegUses) {
5483 if (Taken.count(Reg))
5484 continue;
5485 if (!Best) {
// The first untaken register seeds the search.
5486 Best = Reg;
5487 BestNum = RegUses.getUsedByIndices(Reg).count();
5488 } else {
5489 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5490 if (Count > BestNum) {
5491 Best = Reg;
5492 BestNum = Count;
5493 }
5494
5495 // If the scores are the same, but the Reg is simpler for the target
5496 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5497 // handle +C but not -C), opt for the simpler formula.
5498 if (Count == BestNum) {
// Only the first use that references Reg is consulted for the tie-break.
5499 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5500 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
// NOTE(review): listing line 5501 is missing here; it presumably begins the
// call whose last argument is `Uses[LUIdx].AccessTy` -- confirm upstream.
5502 Uses[LUIdx].AccessTy)) {
5503 Best = Reg;
5504 BestNum = Count;
5505 }
5506 }
5507 }
5508 }
5509 assert(Best && "Failed to find best LSRUse candidate");
5510
5511 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5512 << " will yield profitable reuse.\n");
5513 Taken.insert(Best);
5514
5515 // In any use with formulae which references this register, delete formulae
5516 // which don't reference it.
5517 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5518 LSRUse &LU = Uses[LUIdx];
5519 if (!LU.Regs.count(Best)) continue;
5520
5521 bool Any = false;
5522 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5523 Formula &F = LU.Formulae[i];
5524 if (!F.referencesReg(Best)) {
5525 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5526 LU.DeleteFormula(F);
// Shrink the bound and step the index back so that the same slot is
// examined again on the next iteration.
5527 --e;
5528 --i;
5529 Any = true;
5530 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5531 continue;
5532 }
5533 }
5534
// Refresh the use's register bookkeeping only if something was deleted.
5535 if (Any)
5536 LU.RecomputeRegs(LUIdx, RegUses);
5537 }
5538
5539 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5540 }
5541}
5542
5543/// If there are an extraordinary number of formulae to choose from, use some
5544/// rough heuristics to prune down the number of formulae. This keeps the main
5545/// solver from taking an extraordinary amount of time in some worst-case
5546/// scenarios.
///
/// The narrowing steps run in the fixed order below; the last step is either
/// NarrowSearchSpaceByDeletingCostlyFormulas (when LSRExpNarrow is set) or
/// NarrowSearchSpaceByPickingWinnerRegs.
5547void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5548 NarrowSearchSpaceByDetectingSupersets();
5549 NarrowSearchSpaceByCollapsingUnrolledCode();
5550 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
// NOTE(review): the embedded listing numbers jump from 5550 to 5552, so one
// line is missing here (possibly a guard on the next call) -- confirm against
// the upstream file.
5552 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5553 NarrowSearchSpaceByFilterPostInc();
5554 NarrowSearchSpaceByMergingUsesOutsideLoop();
5555 if (LSRExpNarrow)
5556 NarrowSearchSpaceByDeletingCostlyFormulas();
5557 else
5558 NarrowSearchSpaceByPickingWinnerRegs();
5559}
5560
5561/// This is the recursive solver.
5562void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5563 Cost &SolutionCost,
5564 SmallVectorImpl<const Formula *> &Workspace,
5565 const Cost &CurCost,
5566 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5567 DenseSet<const SCEV *> &VisitedRegs) const {
5568 // Some ideas:
5569 // - prune more:
5570 // - use more aggressive filtering
5571 // - sort the formula so that the most profitable solutions are found first
5572 // - sort the uses too
5573 // - search faster:
5574 // - don't compute a cost, and then compare. compare while computing a cost
5575 // and bail early.
5576 // - track register sets with SmallBitVector
5577
5578 const LSRUse &LU = Uses[Workspace.size()];
5579
5580 // If this use references any register that's already a part of the
5581 // in-progress solution, consider it a requirement that a formula must
5582 // reference that register in order to be considered. This prunes out
5583 // unprofitable searching.
5584 SmallSetVector<const SCEV *, 4> ReqRegs;
5585 for (const SCEV *S : CurRegs)
5586 if (LU.Regs.count(S))
5587 ReqRegs.insert(S);
5588
5589 SmallPtrSet<const SCEV *, 16> NewRegs;
5590 Cost NewCost(L, SE, TTI, AMK);
5591 for (const Formula &F : LU.Formulae) {
5592 // Ignore formulae which may not be ideal in terms of register reuse of
5593 // ReqRegs. The formula should use all required registers before
5594 // introducing new ones.
5595 // This can sometimes (notably when trying to favour postinc) lead to
5596 // sub-optimial decisions. There it is best left to the cost modelling to
5597 // get correct.
5598 if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
5599 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5600 for (const SCEV *Reg : ReqRegs) {
5601 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5602 is_contained(F.BaseRegs, Reg)) {
5603 --NumReqRegsToFind;
5604 if (NumReqRegsToFind == 0)
5605 break;
5606 }
5607 }
5608 if (NumReqRegsToFind != 0) {
5609 // If none of the formulae satisfied the required registers, then we could
5610 // clear ReqRegs and try again. Currently, we simply give up in this case.
5611 continue;
5612 }
5613 }
5614
5615 // Evaluate the cost of the current formula. If it's already worse than
5616 // the current best, prune the search at that point.
5617 NewCost = CurCost;
5618 NewRegs = CurRegs;
5619 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU, HardwareLoopProfitable);
5620 if (NewCost.isLess(SolutionCost)) {
5621 Workspace.push_back(&F);
5622 if (Workspace.size() != Uses.size()) {
5623 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5624 NewRegs, VisitedRegs);
5625 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5626 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5627 } else {
5628 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5629 dbgs() << ".\nRegs:\n";
5630 for (const SCEV *S : NewRegs) dbgs()
5631 << "- " << *S << "\n";
5632 dbgs() << '\n');
5633
5634 SolutionCost = NewCost;
5635 Solution = Workspace;
5636 }
5637 Workspace.pop_back();
5638 }
5639 }
5640}
5641
5642/// Choose one formula from each use. Return the results in the given Solution
5643/// vector.
///
/// SolveRecurse does the search, starting from a SolutionCost on which Lose()
/// has been called. If Solution is still empty afterwards, the function
/// returns with Solution empty. Otherwise, if BaselineCost is less than the
/// chosen solution's cost and dropping is enabled, Solution is cleared.
5644void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
// NOTE(review): `Workspace` is used below, but listing line 5645 (presumably
// its declaration) is missing from this listing -- confirm upstream.
5646 Cost SolutionCost(L, SE, TTI, AMK);
5647 SolutionCost.Lose();
5648 Cost CurCost(L, SE, TTI, AMK);
5649 SmallPtrSet<const SCEV *, 16> CurRegs;
5650 DenseSet<const SCEV *> VisitedRegs;
5651 Workspace.reserve(Uses.size());
5652
5653 // SolveRecurse does all the work.
5654 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5655 CurRegs, VisitedRegs);
5656 if (Solution.empty()) {
5657 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5658 return;
5659 }
5660
5661 // Ok, we've now made all our decisions.
5662 LLVM_DEBUG(dbgs() << "\n"
5663 "The chosen solution requires ";
5664 SolutionCost.print(dbgs()); dbgs() << ":\n";
5665 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5666 dbgs() << " ";
5667 Uses[i].print(dbgs());
5668 dbgs() << "\n"
5669 " ";
5670 Solution[i]->print(dbgs());
5671 dbgs() << '\n';
5672 });
5673
5674 assert(Solution.size() == Uses.size() && "Malformed solution!");
5675
// Decide whether a solution that is costlier than the baseline is dropped.
// The lambda maps a cl::boolOrDefault value to a bool.
5676 const bool EnableDropUnprofitableSolution = [&] {
// NOTE(review): listing line 5677 is missing; it presumably holds the
// `switch (...)` header for the case labels below -- confirm upstream.
5678 case cl::boolOrDefault::BOU_TRUE:
5679 return true;
5680 case cl::boolOrDefault::BOU_FALSE:
5681 return false;
5682 case cl::boolOrDefault::BOU_UNSET:
// NOTE(review): listing line 5683 is missing; it presumably returns the
// default used when the option is unset -- confirm upstream.
5684 }
5685 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5686 }();
5687
5688 if (BaselineCost.isLess(SolutionCost)) {
5689 if (!EnableDropUnprofitableSolution)
5690 LLVM_DEBUG(
5691 dbgs() << "Baseline is more profitable than chosen solution, "
5692 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5693 else {
5694 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5695 "solution, dropping LSR solution.\n";);
5696 Solution.clear();
5697 }
5698 }
5699}
5700
5701/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree far as
5702/// we can go while still being dominated by the input positions. This helps
5703/// canonicalize the insert position, which encourages sharing.
///
/// \param IP     the starting insert position; returned unchanged if no better
///               position is found.
/// \param Inputs instructions that must dominate the chosen position.
///
/// NOTE(review): the embedded listing numbers skip 5704 here, so the line with
/// the return type is not visible; the function returns `IP`, which has type
/// BasicBlock::iterator -- confirm against the upstream file.
5705LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5706 const SmallVectorImpl<Instruction *> &Inputs)
5707 const {
// Tentative is the candidate position being tested in the current round.
5708 Instruction *Tentative = &*IP;
5709 while (true) {
5710 bool AllDominate = true;
5711 Instruction *BetterPos = nullptr;
5712 // Don't bother attempting to insert before a catchswitch, their basic block
5713 // cannot have other non-PHI instructions.
5714 if (isa<CatchSwitchInst>(Tentative))
5715 return IP;
5716
5717 for (Instruction *Inst : Inputs) {
// The candidate is rejected if it is one of the inputs or if some input does
// not dominate it.
5718 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5719 AllDominate = false;
5720 break;
5721 }
5722 // Attempt to find an insert position in the middle of the block,
5723 // instead of at the end, so that it can be used for other expansions.
5724 if (Tentative->getParent() == Inst->getParent() &&
5725 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5726 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5727 }
5728 if (!AllDominate)
5729 break;
// The candidate is acceptable: commit it (or the mid-block position found
// above) as the new insert position.
5730 if (BetterPos)
5731 IP = BetterPos->getIterator();
5732 else
5733 IP = Tentative->getIterator();
5734
5735 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5736 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5737
// Walk up the immediate-dominator chain from IP's block. Return IP if the
// chain runs out of nodes.
5738 BasicBlock *IDom;
5739 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5740 if (!Rung) return IP;
5741 Rung = Rung->getIDom();
5742 if (!Rung) return IP;
5743 IDom = Rung->getBlock();
5744
5745 // Don't climb into a loop though.
// Stop at the first dominator whose loop depth is smaller than IP's, or
// equal to it while being in the same loop as IP.
5746 const Loop *IDomLoop = LI.getLoopFor(IDom);
5747 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5748 if (IDomDepth <= IPLoopDepth &&
5749 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5750 break;
5751 }
5752
// The next candidate is the end of that dominating block.
5753 Tentative = IDom->getTerminator();
5754 }
5755
5756 return IP;
5757}
5758
5759/// Determine an input position which will be dominated by the operands and
5760/// which will dominate the result.
5761BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5762 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5763 // Collect some instructions which must be dominated by the
5764 // expanding replacement. These must be dominated by any operands that
5765 // will be required in the expansion.
5766 SmallVector<Instruction *, 4> Inputs;
5767 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5768 Inputs.push_back(I);
5769 if (LU.Kind == LSRUse::ICmpZero)
5770 if (Instruction *I =
5771 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5772 Inputs.push_back(I);
5773 if (LF.PostIncLoops.count(L)) {
5774 if (LF.isUseFullyOutsideLoop(L))
5775 Inputs.push_back(L->getLoopLatch()->getTerminator());
5776 else
5777 Inputs.push_back(IVIncInsertPos);
5778 }
5779 // The expansion must also be dominated by the increment positions of any
5780 // loops it for which it is using post-inc mode.
5781 for (const Loop *PIL : LF.PostIncLoops) {
5782 if (PIL == L) continue;
5783
5784 // Be dominated by the loop exit.
5785 SmallVector<BasicBlock *, 4> ExitingBlocks;
5786 PIL->getExitingBlocks(ExitingBlocks);
5787 if (!ExitingBlocks.empty()) {
5788 BasicBlock *BB = ExitingBlocks[0];
5789 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5790 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5791 Inputs.push_back(BB->getTerminator());
5792 }
5793 }
5794
5795 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
5796 "Insertion point must be a normal instruction");
5797
5798 // Then, climb up the immediate dominator tree as far as we can go while
5799 // still being dominated by the input positions.
5800 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5801
5802 // Don't insert instructions before PHI nodes.
5803 while (isa<PHINode>(IP)) ++IP;
5804
5805 // Ignore landingpad instructions.
5806 while (IP->isEHPad()) ++IP;
5807
5808 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5809 // IP consistent across expansions and allows the previously inserted
5810 // instructions to be reused by subsequent expansion.
5811 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5812 ++IP;
5813
5814 return IP;
5815}
5816
5817/// Emit instructions for the leading candidate expression for this LSRUse (this
5818/// is called "expanding").
///
/// \param LU        the use being expanded; if LU.RigidFormula is set, nothing
///                  is emitted and LF.OperandValToReplace is returned.
/// \param LF        the fixup supplying the operand to replace, its offset and
///                  its post-inc loops.
/// \param F         the formula to expand.
/// \param IP        the requested insert position; it is adjusted by
///                  AdjustInsertPositionForExpand before use.
/// \param DeadInsts for ICmpZero uses, receives the icmp's old second operand
///                  when that operand is an instruction.
/// \returns the value produced by expanding the formula.
5819Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5820 const Formula &F, BasicBlock::iterator IP,
5821 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5822 if (LU.RigidFormula)
5823 return LF.OperandValToReplace;
5824
5825 // Determine an input position which will be dominated by the operands and
5826 // which will dominate the result.
5827 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5828 Rewriter.setInsertPoint(&*IP);
5829
5830 // Inform the Rewriter if we have a post-increment use, so that it can
5831 // perform an advantageous expansion.
5832 Rewriter.setPostInc(LF.PostIncLoops);
5833
5834 // This is the type that the user actually needs.
5835 Type *OpTy = LF.OperandValToReplace->getType();
5836 // This will be the type that we'll initially expand to.
5837 Type *Ty = F.getType();
5838 if (!Ty)
5839 // No type known; just expand directly to the ultimate type.
5840 Ty = OpTy;
5841 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5842 // Expand directly to the ultimate type if it's the right size.
5843 Ty = OpTy;
5844 // This is the type to do integer arithmetic in.
5845 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5846 // For ICmpZero with pointer-typed operands, keep the comparison in the
5847 // integer domain to avoid generating inttoptr casts. Use IntTy (the
5848 // formula's arithmetic width) so that both icmp operands match even when
5849 // the IV is wider than the pointer.
5850 if (LU.Kind == LSRUse::ICmpZero && OpTy->isPointerTy()) {
5851 OpTy = IntTy;
5852 Ty = IntTy;
5853 }
5854
5855 // Build up a list of operands to add together to form the full base.
// NOTE(review): `Ops` is used throughout the rest of the function, but
// listing line 5856 (presumably its declaration) is missing -- confirm.
5857
5858 // Expand the BaseRegs portion.
5859 for (const SCEV *Reg : F.BaseRegs) {
5860 assert(!Reg->isZero() && "Zero allocated in a base register!");
5861
5862 // If we're expanding for a post-inc user, make the post-inc adjustment.
5863 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5864 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5865 }
5866
5867 // Expand the ScaledReg portion.
// For an ICmpZero use with Scale == -1 this holds the expanded scaled
// register; it ends up in (or contributes to) the icmp's second operand.
5868 Value *ICmpScaledV = nullptr;
5869 if (F.Scale != 0) {
5870 const SCEV *ScaledS = F.ScaledReg;
5871
5872 // If we're expanding for a post-inc user, make the post-inc adjustment.
5873 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5874 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5875
5876 if (LU.Kind == LSRUse::ICmpZero) {
5877 // Expand ScaleReg as if it was part of the base regs.
5878 if (F.Scale == 1)
5879 Ops.push_back(
5880 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5881 else {
5882 // An interesting way of "folding" with an icmp is to use a negated
5883 // scale, which we'll implement by inserting it into the other operand
5884 // of the icmp.
5885 assert(F.Scale == -1 &&
5886 "The only scale supported by ICmpZero uses is -1!");
5887 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5888 }
5889 } else {
5890 // Otherwise just expand the scaled register and an explicit scale,
5891 // which is expected to be matched as part of the address.
5892
5893 // Flush the operand list to suppress SCEVExpander hoisting address modes.
5894 // Unless the addressing mode will not be folded.
5895 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5896 isAMCompletelyFolded(TTI, LU, F)) {
5897 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5898 Ops.clear();
5899 Ops.push_back(SE.getUnknown(FullV));
5900 }
5901 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5902 if (F.Scale != 1)
5903 ScaledS =
5904 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5905 Ops.push_back(ScaledS);
5906 }
5907 }
5908
5909 // Expand the GV portion.
5910 if (F.BaseGV) {
5911 // Flush the operand list to suppress SCEVExpander hoisting.
5912 if (!Ops.empty()) {
5913 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5914 Ops.clear();
5915 Ops.push_back(SE.getUnknown(FullV));
5916 }
5917 Ops.push_back(SE.getUnknown(F.BaseGV));
5918 }
5919
5920 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5921 // unfolded offsets. LSR assumes they both live next to their uses.
5922 if (!Ops.empty()) {
5923 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5924 Ops.clear();
5925 Ops.push_back(SE.getUnknown(FullV));
5926 }
5927
5928 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5929 // out at this point, or should we generate a SCEV adding together mixed
5930 // offsets?
5931 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5932 "Expanding mismatched offsets\n");
5933 // Expand the immediate portion.
5934 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5935 if (Offset.isNonZero()) {
5936 if (LU.Kind == LSRUse::ICmpZero) {
5937 // The other interesting way of "folding" with an ICmpZero is to use a
5938 // negated immediate.
5939 if (!ICmpScaledV) {
5940 // TODO: Avoid implicit trunc?
5941 // See https://github.com/llvm/llvm-project/issues/112510.
5942 ICmpScaledV = ConstantInt::getSigned(
5943 IntTy, -(uint64_t)Offset.getFixedValue(), /*ImplicitTrunc=*/true);
5944 } else {
// A scaled value already occupies ICmpScaledV: move it into the sum and put
// the (non-negated) immediate in ICmpScaledV instead.
5945 Ops.push_back(SE.getUnknown(ICmpScaledV));
5946 ICmpScaledV = ConstantInt::getSigned(IntTy, Offset.getFixedValue(),
5947 /*ImplicitTrunc=*/true);
5948 }
5949 } else {
5950 // Just add the immediate values. These again are expected to be matched
5951 // as part of the address.
5952 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5953 }
5954 }
5955
5956 // Expand the unfolded offset portion.
5957 Immediate UnfoldedOffset = F.UnfoldedOffset;
5958 if (UnfoldedOffset.isNonZero()) {
5959 // Just add the immediate values.
5960 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5961 }
5962
5963 // Emit instructions summing all the operands.
5964 const SCEV *FullS = Ops.empty() ?
5965 SE.getConstant(IntTy, 0) :
5966 SE.getAddExpr(Ops);
5967 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5968
5969 // We're done expanding now, so reset the rewriter.
5970 Rewriter.clearPostInc();
5971
5972 // An ICmpZero Formula represents an ICmp which we're handling as a
5973 // comparison against zero. Now that we've expanded an expression for that
5974 // form, update the ICmp's other operand.
5975 if (LU.Kind == LSRUse::ICmpZero) {
5976 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
// The old second operand is about to be replaced; queue it in DeadInsts if
// it is an instruction.
5977 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5978 DeadInsts.emplace_back(OperandIsInstr);
5979 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5980 "a scale at the same time!");
5981 if (F.Scale == -1) {
5982 if (ICmpScaledV->getType() != OpTy) {
// NOTE(review): listing line 5983 is missing; it presumably declares `Cast`
// and begins the cast-creation call continued below -- confirm upstream.
5984 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5985 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5986 ICmpScaledV = Cast;
5987 }
5988 CI->setOperand(1, ICmpScaledV);
5989 } else {
5990 // A scale of 1 means that the scale has been expanded as part of the
5991 // base regs.
5992 assert((F.Scale == 0 || F.Scale == 1) &&
5993 "ICmp does not support folding a global value and "
5994 "a scale at the same time!");
5995 // TODO: Avoid implicit trunc?
5996 // See https://github.com/llvm/llvm-project/issues/112510.
// NOTE(review): listing line 5997 is missing; it presumably declares `C` and
// begins the constant-creation call continued below -- confirm upstream.
5998 -(uint64_t)Offset.getFixedValue(),
5999 /*ImplicitTrunc=*/true);
6000 if (C->getType() != OpTy) {
// NOTE(review): listing line 6001 is missing; it presumably reassigns `C`
// from the cast/fold call continued below -- confirm upstream.
6002 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
6003 CI->getDataLayout());
6004 assert(C && "Cast of ConstantInt should have folded");
6005 }
6006
6007 CI->setOperand(1, C);
6008 }
6009 }
6010
6011 return FullV;
6012}
6013
6014/// Helper for Rewrite. PHI nodes are special because the use of their operands
6015/// effectively happens in their predecessor blocks, so the expression may need
6016/// to be expanded in multiple places.
6017void LSRInstance::RewriteForPHI(PHINode *PN, const LSRUse &LU,
6018 const LSRFixup &LF, const Formula &F,
6019 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6020 DenseMap<BasicBlock *, Value *> Inserted;
6021
6022 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
6023 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
6024 bool needUpdateFixups = false;
6025 BasicBlock *BB = PN->getIncomingBlock(i);
6026
6027 // If this is a critical edge, split the edge so that we do not insert
6028 // the code on all predecessor/successor paths. We do this unless this
6029 // is the canonical backedge for this loop, which complicates post-inc
6030 // users.
6031 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
6034 BasicBlock *Parent = PN->getParent();
6035 Loop *PNLoop = LI.getLoopFor(Parent);
6036 if (!PNLoop || Parent != PNLoop->getHeader()) {
6037 // Split the critical edge.
6038 BasicBlock *NewBB = nullptr;
6039 if (!Parent->isLandingPad()) {
6040 CriticalEdgeSplittingOptions SplitOptions(&DT, &LI, MSSAU);
6041 SplitOptions =
6042 SplitOptions.setMergeIdenticalEdges().setKeepOneInputPHIs();
6043 if (ShouldPreserveLCSSA)
6044 SplitOptions = SplitOptions.setPreserveLCSSA();
6045 NewBB = SplitCriticalEdge(BB, Parent, SplitOptions);
6046 } else {
6048 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
6049 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
6050 NewBB = NewBBs[0];
6051 }
6052 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
6053 // phi predecessors are identical. The simple thing to do is skip
6054 // splitting in this case rather than complicate the API.
6055 if (NewBB) {
6056 // If PN is outside of the loop and BB is in the loop, we want to
6057 // move the block to be immediately before the PHI block, not
6058 // immediately after BB.
6059 if (L->contains(BB) && !L->contains(PN))
6060 NewBB->moveBefore(PN->getParent());
6061
6062 // Splitting the edge can reduce the number of PHI entries we have.
6063 e = PN->getNumIncomingValues();
6064 BB = NewBB;
6065 i = PN->getBasicBlockIndex(BB);
6066
6067 needUpdateFixups = true;
6068 }
6069 }
6070 }
6071
6072 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
6073 Inserted.try_emplace(BB);
6074 if (!Pair.second)
6075 PN->setIncomingValue(i, Pair.first->second);
6076 else {
6077 Value *FullV =
6078 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
6079
6080 // If this is reuse-by-noop-cast, insert the noop cast.
6081 Type *OpTy = LF.OperandValToReplace->getType();
6082 if (FullV->getType() != OpTy)
6083 FullV = CastInst::Create(
6084 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
6085 LF.OperandValToReplace->getType(), "tmp",
6086 BB->getTerminator()->getIterator());
6087
6088 // If the incoming block for this value is not in the loop, it means the
6089 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
6090 // the inserted value.
6091 if (auto *I = dyn_cast<Instruction>(FullV))
6092 if (L->contains(I) && !L->contains(BB))
6093 InsertedNonLCSSAInsts.insert(I);
6094
6095 PN->setIncomingValue(i, FullV);
6096 Pair.first->second = FullV;
6097 }
6098
6099 // If LSR splits critical edge and phi node has other pending
6100 // fixup operands, we need to update those pending fixups. Otherwise
6101 // formulae will not be implemented completely and some instructions
6102 // will not be eliminated.
6103 if (needUpdateFixups) {
6104 for (LSRUse &LU : Uses)
6105 for (LSRFixup &Fixup : LU.Fixups)
6106 // If fixup is supposed to rewrite some operand in the phi
6107 // that was just updated, it may be already moved to
6108 // another phi node. Such fixup requires update.
6109 if (Fixup.UserInst == PN) {
6110 // Check if the operand we try to replace still exists in the
6111 // original phi.
6112 bool foundInOriginalPHI = false;
6113 for (const auto &val : PN->incoming_values())
6114 if (val == Fixup.OperandValToReplace) {
6115 foundInOriginalPHI = true;
6116 break;
6117 }
6118
6119 // If fixup operand found in original PHI - nothing to do.
6120 if (foundInOriginalPHI)
6121 continue;
6122
6123 // Otherwise it might be moved to another PHI and requires update.
6124 // If fixup operand not found in any of the incoming blocks that
6125 // means we have already rewritten it - nothing to do.
6126 for (const auto &Block : PN->blocks())
6127 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
6128 ++I) {
6129 PHINode *NewPN = cast<PHINode>(I);
6130 for (const auto &val : NewPN->incoming_values())
6131 if (val == Fixup.OperandValToReplace)
6132 Fixup.UserInst = NewPN;
6133 }
6134 }
6135 }
6136 }
6137}
6138
6139/// Emit instructions for the leading candidate expression for this LSRUse (this
6140/// is called "expanding"), and update the UserInst to reference the newly
6141/// expanded value.
6142void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6143 const Formula &F,
6144 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
6145 // First, find an insertion point that dominates UserInst. For PHI nodes,
6146 // find the nearest block which dominates all the relevant uses.
6147 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6148 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6149 } else {
6150 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6151
6152 // If this is reuse-by-noop-cast, insert the noop cast.
6153 // For ICmpZero with pointer operands, Expand() already set both operands
6154 // in integer domain, so no cast is needed here.
6155 Type *OpTy = LF.OperandValToReplace->getType();
6156 if (FullV->getType() != OpTy &&
6157 !(LU.Kind == LSRUse::ICmpZero && OpTy->isPointerTy())) {
6158 Instruction *Cast =
6159 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6160 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6161 FullV = Cast;
6162 }
6163
6164 // Update the user. ICmpZero is handled specially here (for now) because
6165 // Expand may have updated one of the operands of the icmp already, and
6166 // its new value may happen to be equal to LF.OperandValToReplace, in
6167 // which case doing replaceUsesOfWith leads to replacing both operands
6168 // with the same value. TODO: Reorganize this.
6169 if (LU.Kind == LSRUse::ICmpZero)
6170 LF.UserInst->setOperand(0, FullV);
6171 else
6172 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6173 }
6174
6175 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6176 DeadInsts.emplace_back(OperandIsInstr);
6177}
6178
6179// Determine where to insert the transformed IV increment instruction for this
6180// fixup. By default this is the default insert position, but if this is a
6181// postincrement opportunity then we try to insert it in the same block as the
6182// fixup user instruction, as this is needed for a postincrement instruction to
6183// be generated.
6185 const LSRFixup &Fixup, const LSRUse &LU,
6186 Instruction *IVIncInsertPos,
6187 DominatorTree &DT) {
6188 // Only address uses can be postincremented
6189 if (LU.Kind != LSRUse::Address)
6190 return IVIncInsertPos;
6191
6192 // Don't try to postincrement if it's not legal
6193 Instruction *I = Fixup.UserInst;
6194 Type *Ty = I->getType();
6195 if (!(isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) &&
6196 !(isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)))
6197 return IVIncInsertPos;
6198
6199 // It's only legal to hoist to the user block if it dominates the default
6200 // insert position.
6201 BasicBlock *HoistBlock = I->getParent();
6202 BasicBlock *IVIncBlock = IVIncInsertPos->getParent();
6203 if (!DT.dominates(I, IVIncBlock))
6204 return IVIncInsertPos;
6205
6206 return HoistBlock->getTerminator();
6207}
6208
6209/// Rewrite all the fixup locations with new values, following the chosen
6210/// solution.
6211void LSRInstance::ImplementSolution(
6212 const SmallVectorImpl<const Formula *> &Solution) {
6213 // Keep track of instructions we may have made dead, so that
6214 // we can remove them after we are done working.
6216
6217 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6218 for (const IVChain &Chain : IVChainVec) {
6219 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6220 Rewriter.setChainedPhi(PN);
6221 }
6222
6223 // Expand the new value definitions and update the users.
6224 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6225 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6226 Instruction *InsertPos =
6227 getFixupInsertPos(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, DT);
6228 Rewriter.setIVIncInsertPos(L, InsertPos);
6229 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6230 Changed = true;
6231 }
6232
6233 auto InsertedInsts = InsertedNonLCSSAInsts.takeVector();
6234 formLCSSAForInstructions(InsertedInsts, DT, LI, &SE);
6235
6236 for (const IVChain &Chain : IVChainVec) {
6237 GenerateIVChain(Chain, DeadInsts);
6238 Changed = true;
6239 }
6240
6241 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6242 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6243 ScalarEvolutionIVs.push_back(IV);
6244
6245 // Clean up after ourselves. This must be done before deleting any
6246 // instructions.
6247 Rewriter.clear();
6248
6250 &TLI, MSSAU);
6251
6252 // In our cost analysis above, we assume that each addrec consumes exactly
6253 // one register, and arrange to have increments inserted just before the
6254 // latch to maximize the chance this is true. However, if we reused
6255 // existing IVs, we now need to move the increments to match our
6256 // expectations. Otherwise, our cost modeling results in us having
6257 // chosen a non-optimal result for the actual schedule. (And yes, this
6258 // scheduling decision does impact later codegen.)
6259 for (PHINode &PN : L->getHeader()->phis()) {
6260 BinaryOperator *BO = nullptr;
6261 Value *Start = nullptr, *Step = nullptr;
6262 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6263 continue;
6264
6265 switch (BO->getOpcode()) {
6266 case Instruction::Sub:
6267 if (BO->getOperand(0) != &PN)
6268 // sub is non-commutative - match handling elsewhere in LSR
6269 continue;
6270 break;
6271 case Instruction::Add:
6272 break;
6273 default:
6274 continue;
6275 };
6276
6277 if (!isa<Constant>(Step))
6278 // If not a constant step, might increase register pressure
6279 // (We assume constants have been canonicalized to RHS)
6280 continue;
6281
6282 if (BO->getParent() == IVIncInsertPos->getParent())
6283 // Only bother moving across blocks. Isel can handle block local case.
6284 continue;
6285
6286 // Can we legally schedule inc at the desired point?
6287 if (!llvm::all_of(BO->uses(),
6288 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6289 continue;
6290 BO->moveBefore(IVIncInsertPos->getIterator());
6291 Changed = true;
6292 }
6293
6294
6295}
6296
6297LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6298 DominatorTree &DT, LoopInfo &LI,
6299 const TargetTransformInfo &TTI, AssumptionCache &AC,
6300 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU,
6301 bool PreserveLCSSA)
6302 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6303 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6305 : TTI.getPreferredAddressingMode(L, &SE)),
6306 Rewriter(SE, "lsr", PreserveLCSSA), ShouldPreserveLCSSA(PreserveLCSSA),
6307 BaselineCost(L, SE, TTI, AMK) {
6308 // If LoopSimplify form is not available, stay out of trouble.
6309 if (!L->isLoopSimplifyForm())
6310 return;
6311
6312 // If there's no interesting work to be done, bail early.
6313 if (IU.empty()) return;
6314
6315 // If there's too much analysis to be done, bail early. We won't be able to
6316 // model the problem anyway.
6317 unsigned NumUsers = 0;
6318 for (const IVStrideUse &U : IU) {
6319 if (++NumUsers > MaxIVUsers) {
6320 (void)U;
6321 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6322 << "\n");
6323 return;
6324 }
6325 // Bail out if we have a PHI on an EHPad that gets a value from a
6326 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6327 // no good place to stick any instructions.
6328 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6329 auto FirstNonPHI = PN->getParent()->getFirstNonPHIIt();
6330 if (isa<FuncletPadInst>(FirstNonPHI) ||
6331 isa<CatchSwitchInst>(FirstNonPHI))
6332 for (BasicBlock *PredBB : PN->blocks())
6333 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHIIt()))
6334 return;
6335 }
6336 }
6337
6338 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6339 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6340 dbgs() << ":\n");
6341
6342 // Check if we expect this loop to use a hardware loop instruction, which will
6343 // be used when calculating the costs of formulas.
6344 HardwareLoopInfo HWLoopInfo(L);
6345 HardwareLoopProfitable =
6346 TTI.isHardwareLoopProfitable(L, SE, AC, &TLI, HWLoopInfo);
6347
6348 // Configure SCEVExpander already now, so the correct mode is used for
6349 // isSafeToExpand() checks.
6350#if LLVM_ENABLE_ABI_BREAKING_CHECKS
6351 Rewriter.setDebugType(DEBUG_TYPE);
6352#endif
6353 Rewriter.disableCanonicalMode();
6354 Rewriter.enableLSRMode();
6355
6356 // First, perform some low-level loop optimizations.
6357 OptimizeShadowIV();
6358 OptimizeLoopTermCond();
6359
6360 // If loop preparation eliminates all interesting IV users, bail.
6361 if (IU.empty()) return;
6362
6363 // Skip nested loops until we can model them better with formulae.
6364 if (!L->isInnermost()) {
6365 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6366 return;
6367 }
6368
6369 // Start collecting data and preparing for the solver.
6370 // If number of registers is not the major cost, we cannot benefit from the
6371 // current profitable chain optimization which is based on number of
6372 // registers.
6373 // FIXME: add profitable chain optimization for other kinds major cost, for
6374 // example number of instructions.
6375 if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
6376 CollectChains();
6377 CollectInterestingTypesAndFactors();
6378 CollectFixupsAndInitialFormulae();
6379 CollectLoopInvariantFixupsAndFormulae();
6380
6381 if (Uses.empty())
6382 return;
6383
6384 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6385 print_uses(dbgs()));
6386 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6387 BaselineCost.print(dbgs()); dbgs() << "\n");
6388
6389 // Now use the reuse data to generate a bunch of interesting ways
6390 // to formulate the values needed for the uses.
6391 GenerateAllReuseFormulae();
6392
6393 FilterOutUndesirableDedicatedRegisters();
6394 NarrowSearchSpaceUsingHeuristics();
6395
6397 Solve(Solution);
6398
6399 // Release memory that is no longer needed.
6400 Factors.clear();
6401 Types.clear();
6402 RegUses.clear();
6403
6404 if (Solution.empty())
6405 return;
6406
6407#ifndef NDEBUG
6408 // Formulae should be legal.
6409 for (const LSRUse &LU : Uses) {
6410 for (const Formula &F : LU.Formulae)
6411 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6412 F) && "Illegal formula generated!");
6413 };
6414#endif
6415
6416 // Now that we've decided what we want, make it so.
6417 ImplementSolution(Solution);
6418}
6419
6420#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6421void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6422 if (Factors.empty() && Types.empty()) return;
6423
6424 OS << "LSR has identified the following interesting factors and types: ";
6425 ListSeparator LS;
6426
6427 for (int64_t Factor : Factors)
6428 OS << LS << '*' << Factor;
6429
6430 for (Type *Ty : Types)
6431 OS << LS << '(' << *Ty << ')';
6432 OS << '\n';
6433}
6434
6435void LSRInstance::print_fixups(raw_ostream &OS) const {
6436 OS << "LSR is examining the following fixup sites:\n";
6437 for (const LSRUse &LU : Uses)
6438 for (const LSRFixup &LF : LU.Fixups) {
6439 dbgs() << " ";
6440 LF.print(OS);
6441 OS << '\n';
6442 }
6443}
6444
6445void LSRInstance::print_uses(raw_ostream &OS) const {
6446 OS << "LSR is examining the following uses:\n";
6447 for (const LSRUse &LU : Uses) {
6448 dbgs() << " ";
6449 LU.print(OS);
6450 OS << '\n';
6451 for (const Formula &F : LU.Formulae) {
6452 OS << " ";
6453 F.print(OS);
6454 OS << '\n';
6455 }
6456 }
6457}
6458
/// Print the LSR state to \p OS in three sections, in this order: the
/// interesting factors and types, the fixup sites, and the uses.
6459void LSRInstance::print(raw_ostream &OS) const {
6460 print_factors_and_types(OS);
6461 print_fixups(OS);
6462 print_uses(OS);
6463}
6464
6465LLVM_DUMP_METHOD void LSRInstance::dump() const {
6466 print(errs()); errs() << '\n';
6467}
6468#endif
6469
6470namespace {
6471
/// Loop pass wrapper for loop strength reduction. The class only declares the
/// pass identity, its constructor and the two LoopPass overrides; their
/// definitions live outside the class body.
6472class LoopStrengthReduce : public LoopPass {
6473public:
6474 static char ID; // Pass ID, replacement for typeid
6475
6476 LoopStrengthReduce();
6477
6478private:
// Entry point invoked per loop; returns a bool (presumably "IR changed" --
// confirm against the definition).
6479 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
// Declares the analyses this pass requires and preserves.
6480 void getAnalysisUsage(AnalysisUsage &AU) const override;
6481};
6482
6483} // end anonymous namespace
6484
6485LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6487}
6488
6489void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6490 // We split critical edges, so we change the CFG. However, we do update
6491 // many analyses if they are around.
6493
6494 AU.addRequired<LoopInfoWrapperPass>();
6495 AU.addPreserved<LoopInfoWrapperPass>();
6497 AU.addRequired<DominatorTreeWrapperPass>();
6498 AU.addPreserved<DominatorTreeWrapperPass>();
6499 AU.addRequired<ScalarEvolutionWrapperPass>();
6500 AU.addPreserved<ScalarEvolutionWrapperPass>();
6501 AU.addRequired<AssumptionCacheTracker>();
6502 AU.addRequired<TargetLibraryInfoWrapperPass>();
6503 // Requiring LoopSimplify a second time here prevents IVUsers from running
6504 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6506 AU.addRequired<IVUsersWrapperPass>();
6507 AU.addPreserved<IVUsersWrapperPass>();
6508 AU.addRequired<TargetTransformInfoWrapperPass>();
6509 AU.addPreserved<MemorySSAWrapperPass>();
6510}
6511
6512namespace {
6513
6514/// Enables more convenient iteration over a DWARF expression vector.
6516ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6517 llvm::DIExpression::expr_op_iterator Begin =
6518 llvm::DIExpression::expr_op_iterator(Expr.begin());
6519 llvm::DIExpression::expr_op_iterator End =
6520 llvm::DIExpression::expr_op_iterator(Expr.end());
6521 return {Begin, End};
6522}
6523
6524struct SCEVDbgValueBuilder {
6525 SCEVDbgValueBuilder() = default;
6526 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6527
6528 void clone(const SCEVDbgValueBuilder &Base) {
6529 LocationOps = Base.LocationOps;
6530 Expr = Base.Expr;
6531 }
6532
6533 void clear() {
6534 LocationOps.clear();
6535 Expr.clear();
6536 }
6537
6538 /// The DIExpression as we translate the SCEV.
6540 /// The location ops of the DIExpression.
6541 SmallVector<Value *, 2> LocationOps;
6542
6543 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6544 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6545
6546 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6547 /// in the set of values referenced by the expression.
6548 void pushLocation(llvm::Value *V) {
6550 auto *It = llvm::find(LocationOps, V);
6551 unsigned ArgIndex = 0;
6552 if (It != LocationOps.end()) {
6553 ArgIndex = std::distance(LocationOps.begin(), It);
6554 } else {
6555 ArgIndex = LocationOps.size();
6556 LocationOps.push_back(V);
6557 }
6558 Expr.push_back(ArgIndex);
6559 }
6560
6561 void pushValue(const SCEVUnknown *U) {
6562 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6563 pushLocation(V);
6564 }
6565
6566 bool pushConst(const SCEVConstant *C) {
6567 if (C->getAPInt().getSignificantBits() > 64)
6568 return false;
6569 Expr.push_back(llvm::dwarf::DW_OP_consts);
6570 Expr.push_back(C->getAPInt().getSExtValue());
6571 return true;
6572 }
6573
6574 // Iterating the expression as DWARF ops is convenient when updating
6575 // DWARF_OP_LLVM_args.
6577 return ToDwarfOpIter(Expr);
6578 }
6579
6580 /// Several SCEV types are sequences of the same arithmetic operator applied
6581 /// to constants and values that may be extended or truncated.
6582 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6583 uint64_t DwarfOp) {
6584 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6585 "Expected arithmetic SCEV type");
6586 bool Success = true;
6587 unsigned EmitOperator = 0;
6588 for (const auto &Op : CommExpr->operands()) {
6589 Success &= pushSCEV(Op);
6590
6591 if (EmitOperator >= 1)
6592 pushOperator(DwarfOp);
6593 ++EmitOperator;
6594 }
6595 return Success;
6596 }
6597
6598 // TODO: Identify and omit noop casts.
6599 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6600 const llvm::SCEV *Inner = C->getOperand(0);
6601 const llvm::Type *Type = C->getType();
6602 uint64_t ToWidth = Type->getIntegerBitWidth();
6603 bool Success = pushSCEV(Inner);
6604 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6605 IsSigned ? llvm::dwarf::DW_ATE_signed
6606 : llvm::dwarf::DW_ATE_unsigned};
6607 for (const auto &Op : CastOps)
6608 pushOperator(Op);
6609 return Success;
6610 }
6611
6612 // TODO: MinMax - although these haven't been encountered in the test suite.
6613 bool pushSCEV(const llvm::SCEV *S) {
6614 bool Success = true;
6615 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6616 Success &= pushConst(StartInt);
6617
6618 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6619 if (!U->getValue())
6620 return false;
6621 pushLocation(U->getValue());
6622
6623 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6624 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6625
6626 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6627 Success &= pushSCEV(UDiv->getLHS());
6628 Success &= pushSCEV(UDiv->getRHS());
6629 pushOperator(llvm::dwarf::DW_OP_div);
6630
6631 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6632 // Assert if a new and unknown SCEVCastExpr type is encountered.
6635 isa<SCEVSignExtendExpr>(Cast)) &&
6636 "Unexpected cast type in SCEV.");
6637 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6638
6639 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6640 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6641
6642 } else if (isa<SCEVAddRecExpr>(S)) {
6643 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6644 // unsupported.
6645 return false;
6646
6647 } else {
6648 return false;
6649 }
6650 return Success;
6651 }
6652
6653 /// Return true if the combination of arithmetic operator and underlying
6654 /// SCEV constant value is an identity function.
6655 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6656 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6657 if (C->getAPInt().getSignificantBits() > 64)
6658 return false;
6659 int64_t I = C->getAPInt().getSExtValue();
6660 switch (Op) {
6661 case llvm::dwarf::DW_OP_plus:
6662 case llvm::dwarf::DW_OP_minus:
6663 return I == 0;
6664 case llvm::dwarf::DW_OP_mul:
6665 case llvm::dwarf::DW_OP_div:
6666 return I == 1;
6667 }
6668 }
6669 return false;
6670 }
6671
6672 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6673 /// builder's expression stack. The stack should already contain an
6674 /// expression for the iteration count, so that it can be multiplied by
6675 /// the stride and added to the start.
6676 /// Components of the expression are omitted if they are an identity function.
6677 /// Chain (non-affine) SCEVs are not supported.
6678 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6679 assert(SAR.isAffine() && "Expected affine SCEV");
6680 const SCEV *Start = SAR.getStart();
6681 const SCEV *Stride = SAR.getStepRecurrence(SE);
6682
6683 // Skip pushing arithmetic noops.
6684 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6685 if (!pushSCEV(Stride))
6686 return false;
6687 pushOperator(llvm::dwarf::DW_OP_mul);
6688 }
6689 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6690 if (!pushSCEV(Start))
6691 return false;
6692 pushOperator(llvm::dwarf::DW_OP_plus);
6693 }
6694 return true;
6695 }
6696
6697 /// Create an expression that is an offset from a value (usually the IV).
6698 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6699 pushLocation(OffsetValue);
6701 LLVM_DEBUG(
6702 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6703 << std::to_string(Offset) << "\n");
6704 }
6705
6706 /// Combine a translation of the SCEV and the IV to create an expression that
6707 /// recovers a location's value.
6708 /// returns true if an expression was created.
6709 bool createIterCountExpr(const SCEV *S,
6710 const SCEVDbgValueBuilder &IterationCount,
6711 ScalarEvolution &SE) {
6712 // SCEVs for SSA values are most frequently of the form
6713 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6714 // This is because %a is a PHI node that is not the IV. However, these
6715 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6716 // so it's not expected this point will be reached.
6717 if (!isa<SCEVAddRecExpr>(S))
6718 return false;
6719
6720 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6721 << '\n');
6722
6723 const auto *Rec = cast<SCEVAddRecExpr>(S);
6724 if (!Rec->isAffine())
6725 return false;
6726
6728 return false;
6729
6730 // Initialise a new builder with the iteration count expression. In
6731 // combination with the value's SCEV this enables recovery.
6732 clone(IterationCount);
6733 if (!SCEVToValueExpr(*Rec, SE))
6734 return false;
6735
6736 return true;
6737 }
6738
6739 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6740 /// builder's expression stack. The stack should already contain an
6741 /// expression for the iteration count, so that it can be multiplied by
6742 /// the stride and added to the start.
6743 /// Components of the expression are omitted if they are an identity function.
6744 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6745 ScalarEvolution &SE) {
6746 assert(SAR.isAffine() && "Expected affine SCEV");
6747 const SCEV *Start = SAR.getStart();
6748 const SCEV *Stride = SAR.getStepRecurrence(SE);
6749
6750 // Skip pushing arithmetic noops.
6751 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6752 if (!pushSCEV(Start))
6753 return false;
6754 pushOperator(llvm::dwarf::DW_OP_minus);
6755 }
6756 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6757 if (!pushSCEV(Stride))
6758 return false;
6759 pushOperator(llvm::dwarf::DW_OP_div);
6760 }
6761 return true;
6762 }
6763
6764 // Append the current expression and locations to a location list and an
6765 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6766 // the locations already present in the destination list.
6767 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6768 SmallVectorImpl<Value *> &DestLocations) {
6769 assert(!DestLocations.empty() &&
6770 "Expected the locations vector to contain the IV");
6771 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6772 // modified to account for the locations already in the destination vector.
6773 // All builders contain the IV as the first location op.
6774 assert(!LocationOps.empty() &&
6775 "Expected the location ops to contain the IV.");
6776 // DestIndexMap[n] contains the index in DestLocations for the nth
6777 // location in this SCEVDbgValueBuilder.
6778 SmallVector<uint64_t, 2> DestIndexMap;
6779 for (const auto &Op : LocationOps) {
6780 auto It = find(DestLocations, Op);
6781 if (It != DestLocations.end()) {
6782 // Location already exists in DestLocations, reuse existing ArgIndex.
6783 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6784 continue;
6785 }
6786 // Location is not in DestLocations, add it.
6787 DestIndexMap.push_back(DestLocations.size());
6788 DestLocations.push_back(Op);
6789 }
6790
6791 for (const auto &Op : expr_ops()) {
6792 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6793 Op.appendToVector(DestExpr);
6794 continue;
6795 }
6796
6798 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6799 // DestIndexMap[n] contains its new index in DestLocations.
6800 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6801 DestExpr.push_back(NewIndex);
6802 }
6803 }
6804};
6805
6806/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6807/// and DIExpression.
6808struct DVIRecoveryRec {
6809 DVIRecoveryRec(DbgVariableRecord *DVR)
6810 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6811
6812 DbgVariableRecord *DbgRef;
6813 DIExpression *Expr;
6814 bool HadLocationArgList;
6815 SmallVector<WeakVH, 2> LocationOps;
6818
6819 void clear() {
6820 for (auto &RE : RecoveryExprs)
6821 RE.reset();
6822 RecoveryExprs.clear();
6823 }
6824
6825 ~DVIRecoveryRec() { clear(); }
6826};
6827} // namespace
6828
6829/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6830/// This helps in determining if a DIArglist is necessary or can be omitted from
6831/// the dbg.value.
6833 auto expr_ops = ToDwarfOpIter(Expr);
6834 unsigned Count = 0;
6835 for (auto Op : expr_ops)
6836 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6837 Count++;
6838 return Count;
6839}
6840
6841/// Overwrites DVI with the location and Ops as the DIExpression. This will
6842/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6843/// because a DIArglist is not created for the first argument of the dbg.value.
6844template <typename T>
6845static void updateDVIWithLocation(T &DbgVal, Value *Location,
6847 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6848 "contain any DW_OP_llvm_arg operands.");
// Point the record at the single location, wrapped directly as metadata.
6849 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
// Install a DIExpression built from Ops.
6850 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
// NOTE(review): this statement repeats the previous one verbatim with the same
// arguments; the second call looks redundant -- confirm and consider removing.
6851 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6852}
6853
6854/// Overwrite DVI with locations placed into a DIArglist.
6855template <typename T>
6856static void updateDVIWithLocations(T &DbgVal,
6857 SmallVectorImpl<Value *> &Locations,
6859 assert(numLLVMArgOps(Ops) != 0 &&
6860 "Expected expression that references DIArglist locations using "
6861 "DW_OP_llvm_arg operands.");
6863 for (Value *V : Locations)
6864 MetadataLocs.push_back(ValueAsMetadata::get(V));
6865 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6866 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6867 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6868}
6869
6870/// Write the new expression and new location ops for the dbg.value. If possible
6871/// reduce the size of the dbg.value by omitting DIArglist. This
6872/// can be omitted if:
6873/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6874/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6875static void UpdateDbgValue(DVIRecoveryRec &DVIRec,
6876 SmallVectorImpl<Value *> &NewLocationOps,
6878 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6879 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6880 if (NumLLVMArgs == 0) {
6881 // Location assumed to be on the stack.
6882 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6883 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6884 // There is only a single DW_OP_llvm_arg at the start of the expression,
6885 // so it can be omitted along with DIArglist.
6886 assert(NewExpr[1] == 0 &&
6887 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6889 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6890 } else {
6891 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6892 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6893 }
6894
6895 // If the DIExpression was previously empty then add the stack terminator.
6896 // Non-empty expressions have only had elements inserted into them and so
6897 // the terminator should already be present e.g. stack_value or fragment.
6898 DIExpression *SalvageExpr = DbgVal->getExpression();
6899 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6900 SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6901 DbgVal->setExpression(SalvageExpr);
6902 }
6903}
6904
6905/// Cached location ops may be erased during LSR, in which case a poison is
6906/// required when restoring from the cache. The type of that location is no
6907/// longer available, so just use int8. The poison will be replaced by one or
6908/// more locations later when a SCEVDbgValueBuilder selects alternative
6909/// locations to use for the salvage.
6911 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6912}
6913
6914/// Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
6915static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6916 DbgVariableRecord *DbgVal = DVIRec.DbgRef;
6917 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6918 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6919 assert(DVIRec.Expr && "Expected an expression");
6920 DbgVal->setExpression(DVIRec.Expr);
6921
6922 // Even a single location-op may be inside a DIArgList and referenced with
6923 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6924 if (!DVIRec.HadLocationArgList) {
6925 assert(DVIRec.LocationOps.size() == 1 &&
6926 "Unexpected number of location ops.");
6927 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6928 // this case was not present before, so force the location back to a
6929 // single uncontained Value.
6930 Value *CachedValue =
6931 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6932 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6933 } else {
6935 for (WeakVH VH : DVIRec.LocationOps) {
6936 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6937 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6938 }
6939 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6940 DbgVal->setRawLocation(
6941 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6942 }
6943 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6944}
6945
// Attempts to rewrite one cached debug record in terms of LSRInductionVar.
// Returns true once the record has been updated via UpdateDbgValue, and false
// when the record is not a kill location or the salvage is abandoned.
// NOTE(review): listing line 6946 (the start of this signature) is missing
// from this extract; the declaration index at the end of this page gives it as
// `static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, ...`.
6947 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6948 const SCEV *SCEVInductionVar,
6949 SCEVDbgValueBuilder IterCountExpr) {
6950
// Records that are not kill locations are left untouched.
6951 if (!DVIRec.DbgRef->isKillLocation())
6952 return false;
6953
6954 // LSR may have caused several changes to the dbg.value in the failed salvage
6955 // attempt. So restore the DIExpression, the location ops and also the
6956 // location ops format, which is always DIArglist for multiple ops, but only
6957 // sometimes for a single op.
// NOTE(review): listing line 6958 is missing from this extract; presumably it
// holds the call that performs the restore described above.
6959
6960 // LocationOpIndexMap[i] will store the post-LSR location index of
6961 // the non-optimised out location at pre-LSR index i.
6962 SmallVector<int64_t, 2> LocationOpIndexMap;
6963 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6964 SmallVector<Value *, 2> NewLocationOps;
// The induction variable always occupies index 0 of the new location list.
6965 NewLocationOps.push_back(LSRInductionVar);
6966
6967 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6968 WeakVH VH = DVIRec.LocationOps[i];
6969 // Place the locations not optimised out in the list first, avoiding
6970 // inserts later. The map is used to update the DIExpression's
6971 // DW_OP_LLVM_arg arguments as the expression is updated.
6972 if (VH && !isa<UndefValue>(VH)) {
6973 NewLocationOps.push_back(VH);
6974 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6975 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6976 << " now at index " << LocationOpIndexMap[i] << "\n");
6977 continue;
6978 }
6979
6980 // It's possible that a value referred to in the SCEV may have been
6981 // optimised out by LSR.
6982 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6983 SE.containsUndefs(DVIRec.SCEVs[i])) {
6984 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6985 << " refers to a location that is now undef or erased. "
6986 "Salvage abandoned.\n");
6987 return false;
6988 }
6989
6990 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6991 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6992
6993 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6994 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6995
6996 // Create an offset-based salvage expression if possible, as it requires
6997 // fewer DWARF ops than an iteration count-based expression.
6998 if (std::optional<APInt> Offset =
6999 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
// The offset is passed on as a sign-extended 64-bit value, so give up on
// anything that needs more than 64 significant bits.
7000 if (Offset->getSignificantBits() <= 64)
7001 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
7002 else
7003 return false;
7004 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
7005 SE))
7006 return false;
7007 }
7008
7009 // Merge the DbgValueBuilder generated expressions and the original
7010 // DIExpression, place the result into a new vector.
// NOTE(review): listing line 7011 is missing from this extract; NewExpr is
// presumably declared on it.
7012 if (DVIRec.Expr->getNumElements() == 0) {
7013 assert(DVIRec.RecoveryExprs.size() == 1 &&
7014 "Expected only a single recovery expression for an empty "
7015 "DIExpression.");
7016 assert(DVIRec.RecoveryExprs[0] &&
7017 "Expected a SCEVDbgSalvageBuilder for location 0");
7018 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
7019 B->appendToVectors(NewExpr, NewLocationOps);
7020 }
7021 for (const auto &Op : DVIRec.Expr->expr_ops()) {
7022 // Most Ops needn't be updated.
7023 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
7024 Op.appendToVector(NewExpr);
7025 continue;
7026 }
7027
7028 uint64_t LocationArgIndex = Op.getArg(0);
7029 SCEVDbgValueBuilder *DbgBuilder =
7030 DVIRec.RecoveryExprs[LocationArgIndex].get();
7031 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
7032 // optimise it away. So just translate the argument to the updated
7033 // location index.
7034 if (!DbgBuilder) {
7035 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
7036 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
7037 "Expected a positive index for the location-op position.");
7038 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
7039 continue;
7040 }
7041 // The location has a recovery expression.
7042 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
7043 }
7044
7045 UpdateDbgValue(DVIRec, NewLocationOps, NewExpr);
7046 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n");
7047 return true;
7048}
7049
7050/// Obtain an expression for the iteration count, then attempt to salvage the
7051/// dbg.value intrinsics.
// NOTE(review): listing line 7052 (the start of the signature) is missing from
// this extract; the declaration index at the end of this page gives it as
// `static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, ...`.
7053 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
7054 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
7055 if (DVIToUpdate.empty())
7056 return;
7057
7058 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
7059 assert(SCEVInductionVar &&
7060 "Anticipated a SCEV for the post-LSR induction variable");
7061
// Nothing is salvaged unless the induction variable's SCEV is an affine
// add-recurrence.
7062 if (const SCEVAddRecExpr *IVAddRec =
7063 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
7064 if (!IVAddRec->isAffine())
7065 return;
7066
7067 // Prevent translation using excessive resources.
7068 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
7069 return;
7070
7071 // The iteration count is required to recover location values.
7072 SCEVDbgValueBuilder IterCountExpr;
7073 IterCountExpr.pushLocation(LSRInductionVar);
7074 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
7075 return;
7076
7077 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
7078 << '\n');
7079
// Each record is attempted independently; SalvageDVI's result is ignored, so
// a failure on one record does not stop the others.
7080 for (auto &DVIRec : DVIToUpdate) {
7081 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
7082 IterCountExpr);
7083 }
7084 }
7085}
7086
7087/// Identify and cache salvageable DVI locations and expressions along with the
7088/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
7089/// caching and salvaging.
// NOTE(review): listing line 7090 (the start of the signature) is missing from
// this extract; the declaration index at the end of this page gives it as
// `static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, ...`.
7091 Loop *L, ScalarEvolution &SE,
7092 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs) {
7093 for (const auto &B : L->getBlocks()) {
7094 for (auto &I : *B) {
7095 for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) {
// Only value and assign records are candidates.
7096 if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign())
7097 continue;
7098
7099 // Ensure that if any location op is undef that the dbg.value is not
7100 // cached.
7101 if (DbgVal.isKillLocation())
7102 continue;
7103
7104 // Check that the location op SCEVs are suitable for translation to
7105 // DIExpression.
7106 const auto &HasTranslatableLocationOps =
7107 [&](const DbgVariableRecord &DbgValToTranslate) -> bool {
7108 for (const auto LocOp : DbgValToTranslate.location_ops()) {
7109 if (!LocOp)
7110 return false;
7111
7112 if (!SE.isSCEVable(LocOp->getType()))
7113 return false;
7114
7115 const SCEV *S = SE.getSCEV(LocOp);
7116 if (SE.containsUndefs(S))
7117 return false;
7118 }
7119 return true;
7120 };
7121
7122 if (!HasTranslatableLocationOps(DbgVal))
7123 continue;
7124
7125 std::unique_ptr<DVIRecoveryRec> NewRec =
7126 std::make_unique<DVIRecoveryRec>(&DbgVal);
7127 // Each location Op may need a SCEVDbgValueBuilder in order to recover
7128 // it. Pre-allocating a vector will enable quick lookups of the builder
7129 // later during the salvage.
7130 NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps());
// SCEVs and LocationOps are filled in lock-step, so index i in one
// corresponds to index i in the other.
7131 for (const auto LocOp : DbgVal.location_ops()) {
7132 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7133 NewRec->LocationOps.push_back(LocOp);
// NOTE(review): this assignment does not depend on LocOp; it is simply
// repeated on every iteration.
7134 NewRec->HadLocationArgList = DbgVal.hasArgList();
7135 }
7136 SalvageableDVISCEVs.push_back(std::move(NewRec));
7137 }
7138 }
7139 }
7140}
7141
7142/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7143/// any PHI from the loop header is usable, but may have less chance of
7144/// surviving subsequent transforms.
/// Returns nullptr when neither source yields a suitable PHI.
// NOTE(review): listing line 7145 (the start of the signature) is missing from
// this extract; the declaration index at the end of this page gives it as
// `static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE, ...`.
7146 const LSRInstance &LSR) {
7147
// A PHI is suitable when its type is SCEVable and its SCEV is an affine
// add-recurrence that contains no undefs.
7148 auto IsSuitableIV = [&](PHINode *P) {
7149 if (!SE.isSCEVable(P->getType()))
7150 return false;
7151 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7152 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7153 return false;
7154 };
7155
7156 // For now, just pick the first IV that was generated and inserted by
7157 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7158 // by subsequent transforms.
7159 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
// Skip handles that are now null.
7160 if (!IV)
7161 continue;
7162
7163 // There should only be PHI node IVs.
7164 PHINode *P = cast<PHINode>(&*IV);
7165
7166 if (IsSuitableIV(P))
7167 return P;
7168 }
7169
// Fallback: the first suitable PHI in the loop header.
7170 for (PHINode &P : L.getHeader()->phis()) {
7171 if (IsSuitableIV(&P))
7172 return &P;
7173 }
7174 return nullptr;
7175}
7176
// Driver for the transformation on one loop: caches salvageable debug records,
// runs LSRInstance, cleans up dead PHIs / congruent IVs / loop exit values,
// then attempts to salvage the cached debug records. Returns whether anything
// was changed.
// NOTE(review): listing lines 7177 and 7180 (parts of the signature), 7201 and
// 7221 (presumably the DeadInsts declarations), and 7210 and 7228 (the starts
// of the calls whose argument lists continue on 7211 and 7229) are missing
// from this extract. The declaration index at the end of this page gives the
// signature as `static bool ReduceLoopStrength(Loop *L, IVUsers &IU,
// ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const
// TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI,
// MemorySSA *MSSA, bool PreserveLCSSA)`.
7178 DominatorTree &DT, LoopInfo &LI,
7179 const TargetTransformInfo &TTI,
7181 MemorySSA *MSSA, bool PreserveLCSSA) {
7182
7183 // Debug preservation - before we start removing anything identify which DVI
7184 // meet the salvageable criteria and store their DIExpression and SCEVs.
7185 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7186 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords);
7187
7188 bool Changed = false;
// A MemorySSA updater is only created when MemorySSA was supplied; otherwise
// MSSAU.get() below passes a null pointer.
7189 std::unique_ptr<MemorySSAUpdater> MSSAU;
7190 if (MSSA)
7191 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7192
7193 // Run the main LSR transformation.
7194 const LSRInstance &Reducer =
7195 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get(), PreserveLCSSA);
7196 Changed |= Reducer.getChanged();
7197
7198 // Remove any extra phis created by processing inner loops.
7199 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7200 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7202 SCEVExpander Rewriter(SE, "lsr", false);
7203#if LLVM_ENABLE_ABI_BREAKING_CHECKS
7204 Rewriter.setDebugType(DEBUG_TYPE);
7205#endif
7206 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7207 Rewriter.clear();
7208 if (numFolded) {
7209 Changed = true;
7211 MSSAU.get());
7212 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7213 }
7214 }
7215 // LSR may at times remove all uses of an induction variable from a loop.
7216 // The only remaining use is the PHI in the exit block.
7217 // When this is the case, if the exit value of the IV can be calculated using
7218 // SCEV, we can replace the exit block PHI with the final value of the IV and
7219 // skip the updates in each loop iteration.
7220 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7222 SCEVExpander Rewriter(SE, "lsr", true);
7223 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7224 UnusedIndVarInLoop, DeadInsts);
7225 Rewriter.clear();
7226 if (Rewrites) {
7227 Changed = true;
7229 MSSAU.get());
7230 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7231 }
7232 }
7233
7234 if (SalvageableDVIRecords.empty())
7235 return Changed;
7236
7237 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7238 // expressions composed using the derived iteration count.
7239 // TODO: Allow for multiple IV references for nested AddRecSCEVs
// NOTE(review): the loop variable below shadows the parameter L, and the walk
// covers every loop yielded by iterating LI rather than only the loop being
// reduced - confirm this is intended.
7240 for (const auto &L : LI) {
7241 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7242 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7243 else {
7244 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7245 "could not be identified.\n");
7246 }
7247 }
7248
7249 for (auto &Rec : SalvageableDVIRecords)
7250 Rec->clear();
7251 SalvageableDVIRecords.clear();
7252 return Changed;
7253}
7254
// Entry point taking an LPPassManager (unused): gathers the analyses for the
// loop's parent function and forwards to ReduceLoopStrength with
// PreserveLCSSA=false. Returns false without doing anything when skipLoop(L)
// is true; otherwise returns ReduceLoopStrength's result.
7255bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7256 if (skipLoop(L))
7257 return false;
7258
7259 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7260 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7261 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7262 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7263 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7264 *L->getHeader()->getParent());
7265 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7266 *L->getHeader()->getParent());
7267 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7268 *L->getHeader()->getParent());
// MemorySSA is optional: it is used only if the analysis is already available.
7269 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7270 MemorySSA *MSSA = nullptr;
7271 if (MSSAAnalysis)
7272 MSSA = &MSSAAnalysis->getMSSA();
7273 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA,
7274 /*PreserveLCSSA=*/false);
7275}
7276
// Runs ReduceLoopStrength with the analyses carried in AR and
// PreserveLCSSA=true. Reports all analyses preserved when nothing changed;
// otherwise reports the standard loop-pass set, plus MemorySSA when AR.MSSA is
// non-null.
// NOTE(review): listing lines 7277-7278 (the start of the signature) are
// missing from this extract and the declaration is not listed in the index at
// the end of this page, so the full parameter list cannot be confirmed here.
7279 LPMUpdater &) {
7280 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7281 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA,
7282 /*PreserveLCSSA=*/true))
7283 return PreservedAnalyses::all();
7284
7285 auto PA = getLoopPassPreservedAnalyses();
7286 if (AR.MSSA)
7287 PA.preserve<MemorySSAAnalysis>();
7288 return PA;
7289}
7290
// Definition of the pass's static ID member.
7291char LoopStrengthReduce::ID = 0;
7292
// Pass registration with argument string "loop-reduce" and name
// "Loop Strength Reduction".
// NOTE(review): listing lines 7295-7299 are missing from this extract;
// presumably further INITIALIZE_PASS_DEPENDENCY entries.
7293INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7294 "Loop Strength Reduction", false, false)
7300INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7301INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7302 "Loop Strength Reduction", false, false)
7303
// Factory returning a newly allocated LoopStrengthReduce pass; no owner is
// visible here, so presumably the caller takes ownership - TODO confirm.
7304Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file implements a class to represent arbitrary precision integral constant values and operations...
Function Alias Analysis false
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:663
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isCanonical(const MDString *S)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
early cse Early CSE w MemorySSA
#define DEBUG_TYPE
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode"), clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")))
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< SCEVUse > &Good, SmallVectorImpl< SCEVUse > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA, bool PreserveLCSSA)
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static Instruction * getFixupInsertPos(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, DominatorTree &DT)
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static Immediate ExtractImmediate(SCEVUse &S, ScalarEvolution &SE, bool PreferScalable=false)
If S involves the addition of a constant integer value, return that integer value,...
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void UpdateDbgValue(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static Immediate ExtractImmediateOperand(MutableArrayRef< SCEVUse > Ops, ScalarEvolution &SE, bool PreferScalable)
Extracts an immediate operand from Ops and replaces the operand with zero.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static GlobalValue * ExtractSymbol(SCEVUse &S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth, const TargetTransformInfo &TTI)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
#define T
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static const unsigned UnknownAddressSpace
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition APInt.cpp:1670
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition APInt.h:1554
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1771
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
LLVM_ABI AnalysisUsage & addRequiredID(const void *ID)
Definition Pass.cpp:289
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition BasicBlock.h:530
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
LLVM_ABI bool isLandingPad() const
Return true if this basic block is a landing pad.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static LLVM_ABI Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:852
Value * getCondition() const
static LLVM_ABI bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
DWARF expression.
iterator_range< expr_op_iterator > expr_ops() const
static LLVM_ABI DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
unsigned getNumElements() const
static LLVM_ABI void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
LLVM_ABI bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
LLVM_ABI LLVMContext & getContext()
Record of a variable value-assignment, aka a non instruction representation of the dbg....
LLVM_ABI bool isKillLocation() const
void setRawLocation(Metadata *NewLocation)
Use of this should generally be avoided; instead, replaceVariableLocationOp and addVariableLocationOp...
void setExpression(DIExpression *NewExpr)
DIExpression * getExpression() const
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:301
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:286
NodeT * getBlock() const
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
PointerType * getType() const
Global values are always pointers.
IVStrideUse - Keep track of one use of a strided induction variable.
Definition IVUsers.h:36
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition IVUsers.cpp:365
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition IVUsers.h:55
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition IVUsers.h:49
Analysis pass that exposes the IVUsers for a loop.
Definition IVUsers.h:187
ilist< IVStrideUse >::const_iterator const_iterator
Definition IVUsers.h:143
iterator end()
Definition IVUsers.h:145
iterator begin()
Definition IVUsers.h:144
bool empty() const
Definition IVUsers.h:148
LLVM_ABI void print(raw_ostream &OS) const
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition LoopInfo.h:612
LLVM_ABI PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1554
An analysis that produces MemorySSA for a function.
Definition MemorySSA.h:922
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition MemorySSA.h:702
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition Pass.h:99
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
SCEVUse getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyzed scalars to rewrite expressions in canonical form.
This node represents multiplication of some number of SCEVs.
ArrayRef< SCEVUse > operands() const
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
unsigned short getExpressionSize() const
LLVM_ABI bool isZero() const
Return true if the expression is a constant zero.
static constexpr auto FlagAnyWrap
LLVM_ABI ArrayRef< SCEVUse > operands() const
Return operands of this SCEV expression.
SCEVTypes getSCEVType() const
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
LLVM_ABI uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
LLVM_ABI const SCEV * getConstant(ConstantInt *V)
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getMinusSCEV(SCEVUse LHS, SCEVUse RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getAddRecExpr(SCEVUse Start, SCEVUse Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
LLVM_ABI const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
LLVM_ABI Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
LLVM_ABI const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
LLVM_ABI bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
LLVM_ABI const SCEV * getVScale(Type *Ty)
LLVM_ABI bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
LLVM_ABI const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
LLVM_ABI const SCEV * getUnknown(Value *V)
LLVM_ABI std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and return the result as an APInt if it is a constant, and std::nullopt if it isn'...
LLVM_ABI bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that make up the given SCEV properly dominate the specified basic block.
LLVM_ABI bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
iterator erase(const_iterator CI)
typename SuperClass::const_iterator const_iterator
typename SuperClass::iterator iterator
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition TypeSize.h:41
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
LLVM_ABI bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
LLVM_ABI bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
LLVM_ABI bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
LLVM_ABI bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
LLVM_ABI bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
@ TCC_Free
Expected to fold away in lowering.
LLVM_ABI bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_All
Consider all addressing modes.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
LLVM_ABI bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI int getFPMantissaWidth() const
Return the width of the mantissa of this type.
Definition Type.cpp:237
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Use * op_iterator
Definition User.h:254
op_range operands()
Definition User.h:267
op_iterator op_begin()
Definition User.h:259
void setOperand(unsigned i, Value *Val)
Definition User.h:212
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition User.cpp:25
Value * getOperand(unsigned i) const
Definition User.h:207
op_iterator op_end()
Definition User.h:261
static LLVM_ABI ValueAsMetadata * get(Value *V)
Definition Metadata.cpp:509
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
iterator_range< use_iterator > uses()
Definition Value.h:380
A Value handle that is nullable.
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition DenseSet.h:190
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
match_bind< const SCEVMulExpr > m_scev_Mul(const SCEVMulExpr *&V)
bool match(const SCEV *S, const Pattern &P)
SCEVAffineAddRec_match< Op0_t, Op1_t, match_isa< const Loop > > m_scev_AffineAddRec(const Op0_t &Op0, const Op1_t &Op1)
cst_pred_ty< is_specific_cst > m_scev_SpecificInt(uint64_t V)
Match an SCEV constant with a plain unsigned integer.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition Dwarf.h:149
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition Dwarf.h:145
constexpr double e
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
iterator end() const
Definition BasicBlock.h:89
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
LLVM_ABI iterator begin() const
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
unsigned KindType
For isa, dyn_cast, etc operations on TelemetryInfo.
Definition Telemetry.h:83
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:573
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
InstructionCost Cost
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition Utils.cpp:1690
bool operator!=(uint64_t V1, const APInt &V2)
Definition APInt.h:2142
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
LLVM_ABI char & LoopSimplifyID
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
DomTreeNodeBase< BasicBlock > DomTreeNode
Definition Dominators.h:94
AnalysisManager< Loop, LoopStandardAnalysisResults & > LoopAnalysisManager
The loop analysis manager.
LLVM_ABI bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
LLVM_ABI bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
LLVM_ABI const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Add
Sum of integers.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2012
DWARFExpression::Operation Op
LLVM_ABI Pass * createLoopStrengthReducePass()
LLVM_ABI BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition Local.cpp:550
constexpr unsigned BitWidth
LLVM_ABI bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition LCSSA.cpp:308
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
SmallPtrSet< const Loop *, 2 > PostIncLoopSet
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
@ UnusedIndVarInLoop
Definition LoopUtils.h:600
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
SCEVUseT< const SCEV * > SCEVUse
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Attributes of a target dependent hardware loop.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.