1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs strength reduction on array references inside loops that
14// use the loop induction variable as one or more of their components. It
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/PassManager.h"
100#include "llvm/IR/Type.h"
101#include "llvm/IR/Use.h"
102#include "llvm/IR/User.h"
103#include "llvm/IR/Value.h"
104#include "llvm/IR/ValueHandle.h"
106#include "llvm/Pass.h"
107#include "llvm/Support/Casting.h"
110#include "llvm/Support/Debug.h"
120#include <algorithm>
121#include <cassert>
122#include <cstddef>
123#include <cstdint>
124#include <iterator>
125#include <limits>
126#include <map>
127#include <numeric>
128#include <optional>
129#include <utility>
130
131using namespace llvm;
132
133#define DEBUG_TYPE "loop-reduce"
134
135/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
136/// bail out. This threshold is far beyond the number of users that LSR can
137/// conceivably solve, so it should not affect generated code, but catches the
138/// worst cases before LSR burns too much compile time and stack space.
139static const unsigned MaxIVUsers = 200;
140
141/// Limit the size of expression that SCEV-based salvaging will attempt to
142/// translate into a DIExpression.
143/// Choose a maximum size such that debuginfo is not excessively increased and
144/// the salvaging is not too expensive for the compiler.
145static const unsigned MaxSCEVSalvageExpressionSize = 64;
146
147// Cleanup congruent phis after LSR phi expansion.
149 "enable-lsr-phielim", cl::Hidden, cl::init(true),
150 cl::desc("Enable LSR phi elimination"));
151
152// This flag adds the instruction count to the solution cost comparison.
153static cl::opt<bool> InsnsCost(
154 "lsr-insns-cost", cl::Hidden, cl::init(true),
155 cl::desc("Add instruction count to a LSR cost model"));
156
157// Flag to choose how to narrow a complex LSR solution.
158static cl::opt<bool> LSRExpNarrow(
159 "lsr-exp-narrow", cl::Hidden, cl::init(false),
160 cl::desc("Narrow LSR complex solution using"
161 " expectation of registers number"));
162
163// Flag to narrow search space by filtering non-optimal formulae with
164// the same ScaledReg and Scale.
166 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168 " with the same ScaledReg and Scale"));
169
171 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172 cl::desc("A flag that overrides the target's preferred addressing mode."),
174 "none",
175 "Don't prefer any addressing mode"),
177 "preindexed",
178 "Prefer pre-indexed addressing mode"),
180 "postindexed",
181 "Prefer post-indexed addressing mode")));
182
184 "lsr-complexity-limit", cl::Hidden,
185 cl::init(std::numeric_limits<uint16_t>::max()),
186 cl::desc("LSR search space complexity limit"));
187
189 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190 cl::desc("The limit on recursion depth for LSRs setup cost"));
191
193 "lsr-term-fold", cl::Hidden,
194 cl::desc("Attempt to replace primary IV with other IV."));
195
197 "lsr-drop-solution", cl::Hidden,
198 cl::desc("Attempt to drop solution if it is less profitable"));
199
201 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
202 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
203
205 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
206 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
207
208STATISTIC(NumTermFold,
209 "Number of terminating condition fold recognized and performed");
210
211#ifndef NDEBUG
212// Stress test IV chain generation.
214 "stress-ivchain", cl::Hidden, cl::init(false),
215 cl::desc("Stress test LSR IV chains"));
216#else
217static bool StressIVChain = false;
218#endif
219
220namespace {
221
222struct MemAccessTy {
223 /// Used in situations where the accessed memory type is unknown.
224 static const unsigned UnknownAddressSpace =
225 std::numeric_limits<unsigned>::max();
226
227 Type *MemTy = nullptr;
228 unsigned AddrSpace = UnknownAddressSpace;
229
230 MemAccessTy() = default;
231 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
232
233 bool operator==(MemAccessTy Other) const {
234 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
235 }
236
237 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
238
239 static MemAccessTy getUnknown(LLVMContext &Ctx,
240 unsigned AS = UnknownAddressSpace) {
241 return MemAccessTy(Type::getVoidTy(Ctx), AS);
242 }
243
244 Type *getType() { return MemTy; }
245};
246
247/// This class holds data which is used to order reuse candidates.
248class RegSortData {
249public:
250 /// This represents the set of LSRUse indices which reference
251 /// a particular register.
252 SmallBitVector UsedByIndices;
253
254 void print(raw_ostream &OS) const;
255 void dump() const;
256};
257
258// An offset from an address that is either scalable or fixed. Used for
259// per-target optimizations of addressing modes.
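// For example (illustrative): a plain byte offset of 16 is modeled as
// Immediate::getFixed(16), while an offset of 16 x vscale bytes on a
// scalable-vector target is modeled as Immediate::getScalable(16).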
260class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
261 constexpr Immediate(ScalarTy MinVal, bool Scalable)
262 : FixedOrScalableQuantity(MinVal, Scalable) {}
263
264 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
265 : FixedOrScalableQuantity(V) {}
266
267public:
268 constexpr Immediate() = delete;
269
270 static constexpr Immediate getFixed(ScalarTy MinVal) {
271 return {MinVal, false};
272 }
273 static constexpr Immediate getScalable(ScalarTy MinVal) {
274 return {MinVal, true};
275 }
276 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
277 return {MinVal, Scalable};
278 }
279 static constexpr Immediate getZero() { return {0, false}; }
280 static constexpr Immediate getFixedMin() {
281 return {std::numeric_limits<int64_t>::min(), false};
282 }
283 static constexpr Immediate getFixedMax() {
284 return {std::numeric_limits<int64_t>::max(), false};
285 }
286 static constexpr Immediate getScalableMin() {
287 return {std::numeric_limits<int64_t>::min(), true};
288 }
289 static constexpr Immediate getScalableMax() {
290 return {std::numeric_limits<int64_t>::max(), true};
291 }
292
293 constexpr bool isLessThanZero() const { return Quantity < 0; }
294
295 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
296
297 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
298 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
299 }
300
301 constexpr bool isMin() const {
302 return Quantity == std::numeric_limits<ScalarTy>::min();
303 }
304
305 constexpr bool isMax() const {
306 return Quantity == std::numeric_limits<ScalarTy>::max();
307 }
308
309 // Arithmetic 'operators' that cast to unsigned types first.
310 constexpr Immediate addUnsigned(const Immediate &RHS) const {
311 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
312 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
313 return {Value, Scalable || RHS.isScalable()};
314 }
315
316 constexpr Immediate subUnsigned(const Immediate &RHS) const {
317 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
318 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
319 return {Value, Scalable || RHS.isScalable()};
320 }
321
322 // Scale the quantity by a constant without caring about runtime scalability.
323 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
324 ScalarTy Value = (uint64_t)Quantity * RHS;
325 return {Value, Scalable};
326 }
327
328 // Helpers for generating SCEVs with vscale terms where needed.
329 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
330 const SCEV *S = SE.getConstant(Ty, Quantity);
331 if (Scalable)
332 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
333 return S;
334 }
335
336 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
337 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
338 if (Scalable)
339 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
340 return NegS;
341 }
342
343 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
344 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
345 if (Scalable)
346 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
347 return SU;
348 }
349};
350
351// This is needed for the Compare type of std::map when Immediate is used
352// as a key. We don't need it to be fully correct against any value of vscale,
353// just to make sure that vscale-related terms in the map are considered against
354// each other rather than being mixed up and potentially missing opportunities.
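// For example, under this ordering all fixed offsets compare less than all
// scalable offsets, so {4, 8} and {4*vscale, 8*vscale} form separate runs in
// the map rather than interleaving.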
355struct KeyOrderTargetImmediate {
356 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
357 if (LHS.isScalable() && !RHS.isScalable())
358 return false;
359 if (!LHS.isScalable() && RHS.isScalable())
360 return true;
361 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
362 }
363};
364
365// This would be nicer if we could be generic instead of directly using size_t,
366// but there doesn't seem to be a type trait for is_orderable or
367// is_lessthan_comparable or similar.
368struct KeyOrderSizeTAndImmediate {
369 bool operator()(const std::pair<size_t, Immediate> &LHS,
370 const std::pair<size_t, Immediate> &RHS) const {
371 size_t LSize = LHS.first;
372 size_t RSize = RHS.first;
373 if (LSize != RSize)
374 return LSize < RSize;
375 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
376 }
377};
378} // end anonymous namespace
379
380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
381void RegSortData::print(raw_ostream &OS) const {
382 OS << "[NumUses=" << UsedByIndices.count() << ']';
383}
384
385LLVM_DUMP_METHOD void RegSortData::dump() const {
386 print(errs()); errs() << '\n';
387}
388#endif
389
390namespace {
391
392/// Map register candidates to information about how they are used.
393class RegUseTracker {
394 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
395
396 RegUsesTy RegUsesMap;
397 SmallVector<const SCEV *, 16> RegSequence;
398
399public:
400 void countRegister(const SCEV *Reg, size_t LUIdx);
401 void dropRegister(const SCEV *Reg, size_t LUIdx);
402 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
403
404 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
405
406 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
407
408 void clear();
409
410 using iterator = SmallVectorImpl<const SCEV *>::iterator;
411 using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
412
413 iterator begin() { return RegSequence.begin(); }
414 iterator end() { return RegSequence.end(); }
415 const_iterator begin() const { return RegSequence.begin(); }
416 const_iterator end() const { return RegSequence.end(); }
417};
418
419} // end anonymous namespace
420
421void
422RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
423 std::pair<RegUsesTy::iterator, bool> Pair =
424 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
425 RegSortData &RSD = Pair.first->second;
426 if (Pair.second)
427 RegSequence.push_back(Reg);
428 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
429 RSD.UsedByIndices.set(LUIdx);
430}
431
432void
433RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
434 RegUsesTy::iterator It = RegUsesMap.find(Reg);
435 assert(It != RegUsesMap.end());
436 RegSortData &RSD = It->second;
437 assert(RSD.UsedByIndices.size() > LUIdx);
438 RSD.UsedByIndices.reset(LUIdx);
439}
440
441void
442RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
443 assert(LUIdx <= LastLUIdx);
444
445 // Update RegUses. The data structure is not optimized for this purpose;
446 // we must iterate through it and update each of the bit vectors.
447 for (auto &Pair : RegUsesMap) {
448 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
449 if (LUIdx < UsedByIndices.size())
450 UsedByIndices[LUIdx] =
451 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
452 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
453 }
454}
455
456bool
457RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
458 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
459 if (I == RegUsesMap.end())
460 return false;
461 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
462 int i = UsedByIndices.find_first();
463 if (i == -1) return false;
464 if ((size_t)i != LUIdx) return true;
465 return UsedByIndices.find_next(i) != -1;
466}
467
468const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
469 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
470 assert(I != RegUsesMap.end() && "Unknown register!");
471 return I->second.UsedByIndices;
472}
473
474void RegUseTracker::clear() {
475 RegUsesMap.clear();
476 RegSequence.clear();
477}
478
479namespace {
480
481/// This class holds information that describes a formula for computing a value
482/// satisfying a use. It may include broken-out immediates and scaled registers.
483struct Formula {
484 /// Global base address used for complex addressing.
485 GlobalValue *BaseGV = nullptr;
486
487 /// Base offset for complex addressing.
488 Immediate BaseOffset = Immediate::getZero();
489
490 /// Whether any complex addressing has a base register.
491 bool HasBaseReg = false;
492
493 /// The scale of any complex addressing.
494 int64_t Scale = 0;
495
496 /// The list of "base" registers for this use. When this is non-empty, the
497 /// canonical representation of a formula is:
498 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
499 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
500 /// 3. The reg containing the recurrent expr related to the current loop in
501 /// the formula should be put in the ScaledReg.
502 /// #1 enforces that the scaled register is always used when at least two
503 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
504 /// #2 enforces that 1 * reg is reg.
505 /// #3 ensures invariant regs with respect to current loop can be combined
506 /// together in LSR codegen.
507 /// This invariant can be temporarily broken while building a formula.
508 /// However, every formula inserted into the LSRInstance must be in canonical
509 /// form.
510 SmallVector<const SCEV *, 4> BaseRegs;
511
512 /// The 'scaled' register for this use. This should be non-null when Scale is
513 /// not zero.
514 const SCEV *ScaledReg = nullptr;
515
516 /// An additional constant offset which added near the use. This requires a
517 /// temporary register, but the offset itself can live in an add immediate
518 /// field rather than a register.
519 Immediate UnfoldedOffset = Immediate::getZero();
520
521 Formula() = default;
522
523 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
524
525 bool isCanonical(const Loop &L) const;
526
527 void canonicalize(const Loop &L);
528
529 bool unscale();
530
531 bool hasZeroEnd() const;
532
533 size_t getNumRegs() const;
534 Type *getType() const;
535
536 void deleteBaseReg(const SCEV *&S);
537
538 bool referencesReg(const SCEV *S) const;
539 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
540 const RegUseTracker &RegUses) const;
541
542 void print(raw_ostream &OS) const;
543 void dump() const;
544};
545
546} // end anonymous namespace
547
548/// Recursion helper for initialMatch.
549static void DoInitialMatch(const SCEV *S, Loop *L,
550 SmallVectorImpl<const SCEV *> &Good,
551 SmallVectorImpl<const SCEV *> &Bad,
552 ScalarEvolution &SE) {
553 // Collect expressions which properly dominate the loop header.
554 if (SE.properlyDominates(S, L->getHeader())) {
555 Good.push_back(S);
556 return;
557 }
558
559 // Look at add operands.
560 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
561 for (const SCEV *S : Add->operands())
562 DoInitialMatch(S, L, Good, Bad, SE);
563 return;
564 }
565
566 // Look at addrec operands.
567 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
568 if (!AR->getStart()->isZero() && AR->isAffine()) {
569 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
570 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
571 AR->getStepRecurrence(SE),
572 // FIXME: AR->getNoWrapFlags()
573 AR->getLoop(), SCEV::FlagAnyWrap),
574 L, Good, Bad, SE);
575 return;
576 }
577
578 // Handle a multiplication by -1 (negation) if it didn't fold.
579 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
580 if (Mul->getOperand(0)->isAllOnesValue()) {
581 SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
582 const SCEV *NewMul = SE.getMulExpr(Ops);
583
584 SmallVector<const SCEV *, 4> MyGood;
585 SmallVector<const SCEV *, 4> MyBad;
586 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
587 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
588 SE.getEffectiveSCEVType(NewMul->getType())));
589 for (const SCEV *S : MyGood)
590 Good.push_back(SE.getMulExpr(NegOne, S));
591 for (const SCEV *S : MyBad)
592 Bad.push_back(SE.getMulExpr(NegOne, S));
593 return;
594 }
595
596 // Ok, we can't do anything interesting. Just stuff the whole thing into a
597 // register and hope for the best.
598 Bad.push_back(S);
599}
600
601/// Incorporate loop-variant parts of S into this Formula, attempting to keep
602/// all loop-invariant and loop-computable values in a single base register.
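// For example (a rough sketch): matching {%base,+,4}<%loop> collects the
// loop-invariant start %base as one base register and the recurrence
// {0,+,4}<%loop> as another; canonicalize() below then moves the recurrent
// part into ScaledReg.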
603void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
604 SmallVector<const SCEV *, 4> Good;
605 SmallVector<const SCEV *, 4> Bad;
606 DoInitialMatch(S, L, Good, Bad, SE);
607 if (!Good.empty()) {
608 const SCEV *Sum = SE.getAddExpr(Good);
609 if (!Sum->isZero())
610 BaseRegs.push_back(Sum);
611 HasBaseReg = true;
612 }
613 if (!Bad.empty()) {
614 const SCEV *Sum = SE.getAddExpr(Bad);
615 if (!Sum->isZero())
616 BaseRegs.push_back(Sum);
617 HasBaseReg = true;
618 }
619 canonicalize(*L);
620}
621
622static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
623 return SCEVExprContains(S, [&L](const SCEV *S) {
624 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
625 });
626}
627
628/// Check whether or not this formula satisfies the canonical
629/// representation.
630/// \see Formula::BaseRegs.
631bool Formula::isCanonical(const Loop &L) const {
632 if (!ScaledReg)
633 return BaseRegs.size() <= 1;
634
635 if (Scale != 1)
636 return true;
637
638 if (Scale == 1 && BaseRegs.empty())
639 return false;
640
641 if (containsAddRecDependentOnLoop(ScaledReg, L))
642 return true;
643
644 // If ScaledReg is not a recurrent expr, or it is one whose loop is not the
645 // current loop, while BaseRegs contains a recurrent expr reg related to the
646 // current loop, we want to swap the reg in BaseRegs with ScaledReg.
647 return none_of(BaseRegs, [&L](const SCEV *S) {
648 return containsAddRecDependentOnLoop(S, L);
649 });
650}
651
652/// Helper method to morph a formula into its canonical representation.
653/// \see Formula::BaseRegs.
654/// Every formula having more than one base register must use the ScaledReg
655/// field. Otherwise, we would have to do special cases everywhere in LSR
656/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
657/// On the other hand, 1*reg should be canonicalized into reg.
658void Formula::canonicalize(const Loop &L) {
659 if (isCanonical(L))
660 return;
661
662 if (BaseRegs.empty()) {
663 // No base reg? Use scale reg with scale = 1 as such.
664 assert(ScaledReg && "Expected 1*reg => reg");
665 assert(Scale == 1 && "Expected 1*reg => reg");
666 BaseRegs.push_back(ScaledReg);
667 Scale = 0;
668 ScaledReg = nullptr;
669 return;
670 }
671
672 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
673 if (!ScaledReg) {
674 ScaledReg = BaseRegs.pop_back_val();
675 Scale = 1;
676 }
677
678 // If ScaledReg is an invariant with respect to L, find the reg from
679 // BaseRegs containing the recurrent expr related with Loop L. Swap the
680 // reg with ScaledReg.
681 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
682 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
683 return containsAddRecDependentOnLoop(S, L);
684 });
685 if (I != BaseRegs.end())
686 std::swap(ScaledReg, *I);
687 }
688 assert(isCanonical(L) && "Failed to canonicalize?");
689}
690
691/// Get rid of the scale in the formula.
692/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
693/// \return true if it was possible to get rid of the scale, false otherwise.
694/// \note After this operation the formula may not be in the canonical form.
695bool Formula::unscale() {
696 if (Scale != 1)
697 return false;
698 Scale = 0;
699 BaseRegs.push_back(ScaledReg);
700 ScaledReg = nullptr;
701 return true;
702}
703
704bool Formula::hasZeroEnd() const {
705 if (UnfoldedOffset || BaseOffset)
706 return false;
707 if (BaseRegs.size() != 1 || ScaledReg)
708 return false;
709 return true;
710}
711
712/// Return the total number of register operands used by this formula. This does
713/// not include register uses implied by non-constant addrec strides.
714size_t Formula::getNumRegs() const {
715 return !!ScaledReg + BaseRegs.size();
716}
717
718/// Return the type of this formula, if it has one, or null otherwise. This type
719/// is meaningless except for the bit size.
720Type *Formula::getType() const {
721 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
722 ScaledReg ? ScaledReg->getType() :
723 BaseGV ? BaseGV->getType() :
724 nullptr;
725}
726
727/// Delete the given base reg from the BaseRegs list.
728void Formula::deleteBaseReg(const SCEV *&S) {
729 if (&S != &BaseRegs.back())
730 std::swap(S, BaseRegs.back());
731 BaseRegs.pop_back();
732}
733
734/// Test if this formula references the given register.
735bool Formula::referencesReg(const SCEV *S) const {
736 return S == ScaledReg || is_contained(BaseRegs, S);
737}
738
739/// Test whether this formula uses registers which are used by uses other than
740/// the use with the given index.
741bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
742 const RegUseTracker &RegUses) const {
743 if (ScaledReg)
744 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
745 return true;
746 for (const SCEV *BaseReg : BaseRegs)
747 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
748 return true;
749 return false;
750}
751
752#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
753void Formula::print(raw_ostream &OS) const {
754 bool First = true;
755 if (BaseGV) {
756 if (!First) OS << " + "; else First = false;
757 BaseGV->printAsOperand(OS, /*PrintType=*/false);
758 }
759 if (BaseOffset.isNonZero()) {
760 if (!First) OS << " + "; else First = false;
761 OS << BaseOffset;
762 }
763 for (const SCEV *BaseReg : BaseRegs) {
764 if (!First) OS << " + "; else First = false;
765 OS << "reg(" << *BaseReg << ')';
766 }
767 if (HasBaseReg && BaseRegs.empty()) {
768 if (!First) OS << " + "; else First = false;
769 OS << "**error: HasBaseReg**";
770 } else if (!HasBaseReg && !BaseRegs.empty()) {
771 if (!First) OS << " + "; else First = false;
772 OS << "**error: !HasBaseReg**";
773 }
774 if (Scale != 0) {
775 if (!First) OS << " + "; else First = false;
776 OS << Scale << "*reg(";
777 if (ScaledReg)
778 OS << *ScaledReg;
779 else
780 OS << "<unknown>";
781 OS << ')';
782 }
783 if (UnfoldedOffset.isNonZero()) {
784 if (!First) OS << " + ";
785 OS << "imm(" << UnfoldedOffset << ')';
786 }
787}
788
789LLVM_DUMP_METHOD void Formula::dump() const {
790 print(errs()); errs() << '\n';
791}
792#endif
793
794/// Return true if the given addrec can be sign-extended without changing its
795/// value.
796static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
797 Type *WideTy =
798 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
799 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
800}
801
802/// Return true if the given add can be sign-extended without changing its
803/// value.
804static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
805 Type *WideTy =
806 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
807 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
808}
809
810/// Return true if the given mul can be sign-extended without changing its
811/// value.
812static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
813 Type *WideTy =
814 IntegerType::get(SE.getContext(),
815 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
816 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
817}
818
819/// Return an expression for LHS /s RHS, if it can be determined and if the
820/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
821/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
822/// the multiplication may overflow, which is useful when the result will be
823/// used in a context where the most significant bits are ignored.
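// For example, assuming the expressions are known not to overflow (or
// IgnoreSignificantBits is set), (4*%x + 8) /s 4 yields %x + 2, whereas
// (4*%x + 6) /s 4 yields null because the remainder is not known to be zero.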
824static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
825 ScalarEvolution &SE,
826 bool IgnoreSignificantBits = false) {
827 // Handle the trivial case, which works for any SCEV type.
828 if (LHS == RHS)
829 return SE.getConstant(LHS->getType(), 1);
830
831 // Handle a few RHS special cases.
832 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
833 if (RC) {
834 const APInt &RA = RC->getAPInt();
835 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
836 // some folding.
837 if (RA.isAllOnes()) {
838 if (LHS->getType()->isPointerTy())
839 return nullptr;
840 return SE.getMulExpr(LHS, RC);
841 }
842 // Handle x /s 1 as x.
843 if (RA == 1)
844 return LHS;
845 }
846
847 // Check for a division of a constant by a constant.
848 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
849 if (!RC)
850 return nullptr;
851 const APInt &LA = C->getAPInt();
852 const APInt &RA = RC->getAPInt();
853 if (LA.srem(RA) != 0)
854 return nullptr;
855 return SE.getConstant(LA.sdiv(RA));
856 }
857
858 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
859 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
860 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
861 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
862 IgnoreSignificantBits);
863 if (!Step) return nullptr;
864 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
865 IgnoreSignificantBits);
866 if (!Start) return nullptr;
867 // FlagNW is independent of the start value, step direction, and is
868 // preserved with smaller magnitude steps.
869 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
870 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
871 }
872 return nullptr;
873 }
874
875 // Distribute the sdiv over add operands, if the add doesn't overflow.
876 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
877 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
878 SmallVector<const SCEV *, 8> Ops;
879 for (const SCEV *S : Add->operands()) {
880 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
881 if (!Op) return nullptr;
882 Ops.push_back(Op);
883 }
884 return SE.getAddExpr(Ops);
885 }
886 return nullptr;
887 }
888
889 // Check for a multiply operand that we can pull RHS out of.
890 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
891 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
892 // Handle special case C1*X*Y /s C2*X*Y.
893 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
894 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
895 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
896 const SCEVConstant *RC =
897 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
898 if (LC && RC) {
899 SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
900 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
901 if (LOps == ROps)
902 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
903 }
904 }
905 }
906
907 SmallVector<const SCEV *, 4> Ops;
908 bool Found = false;
909 for (const SCEV *S : Mul->operands()) {
910 if (!Found)
911 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
912 IgnoreSignificantBits)) {
913 S = Q;
914 Found = true;
915 }
916 Ops.push_back(S);
917 }
918 return Found ? SE.getMulExpr(Ops) : nullptr;
919 }
920 return nullptr;
921 }
922
923 // Otherwise we don't know.
924 return nullptr;
925}
926
927/// If S involves the addition of a constant integer value, return that integer
928/// value, and mutate S to point to a new SCEV with that value excluded.
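// For example, given (4 + %a) this returns a fixed immediate of 4 and mutates
// S to %a; with EnableVScaleImmediates, a (C * vscale) term is recognized as a
// scalable immediate of C.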
929static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
930 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
931 if (C->getAPInt().getSignificantBits() <= 64) {
932 S = SE.getConstant(C->getType(), 0);
933 return Immediate::getFixed(C->getValue()->getSExtValue());
934 }
935 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
936 SmallVector<const SCEV *, 8> NewOps(Add->operands());
937 Immediate Result = ExtractImmediate(NewOps.front(), SE);
938 if (Result.isNonZero())
939 S = SE.getAddExpr(NewOps);
940 return Result;
941 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
942 SmallVector<const SCEV *, 8> NewOps(AR->operands());
943 Immediate Result = ExtractImmediate(NewOps.front(), SE);
944 if (Result.isNonZero())
945 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
946 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
947 SCEV::FlagAnyWrap);
948 return Result;
949 } else if (EnableVScaleImmediates)
950 if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
951 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
952 if (isa<SCEVVScale>(M->getOperand(1))) {
953 S = SE.getConstant(M->getType(), 0);
954 return Immediate::getScalable(C->getValue()->getSExtValue());
955 }
956 return Immediate::getZero();
957}
958
959/// If S involves the addition of a GlobalValue address, return that symbol, and
960/// mutate S to point to a new SCEV with that value excluded.
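// For example, given (4 + @g) this returns @g and mutates S to 4.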
961static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
962 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
963 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
964 S = SE.getConstant(GV->getType(), 0);
965 return GV;
966 }
967 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
968 SmallVector<const SCEV *, 8> NewOps(Add->operands());
969 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
970 if (Result)
971 S = SE.getAddExpr(NewOps);
972 return Result;
973 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
974 SmallVector<const SCEV *, 8> NewOps(AR->operands());
975 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
976 if (Result)
977 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
978 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
979 SCEV::FlagAnyWrap);
980 return Result;
981 }
982 return nullptr;
983}
984
985/// Returns true if the specified instruction is using the specified value as an
986/// address.
987static bool isAddressUse(const TargetTransformInfo &TTI,
988 Instruction *Inst, Value *OperandVal) {
989 bool isAddress = isa<LoadInst>(Inst);
990 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
991 if (SI->getPointerOperand() == OperandVal)
992 isAddress = true;
993 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
994 // Addressing modes can also be folded into prefetches and a variety
995 // of intrinsics.
996 switch (II->getIntrinsicID()) {
997 case Intrinsic::memset:
998 case Intrinsic::prefetch:
999 case Intrinsic::masked_load:
1000 if (II->getArgOperand(0) == OperandVal)
1001 isAddress = true;
1002 break;
1003 case Intrinsic::masked_store:
1004 if (II->getArgOperand(1) == OperandVal)
1005 isAddress = true;
1006 break;
1007 case Intrinsic::memmove:
1008 case Intrinsic::memcpy:
1009 if (II->getArgOperand(0) == OperandVal ||
1010 II->getArgOperand(1) == OperandVal)
1011 isAddress = true;
1012 break;
1013 default: {
1014 MemIntrinsicInfo IntrInfo;
1015 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1016 if (IntrInfo.PtrVal == OperandVal)
1017 isAddress = true;
1018 }
1019 }
1020 }
1021 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1022 if (RMW->getPointerOperand() == OperandVal)
1023 isAddress = true;
1024 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1025 if (CmpX->getPointerOperand() == OperandVal)
1026 isAddress = true;
1027 }
1028 return isAddress;
1029}
1030
1031/// Return the type of the memory being accessed.
1032static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1033 Instruction *Inst, Value *OperandVal) {
1034 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1035
1036 // First get the type of memory being accessed.
1037 if (Type *Ty = Inst->getAccessType())
1038 AccessTy.MemTy = Ty;
1039
1040 // Then get the pointer address space.
1041 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1042 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1043 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1044 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1045 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1046 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1047 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1048 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1049 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1050 switch (II->getIntrinsicID()) {
1051 case Intrinsic::prefetch:
1052 case Intrinsic::memset:
1053 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1054 AccessTy.MemTy = OperandVal->getType();
1055 break;
1056 case Intrinsic::memmove:
1057 case Intrinsic::memcpy:
1058 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1059 AccessTy.MemTy = OperandVal->getType();
1060 break;
1061 case Intrinsic::masked_load:
1062 AccessTy.AddrSpace =
1063 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1064 break;
1065 case Intrinsic::masked_store:
1066 AccessTy.AddrSpace =
1067 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1068 break;
1069 default: {
1070 MemIntrinsicInfo IntrInfo;
1071 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1072 AccessTy.AddrSpace
1073 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1074 }
1075
1076 break;
1077 }
1078 }
1079 }
1080
1081 return AccessTy;
1082}
1083
1084/// Return true if this AddRec is already a phi in its loop.
1085static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1086 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1087 if (SE.isSCEVable(PN.getType()) &&
1088 (SE.getEffectiveSCEVType(PN.getType()) ==
1089 SE.getEffectiveSCEVType(AR->getType())) &&
1090 SE.getSCEV(&PN) == AR)
1091 return true;
1092 }
1093 return false;
1094}
1095
1096/// Check if expanding this expression is likely to incur significant cost. This
1097/// is tricky because SCEV doesn't track which expressions are actually computed
1098/// by the current IR.
1099///
1100/// We currently allow expansion of IV increments that involve adds,
1101/// multiplication by constants, and AddRecs from existing phis.
1102///
1103/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1104/// obvious multiple of the UDivExpr.
1105static bool isHighCostExpansion(const SCEV *S,
1106 SmallPtrSetImpl<const SCEV *> &Processed,
1107 ScalarEvolution &SE) {
1108 // Zero/One operand expressions
1109 switch (S->getSCEVType()) {
1110 case scUnknown:
1111 case scConstant:
1112 case scVScale:
1113 return false;
1114 case scTruncate:
1115 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1116 Processed, SE);
1117 case scZeroExtend:
1118 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1119 Processed, SE);
1120 case scSignExtend:
1121 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1122 Processed, SE);
1123 default:
1124 break;
1125 }
1126
1127 if (!Processed.insert(S).second)
1128 return false;
1129
1130 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1131 for (const SCEV *S : Add->operands()) {
1132 if (isHighCostExpansion(S, Processed, SE))
1133 return true;
1134 }
1135 return false;
1136 }
1137
1138 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1139 if (Mul->getNumOperands() == 2) {
1140 // Multiplication by a constant is ok
1141 if (isa<SCEVConstant>(Mul->getOperand(0)))
1142 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1143
1144 // If we have the value of one operand, check if an existing
1145 // multiplication already generates this expression.
1146 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1147 Value *UVal = U->getValue();
1148 for (User *UR : UVal->users()) {
1149 // If U is a constant, it may be used by a ConstantExpr.
1150 Instruction *UI = dyn_cast<Instruction>(UR);
1151 if (UI && UI->getOpcode() == Instruction::Mul &&
1152 SE.isSCEVable(UI->getType())) {
1153 return SE.getSCEV(UI) == Mul;
1154 }
1155 }
1156 }
1157 }
1158 }
1159
1160 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1161 if (isExistingPhi(AR, SE))
1162 return false;
1163 }
1164
1165 // For now, consider any other type of expression (div/mul/min/max) high cost.
1166 return true;
1167}
1168
1169namespace {
1170
1171class LSRUse;
1172
1173} // end anonymous namespace
1174
1175/// Check if the addressing mode defined by \p F is completely
1176/// folded in \p LU at isel time.
1177/// This includes address-mode folding and special icmp tricks.
1178/// This function returns true if \p LU can accommodate what \p F
1179/// defines and up to 1 base + 1 scaled + offset.
1180/// In other words, if \p F has several base registers, this function may
1181/// still return true. Therefore, users still need to account for
1182/// additional base registers and/or unfolded offsets to derive an
1183/// accurate cost model.
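// For example, on a target with x86-style addressing a formula such as
// reg1 + 4*reg2 + 16 can be matched by a single [base + index*scale + disp]
// address, so it would be considered completely folded here.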
1184static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1185 const LSRUse &LU, const Formula &F);
1186
1187// Get the cost of the scaling factor used in F for LU.
1188static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1189 const LSRUse &LU, const Formula &F,
1190 const Loop &L);
1191
1192namespace {
1193
1194/// This class is used to measure and compare candidate formulae.
1195class Cost {
1196 const Loop *L = nullptr;
1197 ScalarEvolution *SE = nullptr;
1198 const TargetTransformInfo *TTI = nullptr;
1199 TargetTransformInfo::LSRCost C;
1200 TTI::AddressingModeKind AMK = TTI::AMK_None;
1201
1202public:
1203 Cost() = delete;
1204 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1205 TTI::AddressingModeKind AMK) :
1206 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1207 C.Insns = 0;
1208 C.NumRegs = 0;
1209 C.AddRecCost = 0;
1210 C.NumIVMuls = 0;
1211 C.NumBaseAdds = 0;
1212 C.ImmCost = 0;
1213 C.SetupCost = 0;
1214 C.ScaleCost = 0;
1215 }
1216
1217 bool isLess(const Cost &Other) const;
1218
1219 void Lose();
1220
1221#ifndef NDEBUG
1222 // Once any of the metrics loses, they must all remain losers.
1223 bool isValid() {
1224 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1225 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1226 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1227 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1228 }
1229#endif
1230
1231 bool isLoser() {
1232 assert(isValid() && "invalid cost");
1233 return C.NumRegs == ~0u;
1234 }
1235
1236 void RateFormula(const Formula &F,
1237 SmallPtrSetImpl<const SCEV *> &Regs,
1238 const DenseSet<const SCEV *> &VisitedRegs,
1239 const LSRUse &LU,
1240 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1241
1242 void print(raw_ostream &OS) const;
1243 void dump() const;
1244
1245private:
1246 void RateRegister(const Formula &F, const SCEV *Reg,
1247 SmallPtrSetImpl<const SCEV *> &Regs);
1248 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1249 SmallPtrSetImpl<const SCEV *> &Regs,
1250 SmallPtrSetImpl<const SCEV *> *LoserRegs);
1251};
1252
1253/// An operand value in an instruction which is to be replaced with some
1254/// equivalent, possibly strength-reduced, replacement.
1255struct LSRFixup {
1256 /// The instruction which will be updated.
1257 Instruction *UserInst = nullptr;
1258
1259 /// The operand of the instruction which will be replaced. The operand may be
1260 /// used more than once; every instance will be replaced.
1261 Value *OperandValToReplace = nullptr;
1262
1263 /// If this user is to use the post-incremented value of an induction
1264 /// variable, this set is non-empty and holds the loops associated with the
1265 /// induction variable.
1266 PostIncLoopSet PostIncLoops;
1267
1268 /// A constant offset to be added to the LSRUse expression. This allows
1269 /// multiple fixups to share the same LSRUse with different offsets, for
1270 /// example in an unrolled loop.
1271 Immediate Offset = Immediate::getZero();
1272
1273 LSRFixup() = default;
1274
1275 bool isUseFullyOutsideLoop(const Loop *L) const;
1276
1277 void print(raw_ostream &OS) const;
1278 void dump() const;
1279};
1280
1281/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1282/// SmallVectors of const SCEV*.
1283struct UniquifierDenseMapInfo {
1284 static SmallVector<const SCEV *, 4> getEmptyKey() {
1285 SmallVector<const SCEV *, 4> V;
1286 V.push_back(reinterpret_cast<const SCEV *>(-1));
1287 return V;
1288 }
1289
1290 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1291 SmallVector<const SCEV *, 4> V;
1292 V.push_back(reinterpret_cast<const SCEV *>(-2));
1293 return V;
1294 }
1295
1296 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1297 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1298 }
1299
1300 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1301 const SmallVector<const SCEV *, 4> &RHS) {
1302 return LHS == RHS;
1303 }
1304};
1305
1306/// This class holds the state that LSR keeps for each use in IVUsers, as well
1307/// as uses invented by LSR itself. It includes information about what kinds of
1308/// things can be folded into the user, information about the user itself, and
1309/// information about how the use may be satisfied. TODO: Represent multiple
1310/// users of the same expression in common?
1311class LSRUse {
1312 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1313
1314public:
1315 /// An enum for a kind of use, indicating what types of scaled and immediate
1316 /// operands it might support.
1317 enum KindType {
1318 Basic, ///< A normal use, with no folding.
1319 Special, ///< A special case of basic, allowing -1 scales.
1320 Address, ///< An address use; folding according to TargetLowering
1321 ICmpZero ///< An equality icmp with both operands folded into one.
1322 // TODO: Add a generic icmp too?
1323 };
1324
1325 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1326
1327 KindType Kind;
1328 MemAccessTy AccessTy;
1329
1330 /// The list of operands which are to be replaced.
1331 SmallVector<LSRFixup, 8> Fixups;
1332
1333 /// Keep track of the min and max offsets of the fixups.
1334 Immediate MinOffset = Immediate::getFixedMax();
1335 Immediate MaxOffset = Immediate::getFixedMin();
1336
1337 /// This records whether all of the fixups using this LSRUse are outside of
1338 /// the loop, in which case some special-case heuristics may be used.
1339 bool AllFixupsOutsideLoop = true;
1340
1341 /// RigidFormula is set to true to guarantee that this use will be associated
1342 /// with a single formula--the one that initially matched. Some SCEV
1343 /// expressions cannot be expanded. This allows LSR to consider the registers
1344 /// used by those expressions without the need to expand them later after
1345 /// changing the formula.
1346 bool RigidFormula = false;
1347
1348 /// This records the widest use type for any fixup using this
1349 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1350 /// fixup widths to be equivalent, because the narrower one may be relying on
1351 /// the implicit truncation to truncate away bogus bits.
1352 Type *WidestFixupType = nullptr;
1353
1354 /// A list of ways to build a value that can satisfy this user. After the
1355 /// list is populated, one of these is selected heuristically and used to
1356 /// formulate a replacement for OperandValToReplace in UserInst.
1357 SmallVector<Formula, 12> Formulae;
1358
1359 /// The set of register candidates used by all formulae in this LSRUse.
1360 SmallPtrSet<const SCEV *, 4> Regs;
1361
1362 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1363
1364 LSRFixup &getNewFixup() {
1365 Fixups.push_back(LSRFixup());
1366 return Fixups.back();
1367 }
1368
1369 void pushFixup(LSRFixup &f) {
1370 Fixups.push_back(f);
1371 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1372 MaxOffset = f.Offset;
1373 if (Immediate::isKnownLT(f.Offset, MinOffset))
1374 MinOffset = f.Offset;
1375 }
1376
1377 bool HasFormulaWithSameRegs(const Formula &F) const;
1378 float getNotSelectedProbability(const SCEV *Reg) const;
1379 bool InsertFormula(const Formula &F, const Loop &L);
1380 void DeleteFormula(Formula &F);
1381 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1382
1383 void print(raw_ostream &OS) const;
1384 void dump() const;
1385};
1386
1387} // end anonymous namespace
1388
1389static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1390 LSRUse::KindType Kind, MemAccessTy AccessTy,
1391 GlobalValue *BaseGV, Immediate BaseOffset,
1392 bool HasBaseReg, int64_t Scale,
1393 Instruction *Fixup = nullptr);
1394
1395static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1396 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1397 return 1;
1398 if (Depth == 0)
1399 return 0;
1400 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1401 return getSetupCost(S->getStart(), Depth - 1);
1402 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1403 return getSetupCost(S->getOperand(), Depth - 1);
1404 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1405 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1406 [&](unsigned i, const SCEV *Reg) {
1407 return i + getSetupCost(Reg, Depth - 1);
1408 });
1409 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1410 return getSetupCost(S->getLHS(), Depth - 1) +
1411 getSetupCost(S->getRHS(), Depth - 1);
1412 return 0;
1413}
1414
1415/// Tally up interesting quantities from the given register.
1416void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1417 SmallPtrSetImpl<const SCEV *> &Regs) {
1418 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1419 // If this is an addrec for another loop, it should be an invariant
1420 // with respect to L since L is the innermost loop (at least
1421 // for now LSR only handles innermost loops).
1422 if (AR->getLoop() != L) {
1423 // If the AddRec exists, consider its register free and leave it alone.
1424 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1425 return;
1426
1427 // It is bad to allow LSR for the current loop to add induction variables
1428 // for its sibling loops.
1429 if (!AR->getLoop()->contains(L)) {
1430 Lose();
1431 return;
1432 }
1433
1434 // Otherwise, it will be an invariant with respect to Loop L.
1435 ++C.NumRegs;
1436 return;
1437 }
1438
1439 unsigned LoopCost = 1;
1440 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1441 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1442
1443 // If the step size matches the base offset, we could use pre-indexed
1444 // addressing.
1445 if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
1446 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1447 if (Step->getAPInt() == F.BaseOffset.getFixedValue())
1448 LoopCost = 0;
1449 } else if (AMK == TTI::AMK_PostIndexed) {
1450 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1451 if (isa<SCEVConstant>(LoopStep)) {
1452 const SCEV *LoopStart = AR->getStart();
1453 if (!isa<SCEVConstant>(LoopStart) &&
1454 SE->isLoopInvariant(LoopStart, L))
1455 LoopCost = 0;
1456 }
1457 }
1458 }
1459 C.AddRecCost += LoopCost;
1460
1461 // Add the step value register, if it needs one.
1462 // TODO: The non-affine case isn't precisely modeled here.
1463 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1464 if (!Regs.count(AR->getOperand(1))) {
1465 RateRegister(F, AR->getOperand(1), Regs);
1466 if (isLoser())
1467 return;
1468 }
1469 }
1470 }
1471 ++C.NumRegs;
1472
1473 // Rough heuristic; favor registers which don't require extra setup
1474 // instructions in the preheader.
1475 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1476 // Ensure we don't, even with the recursion limit, produce invalid costs.
1477 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1478
1479 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1480 SE->hasComputableLoopEvolution(Reg, L);
1481}
1482
1483/// Record this register in the set. If we haven't seen it before, rate
1484/// it. Optional LoserRegs provides a way to declare any formula that refers to
1485/// one of those regs an instant loser.
1486void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1487 SmallPtrSetImpl<const SCEV *> &Regs,
1488 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1489 if (LoserRegs && LoserRegs->count(Reg)) {
1490 Lose();
1491 return;
1492 }
1493 if (Regs.insert(Reg).second) {
1494 RateRegister(F, Reg, Regs);
1495 if (LoserRegs && isLoser())
1496 LoserRegs->insert(Reg);
1497 }
1498}
1499
1500void Cost::RateFormula(const Formula &F,
1501 SmallPtrSetImpl<const SCEV *> &Regs,
1502 const DenseSet<const SCEV *> &VisitedRegs,
1503 const LSRUse &LU,
1504 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1505 if (isLoser())
1506 return;
1507 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1508 // Tally up the registers.
1509 unsigned PrevAddRecCost = C.AddRecCost;
1510 unsigned PrevNumRegs = C.NumRegs;
1511 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1512 if (const SCEV *ScaledReg = F.ScaledReg) {
1513 if (VisitedRegs.count(ScaledReg)) {
1514 Lose();
1515 return;
1516 }
1517 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1518 if (isLoser())
1519 return;
1520 }
1521 for (const SCEV *BaseReg : F.BaseRegs) {
1522 if (VisitedRegs.count(BaseReg)) {
1523 Lose();
1524 return;
1525 }
1526 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1527 if (isLoser())
1528 return;
1529 }
1530
1531 // Determine how many (unfolded) adds we'll need inside the loop.
1532 size_t NumBaseParts = F.getNumRegs();
1533 if (NumBaseParts > 1)
1534 // Do not count the base and a possible second register if the target
1535 // allows folding 2 registers.
1536 C.NumBaseAdds +=
1537 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1538 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1539
1540 // Accumulate non-free scaling amounts.
1541 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1542
1543 // Tally up the non-zero immediates.
1544 for (const LSRFixup &Fixup : LU.Fixups) {
1545 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1546 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1547 if (F.BaseGV)
1548 C.ImmCost += 64; // Handle symbolic values conservatively.
1549 // TODO: This should probably be the pointer size.
1550 else if (Offset.isNonZero())
1551 C.ImmCost +=
1552 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1553
1554 // Check with target if this offset with this instruction is
1555 // specifically not supported.
1556 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1557 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1558 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1559 C.NumBaseAdds++;
1560 } else {
1561 // Incompatible immediate type; increase cost to avoid using it.
1562 C.ImmCost += 2048;
1563 }
1564 }
1565
1566 // If we don't count instruction cost, exit here.
1567 if (!InsnsCost) {
1568 assert(isValid() && "invalid cost");
1569 return;
1570 }
1571
1572 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1573 // an additional instruction (at least a fill).
1574 // TODO: Need to distinguish register classes?
1575 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1576 TTI->getRegisterClassForType(false, F.getType())) - 1;
1577 if (C.NumRegs > TTIRegNum) {
1578 // The cost already exceeded TTIRegNum, so only newly added registers can
1579 // add new instructions.
1580 if (PrevNumRegs > TTIRegNum)
1581 C.Insns += (C.NumRegs - PrevNumRegs);
1582 else
1583 C.Insns += (C.NumRegs - TTIRegNum);
1584 }
1585
1586 // If an ICmpZero formula does not end in 0, it cannot be replaced by just an
1587 // add or sub. We'll need to compare the final result of the AddRec.
1588 // That means we'll need an additional instruction. But if the target can
1589 // macro-fuse a compare with a branch, don't count this extra instruction.
1590 // For -10 + {0, +, 1}:
1591 // i = i + 1;
1592 // cmp i, 10
1593 //
1594 // For {-10, +, 1}:
1595 // i = i + 1;
1596 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1597 !TTI->canMacroFuseCmp())
1598 C.Insns++;
1599 // Each new AddRec adds 1 instruction to the calculation.
1600 C.Insns += (C.AddRecCost - PrevAddRecCost);
1601
1602 // BaseAdds adds instructions for unfolded registers.
1603 if (LU.Kind != LSRUse::ICmpZero)
1604 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1605 assert(isValid() && "invalid cost");
1606}
1607
1608/// Set this cost to a losing value.
1609void Cost::Lose() {
1610 C.Insns = std::numeric_limits<unsigned>::max();
1611 C.NumRegs = std::numeric_limits<unsigned>::max();
1612 C.AddRecCost = std::numeric_limits<unsigned>::max();
1613 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1614 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1615 C.ImmCost = std::numeric_limits<unsigned>::max();
1616 C.SetupCost = std::numeric_limits<unsigned>::max();
1617 C.ScaleCost = std::numeric_limits<unsigned>::max();
1618}
1619
1620/// Choose the lower cost.
1621bool Cost::isLess(const Cost &Other) const {
1622 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1623 C.Insns != Other.C.Insns)
1624 return C.Insns < Other.C.Insns;
1625 return TTI->isLSRCostLess(C, Other.C);
1626}
1627
1628#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1629void Cost::print(raw_ostream &OS) const {
1630 if (InsnsCost)
1631 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1632 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1633 if (C.AddRecCost != 0)
1634 OS << ", with addrec cost " << C.AddRecCost;
1635 if (C.NumIVMuls != 0)
1636 OS << ", plus " << C.NumIVMuls << " IV mul"
1637 << (C.NumIVMuls == 1 ? "" : "s");
1638 if (C.NumBaseAdds != 0)
1639 OS << ", plus " << C.NumBaseAdds << " base add"
1640 << (C.NumBaseAdds == 1 ? "" : "s");
1641 if (C.ScaleCost != 0)
1642 OS << ", plus " << C.ScaleCost << " scale cost";
1643 if (C.ImmCost != 0)
1644 OS << ", plus " << C.ImmCost << " imm cost";
1645 if (C.SetupCost != 0)
1646 OS << ", plus " << C.SetupCost << " setup cost";
1647}
1648
1649LLVM_DUMP_METHOD void Cost::dump() const {
1650 print(errs()); errs() << '\n';
1651}
1652#endif
1653
1654/// Test whether this fixup always uses its value outside of the given loop.
1655bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1656 // PHI nodes use their value in their incoming blocks.
1657 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1658 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1659 if (PN->getIncomingValue(i) == OperandValToReplace &&
1660 L->contains(PN->getIncomingBlock(i)))
1661 return false;
1662 return true;
1663 }
1664
1665 return !L->contains(UserInst);
1666}
1667
1668#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1669void LSRFixup::print(raw_ostream &OS) const {
1670 OS << "UserInst=";
1671 // Store is common and interesting enough to be worth special-casing.
1672 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1673 OS << "store ";
1674 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1675 } else if (UserInst->getType()->isVoidTy())
1676 OS << UserInst->getOpcodeName();
1677 else
1678 UserInst->printAsOperand(OS, /*PrintType=*/false);
1679
1680 OS << ", OperandValToReplace=";
1681 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1682
1683 for (const Loop *PIL : PostIncLoops) {
1684 OS << ", PostIncLoop=";
1685 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1686 }
1687
1688 if (Offset.isNonZero())
1689 OS << ", Offset=" << Offset;
1690}
1691
1692LLVM_DUMP_METHOD void LSRFixup::dump() const {
1693 print(errs()); errs() << '\n';
1694}
1695#endif
1696
1697/// Test whether this use has a formula with the same registers as the given
1698/// formula.
1699bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1700 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1701 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1702 // Unstable sort by host order ok, because this is only used for uniquifying.
1703 llvm::sort(Key);
1704 return Uniquifier.count(Key);
1705}
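// For illustration (register names hypothetical): two formulae with base
// registers {%a, %b} and {%b, %a} and no scaled register sort to the same key,
// so the second one is recognized as having the same registers as the first.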
1706
1707/// Return the probability of selecting a formula that does not reference Reg.
1708float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1709 unsigned FNum = 0;
1710 for (const Formula &F : Formulae)
1711 if (F.referencesReg(Reg))
1712 FNum++;
1713 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1714}
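// Worked example (hypothetical counts): if this use holds 4 formulae and
// exactly one of them references Reg, the probability of selecting a formula
// without Reg is (4 - 1) / 4 = 0.75.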
1715
1716/// If the given formula has not yet been inserted, add it to the list, and
1717/// return true. Return false otherwise. The formula must be in canonical form.
1718bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1719 assert(F.isCanonical(L) && "Invalid canonical representation");
1720
1721 if (!Formulae.empty() && RigidFormula)
1722 return false;
1723
1724 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1725 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1726 // Unstable sort by host order ok, because this is only used for uniquifying.
1727 llvm::sort(Key);
1728
1729 if (!Uniquifier.insert(Key).second)
1730 return false;
1731
1732 // Using a register to hold the value of 0 is not profitable.
1733 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1734 "Zero allocated in a scaled register!");
1735#ifndef NDEBUG
1736 for (const SCEV *BaseReg : F.BaseRegs)
1737 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1738#endif
1739
1740 // Add the formula to the list.
1741 Formulae.push_back(F);
1742
1743 // Record registers now being used by this use.
1744 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1745 if (F.ScaledReg)
1746 Regs.insert(F.ScaledReg);
1747
1748 return true;
1749}
1750
1751/// Remove the given formula from this use's list.
1752void LSRUse::DeleteFormula(Formula &F) {
1753 if (&F != &Formulae.back())
1754 std::swap(F, Formulae.back());
1755 Formulae.pop_back();
1756}
1757
1758/// Recompute the Regs field, and update RegUses.
1759void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1760 // Now that we've filtered out some formulae, recompute the Regs set.
1761 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1762 Regs.clear();
1763 for (const Formula &F : Formulae) {
1764 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1765 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1766 }
1767
1768 // Update the RegTracker.
1769 for (const SCEV *S : OldRegs)
1770 if (!Regs.count(S))
1771 RegUses.dropRegister(S, LUIdx);
1772}
1773
1774#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1775void LSRUse::print(raw_ostream &OS) const {
1776 OS << "LSR Use: Kind=";
1777 switch (Kind) {
1778 case Basic: OS << "Basic"; break;
1779 case Special: OS << "Special"; break;
1780 case ICmpZero: OS << "ICmpZero"; break;
1781 case Address:
1782 OS << "Address of ";
1783 if (AccessTy.MemTy->isPointerTy())
1784 OS << "pointer"; // the full pointer type could be really verbose
1785 else {
1786 OS << *AccessTy.MemTy;
1787 }
1788
1789 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1790 }
1791
1792 OS << ", Offsets={";
1793 bool NeedComma = false;
1794 for (const LSRFixup &Fixup : Fixups) {
1795 if (NeedComma) OS << ',';
1796 OS << Fixup.Offset;
1797 NeedComma = true;
1798 }
1799 OS << '}';
1800
1801 if (AllFixupsOutsideLoop)
1802 OS << ", all-fixups-outside-loop";
1803
1804 if (WidestFixupType)
1805 OS << ", widest fixup type: " << *WidestFixupType;
1806}
1807
1808LLVM_DUMP_METHOD void LSRUse::dump() const {
1809 print(errs()); errs() << '\n';
1810}
1811#endif
1812
1813static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1814 LSRUse::KindType Kind, MemAccessTy AccessTy,
1815 GlobalValue *BaseGV, Immediate BaseOffset,
1816 bool HasBaseReg, int64_t Scale,
1817 Instruction *Fixup /* = nullptr */) {
1818 switch (Kind) {
1819 case LSRUse::Address: {
1820 int64_t FixedOffset =
1821 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1822 int64_t ScalableOffset =
1823 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1824 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1825 HasBaseReg, Scale, AccessTy.AddrSpace,
1826 Fixup, ScalableOffset);
1827 }
1828 case LSRUse::ICmpZero:
1829 // There's not even a target hook for querying whether it would be legal to
1830 // fold a GV into an ICmp.
1831 if (BaseGV)
1832 return false;
1833
1834 // ICmp only has two operands; don't allow more than two non-trivial parts.
1835 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1836 return false;
1837
1838 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1839 // putting the scaled register in the other operand of the icmp.
1840 if (Scale != 0 && Scale != -1)
1841 return false;
1842
1843 // If we have low-level target information, ask the target if it can fold an
1844 // integer immediate on an icmp.
1845 if (BaseOffset.isNonZero()) {
1846 // We don't have an interface to query whether the target supports
1847 // icmpzero against scalable quantities yet.
1848 if (BaseOffset.isScalable())
1849 return false;
1850
1851 // We have one of:
1852 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1853 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1854 // Offs is the ICmp immediate.
1855 if (Scale == 0)
1856 // The cast does the right thing with
1857 // std::numeric_limits<int64_t>::min().
1858 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1859 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1860 }
1861
1862 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1863 return true;
1864
1865 case LSRUse::Basic:
1866 // Only handle single-register values.
1867 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1868
1869 case LSRUse::Special:
1870 // Special case Basic to handle -1 scales.
1871 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1872 }
1873
1874 llvm_unreachable("Invalid LSRUse Kind!");
1875}
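// Illustrative example for the ICmpZero case (names hypothetical): a formula
// with base register %n, base offset -16 and no scale stands for "%n - 16 == 0"
// and can be lowered as "icmp eq %n, 16", so it is completely folded iff
// TTI.isLegalICmpImmediate(16); with a -1 scale and a zero offset, the scaled
// register simply becomes the other icmp operand and no immediate is needed.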
1876
1877static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1878 Immediate MinOffset, Immediate MaxOffset,
1879 LSRUse::KindType Kind, MemAccessTy AccessTy,
1880 GlobalValue *BaseGV, Immediate BaseOffset,
1881 bool HasBaseReg, int64_t Scale) {
1882 if (BaseOffset.isNonZero() &&
1883 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1884 BaseOffset.isScalable() != MaxOffset.isScalable()))
1885 return false;
1886 // Check for overflow.
1887 int64_t Base = BaseOffset.getKnownMinValue();
1888 int64_t Min = MinOffset.getKnownMinValue();
1889 int64_t Max = MaxOffset.getKnownMinValue();
1890 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1891 return false;
1892 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1893 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1894 return false;
1895 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1896
1897 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1898 HasBaseReg, Scale) &&
1899 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1900 HasBaseReg, Scale);
1901}
1902
1903static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1904 Immediate MinOffset, Immediate MaxOffset,
1905 LSRUse::KindType Kind, MemAccessTy AccessTy,
1906 const Formula &F, const Loop &L) {
1907 // For the purpose of isAMCompletelyFolded either having a canonical formula
1908 // or a scale not equal to zero is correct.
1909 // Problems may arise from non canonical formulae having a scale == 0.
1910 // Strictly speaking it would be best to just rely on canonical formulae.
1911 // However, when we generate the scaled formulae, we first check that the
1912 // scaling factor is profitable before computing the actual ScaledReg for
1913 // compile time's sake.
1914 assert((F.isCanonical(L) || F.Scale != 0));
1915 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1916 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1917}
1918
1919/// Test whether we know how to expand the current formula.
1920static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1921 Immediate MaxOffset, LSRUse::KindType Kind,
1922 MemAccessTy AccessTy, GlobalValue *BaseGV,
1923 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1924 // We know how to expand completely foldable formulae.
1925 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1926 BaseOffset, HasBaseReg, Scale) ||
1927 // Or formulae that use a base register produced by a sum of base
1928 // registers.
1929 (Scale == 1 &&
1930 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1931 BaseGV, BaseOffset, true, 0));
1932}
1933
1934static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1935 Immediate MaxOffset, LSRUse::KindType Kind,
1936 MemAccessTy AccessTy, const Formula &F) {
1937 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1938 F.BaseOffset, F.HasBaseReg, F.Scale);
1939}
1940
1941static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1942 Immediate Offset) {
1943 if (Offset.isScalable())
1944 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1945
1946 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1947}
1948
1949static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1950 const LSRUse &LU, const Formula &F) {
1951 // Target may want to look at the user instructions.
1952 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1953 for (const LSRFixup &Fixup : LU.Fixups)
1954 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1955 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1956 F.Scale, Fixup.UserInst))
1957 return false;
1958 return true;
1959 }
1960
1961 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1962 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1963 F.Scale);
1964}
1965
1966static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1967 const LSRUse &LU, const Formula &F,
1968 const Loop &L) {
1969 if (!F.Scale)
1970 return 0;
1971
1972 // If the use is not completely folded in that instruction, we will have to
1973 // pay an extra cost only for scale != 1.
1974 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1975 LU.AccessTy, F, L))
1976 return F.Scale != 1;
1977
1978 switch (LU.Kind) {
1979 case LSRUse::Address: {
1980 // Check the scaling factor cost with both the min and max offsets.
1981 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1982 if (F.BaseOffset.isScalable()) {
1983 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1984 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1985 } else {
1986 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1987 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1988 }
1989 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1990 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1991 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1992 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1993 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1994 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1995
1996 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1997 "Legal addressing mode has an illegal cost!");
1998 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1999 }
2000 case LSRUse::ICmpZero:
2001 case LSRUse::Basic:
2002 case LSRUse::Special:
2003 // The use is completely folded, i.e., everything is folded into the
2004 // instruction.
2005 return 0;
2006 }
2007
2008 llvm_unreachable("Invalid LSRUse Kind!");
2009}
2010
2011static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2012 LSRUse::KindType Kind, MemAccessTy AccessTy,
2013 GlobalValue *BaseGV, Immediate BaseOffset,
2014 bool HasBaseReg) {
2015 // Fast-path: zero is always foldable.
2016 if (BaseOffset.isZero() && !BaseGV)
2017 return true;
2018
2019 // Conservatively, create an address with an immediate and a
2020 // base and a scale.
2021 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2022
2023 // Canonicalize a scale of 1 to a base register if the formula doesn't
2024 // already have a base register.
2025 if (!HasBaseReg && Scale == 1) {
2026 Scale = 0;
2027 HasBaseReg = true;
2028 }
2029
2030 // FIXME: Try with + without a scale? Maybe based on TTI?
2031 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2032 // default for many architectures, not just AArch64 SVE. More investigation
2033 // needed later to determine if this should be used more widely than just
2034 // on scalable types.
2035 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2036 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2037 Scale = 0;
2038
2039 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2040 HasBaseReg, Scale);
2041}
2042
2043static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2044 ScalarEvolution &SE, Immediate MinOffset,
2045 Immediate MaxOffset, LSRUse::KindType Kind,
2046 MemAccessTy AccessTy, const SCEV *S,
2047 bool HasBaseReg) {
2048 // Fast-path: zero is always foldable.
2049 if (S->isZero()) return true;
2050
2051 // Conservatively, create an address with an immediate and a
2052 // base and a scale.
2053 Immediate BaseOffset = ExtractImmediate(S, SE);
2054 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2055
2056 // If there's anything else involved, it's not foldable.
2057 if (!S->isZero()) return false;
2058
2059 // Fast-path: zero is always foldable.
2060 if (BaseOffset.isZero() && !BaseGV)
2061 return true;
2062
2063 if (BaseOffset.isScalable())
2064 return false;
2065
2066 // Conservatively, create an address with an immediate and a
2067 // base and a scale.
2068 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2069
2070 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2071 BaseOffset, HasBaseReg, Scale);
2072}
2073
2074namespace {
2075
2076/// An individual increment in a Chain of IV increments. Relate an IV user to
2077/// an expression that computes the IV it uses from the IV used by the previous
2078/// link in the Chain.
2079///
2080/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2081/// original IVOperand. The head of the chain's IVOperand is only valid during
2082/// chain collection, before LSR replaces IV users. During chain generation,
2083/// IncExpr can be used to find the new IVOperand that computes the same
2084/// expression.
2085struct IVInc {
2086 Instruction *UserInst;
2087 Value* IVOperand;
2088 const SCEV *IncExpr;
2089
2090 IVInc(Instruction *U, Value *O, const SCEV *E)
2091 : UserInst(U), IVOperand(O), IncExpr(E) {}
2092};
2093
2094// The list of IV increments in program order. We typically add the head of a
2095// chain without finding subsequent links.
2096struct IVChain {
2097 SmallVector<IVInc, 1> Incs;
2098 const SCEV *ExprBase = nullptr;
2099
2100 IVChain() = default;
2101 IVChain(const IVInc &Head, const SCEV *Base)
2102 : Incs(1, Head), ExprBase(Base) {}
2103
2105
2106 // Return the first increment in the chain.
2107 const_iterator begin() const {
2108 assert(!Incs.empty());
2109 return std::next(Incs.begin());
2110 }
2111 const_iterator end() const {
2112 return Incs.end();
2113 }
2114
2115 // Returns true if this chain contains any increments.
2116 bool hasIncs() const { return Incs.size() >= 2; }
2117
2118 // Add an IVInc to the end of this chain.
2119 void add(const IVInc &X) { Incs.push_back(X); }
2120
2121 // Returns the last UserInst in the chain.
2122 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2123
2124 // Returns true if IncExpr can be profitably added to this chain.
2125 bool isProfitableIncrement(const SCEV *OperExpr,
2126 const SCEV *IncExpr,
2127 ScalarEvolution &SE);
2128};
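// Sketch of a chain (element type and offsets made up for illustration): for
// loads of p[i], p[i+4] and p[i+8] of i32 inside the loop, the head IVInc holds
// the address of p[i] as its IncExpr, and each subsequent link records an
// IncExpr of +16 bytes relative to the previous link, so the later addresses
// can be formed by stepping the same pointer instead of rescaling i.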
2129
2130/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2131/// between FarUsers that definitely cross IV increments and NearUsers that may
2132/// be used between IV increments.
2133struct ChainUsers {
2134 SmallPtrSet<Instruction*, 4> FarUsers;
2135 SmallPtrSet<Instruction*, 4> NearUsers;
2136};
2137
2138/// This class holds state for the main loop strength reduction logic.
2139class LSRInstance {
2140 IVUsers &IU;
2141 ScalarEvolution &SE;
2142 DominatorTree &DT;
2143 LoopInfo &LI;
2144 AssumptionCache &AC;
2145 TargetLibraryInfo &TLI;
2146 const TargetTransformInfo &TTI;
2147 Loop *const L;
2148 MemorySSAUpdater *MSSAU;
2150 mutable SCEVExpander Rewriter;
2151 bool Changed = false;
2152
2153 /// This is the insert position at which the current loop's induction variable
2154 /// increment should be placed. In simple loops, this is the latch block's
2155 /// terminator. But in more complicated cases, this is a position which will
2156 /// dominate all the in-loop post-increment users.
2157 Instruction *IVIncInsertPos = nullptr;
2158
2159 /// Interesting factors between use strides.
2160 ///
2161 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2162 /// default, a SmallDenseSet, because we need to use the full range of
2163 /// int64_ts, and there's currently no good way of doing that with
2164 /// SmallDenseSet.
2165 SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
2166
2167 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2168 /// the solution is not profitable.
2169 Cost BaselineCost;
2170
2171 /// Interesting use types, to facilitate truncation reuse.
2172 SmallSetVector<Type *, 4> Types;
2173
2174 /// The list of interesting uses.
2175 SmallVector<LSRUse, 16> Uses;
2176
2177 /// Track which uses use which register candidates.
2178 RegUseTracker RegUses;
2179
2180 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2181 // have more than a few IV increment chains in a loop. Missing a Chain falls
2182 // back to normal LSR behavior for those uses.
2183 static const unsigned MaxChains = 8;
2184
2185 /// IV users can form a chain of IV increments.
2186 SmallVector<IVChain, 8> IVChainVec;
2187
2188 /// IV users that belong to profitable IVChains.
2190
2191 /// Induction variables that were generated and inserted by the SCEV Expander.
2192 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2193
2194 void OptimizeShadowIV();
2195 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2196 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2197 void OptimizeLoopTermCond();
2198
2199 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2200 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2201 void FinalizeChain(IVChain &Chain);
2202 void CollectChains();
2203 void GenerateIVChain(const IVChain &Chain,
2204 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2205
2206 void CollectInterestingTypesAndFactors();
2207 void CollectFixupsAndInitialFormulae();
2208
2209 // Support for sharing of LSRUses between LSRFixups.
2210 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2211 UseMapTy UseMap;
2212
2213 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2214 LSRUse::KindType Kind, MemAccessTy AccessTy);
2215
2216 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2217 MemAccessTy AccessTy);
2218
2219 void DeleteUse(LSRUse &LU, size_t LUIdx);
2220
2221 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2222
2223 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2224 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2225 void CountRegisters(const Formula &F, size_t LUIdx);
2226 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2227
2228 void CollectLoopInvariantFixupsAndFormulae();
2229
2230 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2231 unsigned Depth = 0);
2232
2233 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2234 const Formula &Base, unsigned Depth,
2235 size_t Idx, bool IsScaledReg = false);
2236 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2237 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2238 const Formula &Base, size_t Idx,
2239 bool IsScaledReg = false);
2240 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2241 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2242 const Formula &Base,
2243 const SmallVectorImpl<Immediate> &Worklist,
2244 size_t Idx, bool IsScaledReg = false);
2245 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2246 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2247 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2248 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2249 void GenerateCrossUseConstantOffsets();
2250 void GenerateAllReuseFormulae();
2251
2252 void FilterOutUndesirableDedicatedRegisters();
2253
2254 size_t EstimateSearchSpaceComplexity() const;
2255 void NarrowSearchSpaceByDetectingSupersets();
2256 void NarrowSearchSpaceByCollapsingUnrolledCode();
2257 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2258 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2259 void NarrowSearchSpaceByFilterPostInc();
2260 void NarrowSearchSpaceByDeletingCostlyFormulas();
2261 void NarrowSearchSpaceByPickingWinnerRegs();
2262 void NarrowSearchSpaceUsingHeuristics();
2263
2264 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2265 Cost &SolutionCost,
2266 SmallVectorImpl<const Formula *> &Workspace,
2267 const Cost &CurCost,
2268 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2269 DenseSet<const SCEV *> &VisitedRegs) const;
2270 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2271
2272 BasicBlock::iterator
2273 HoistInsertPosition(BasicBlock::iterator IP,
2274 const SmallVectorImpl<Instruction *> &Inputs) const;
2275 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2276 const LSRFixup &LF,
2277 const LSRUse &LU) const;
2278
2279 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2280 BasicBlock::iterator IP,
2281 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2282 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2283 const Formula &F,
2284 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2285 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2286 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2287 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2288
2289public:
2290 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2291 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2292 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2293
2294 bool getChanged() const { return Changed; }
2295 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2296 return ScalarEvolutionIVs;
2297 }
2298
2299 void print_factors_and_types(raw_ostream &OS) const;
2300 void print_fixups(raw_ostream &OS) const;
2301 void print_uses(raw_ostream &OS) const;
2302 void print(raw_ostream &OS) const;
2303 void dump() const;
2304};
2305
2306} // end anonymous namespace
2307
2308/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2309/// the cast operation.
2310void LSRInstance::OptimizeShadowIV() {
2311 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2312 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2313 return;
2314
2315 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2316 UI != E; /* empty */) {
2317 IVUsers::const_iterator CandidateUI = UI;
2318 ++UI;
2319 Instruction *ShadowUse = CandidateUI->getUser();
2320 Type *DestTy = nullptr;
2321 bool IsSigned = false;
2322
2323 /* If shadow use is a int->float cast then insert a second IV
2324 to eliminate this cast.
2325
2326 for (unsigned i = 0; i < n; ++i)
2327 foo((double)i);
2328
2329 is transformed into
2330
2331 double d = 0.0;
2332 for (unsigned i = 0; i < n; ++i, ++d)
2333 foo(d);
2334 */
2335 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2336 IsSigned = false;
2337 DestTy = UCast->getDestTy();
2338 }
2339 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2340 IsSigned = true;
2341 DestTy = SCast->getDestTy();
2342 }
2343 if (!DestTy) continue;
2344
2345 // If target does not support DestTy natively then do not apply
2346 // this transformation.
2347 if (!TTI.isTypeLegal(DestTy)) continue;
2348
2349 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2350 if (!PH) continue;
2351 if (PH->getNumIncomingValues() != 2) continue;
2352
2353 // If the calculation in integers overflows, the result in FP type will
2354 // differ. So we can only do this transformation if we are guaranteed not
2355 // to deal with overflowing values.
2356 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2357 if (!AR) continue;
2358 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2359 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2360
2361 Type *SrcTy = PH->getType();
2362 int Mantissa = DestTy->getFPMantissaWidth();
2363 if (Mantissa == -1) continue;
2364 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2365 continue;
2366
2367 unsigned Entry, Latch;
2368 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2369 Entry = 0;
2370 Latch = 1;
2371 } else {
2372 Entry = 1;
2373 Latch = 0;
2374 }
2375
2376 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2377 if (!Init) continue;
2378 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2379 (double)Init->getSExtValue() :
2380 (double)Init->getZExtValue());
2381
2382 BinaryOperator *Incr =
2383 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2384 if (!Incr) continue;
2385 if (Incr->getOpcode() != Instruction::Add
2386 && Incr->getOpcode() != Instruction::Sub)
2387 continue;
2388
2389 /* Initialize new IV, double d = 0.0 in above example. */
2390 ConstantInt *C = nullptr;
2391 if (Incr->getOperand(0) == PH)
2392 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2393 else if (Incr->getOperand(1) == PH)
2394 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2395 else
2396 continue;
2397
2398 if (!C) continue;
2399
2400 // Ignore negative constants, as the code below doesn't handle them
2401 // correctly. TODO: Remove this restriction.
2402 if (!C->getValue().isStrictlyPositive())
2403 continue;
2404
2405 /* Add new PHINode. */
2406 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2407 NewPH->setDebugLoc(PH->getDebugLoc());
2408
2409 /* create new increment. '++d' in above example. */
2410 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2411 BinaryOperator *NewIncr = BinaryOperator::Create(
2412 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2413 : Instruction::FSub,
2414 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2415 NewIncr->setDebugLoc(Incr->getDebugLoc());
2416
2417 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2418 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2419
2420 /* Remove cast operation */
2421 ShadowUse->replaceAllUsesWith(NewPH);
2422 ShadowUse->eraseFromParent();
2423 Changed = true;
2424 break;
2425 }
2426}
2427
2428/// If Cond has an operand that is an expression of an IV, set the IV user and
2429/// stride information and return true, otherwise return false.
2430bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2431 for (IVStrideUse &U : IU)
2432 if (U.getUser() == Cond) {
2433 // NOTE: we could handle setcc instructions with multiple uses here, but
2434 // InstCombine does it as well for simple uses, it's not clear that it
2435 // occurs enough in real life to handle.
2436 CondUse = &U;
2437 return true;
2438 }
2439 return false;
2440}
2441
2442/// Rewrite the loop's terminating condition if it uses a max computation.
2443///
2444/// This is a narrow solution to a specific, but acute, problem. For loops
2445/// like this:
2446///
2447/// i = 0;
2448/// do {
2449/// p[i] = 0.0;
2450/// } while (++i < n);
2451///
2452/// the trip count isn't just 'n', because 'n' might not be positive. And
2453/// unfortunately this can come up even for loops where the user didn't use
2454/// a C do-while loop. For example, seemingly well-behaved top-test loops
2455/// will commonly be lowered like this:
2456///
2457/// if (n > 0) {
2458/// i = 0;
2459/// do {
2460/// p[i] = 0.0;
2461/// } while (++i < n);
2462/// }
2463///
2464/// and then it's possible for subsequent optimization to obscure the if
2465/// test in such a way that indvars can't find it.
2466///
2467/// When indvars can't find the if test in loops like this, it creates a
2468/// max expression, which allows it to give the loop a canonical
2469/// induction variable:
2470///
2471/// i = 0;
2472/// max = n < 1 ? 1 : n;
2473/// do {
2474/// p[i] = 0.0;
2475/// } while (++i != max);
2476///
2477/// Canonical induction variables are necessary because the loop passes
2478/// are designed around them. The most obvious example of this is the
2479/// LoopInfo analysis, which doesn't remember trip count values. It
2480/// expects to be able to rediscover the trip count each time it is
2481/// needed, and it does this using a simple analysis that only succeeds if
2482/// the loop has a canonical induction variable.
2483///
2484/// However, when it comes time to generate code, the maximum operation
2485/// can be quite costly, especially if it's inside of an outer loop.
2486///
2487/// This function solves this problem by detecting this type of loop and
2488/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2489/// the instructions for the maximum computation.
2490ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2491 // Check that the loop matches the pattern we're looking for.
2492 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2493 Cond->getPredicate() != CmpInst::ICMP_NE)
2494 return Cond;
2495
2496 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2497 if (!Sel || !Sel->hasOneUse()) return Cond;
2498
2499 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2500 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2501 return Cond;
2502 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2503
2504 // Add one to the backedge-taken count to get the trip count.
2505 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2506 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2507
2508 // Check for a max calculation that matches the pattern. There's no check
2509 // for ICMP_ULE here because the comparison would be with zero, which
2510 // isn't interesting.
2511 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2512 const SCEVNAryExpr *Max = nullptr;
2513 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2514 Pred = ICmpInst::ICMP_SLE;
2515 Max = S;
2516 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2517 Pred = ICmpInst::ICMP_SLT;
2518 Max = S;
2519 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2520 Pred = ICmpInst::ICMP_ULT;
2521 Max = U;
2522 } else {
2523 // No match; bail.
2524 return Cond;
2525 }
2526
2527 // To handle a max with more than two operands, this optimization would
2528 // require additional checking and setup.
2529 if (Max->getNumOperands() != 2)
2530 return Cond;
2531
2532 const SCEV *MaxLHS = Max->getOperand(0);
2533 const SCEV *MaxRHS = Max->getOperand(1);
2534
2535 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2536 // for a comparison with 1. For <= and >=, a comparison with zero.
2537 if (!MaxLHS ||
2538 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2539 return Cond;
2540
2541 // Check the relevant induction variable for conformance to
2542 // the pattern.
2543 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2544 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2545 if (!AR || !AR->isAffine() ||
2546 AR->getStart() != One ||
2547 AR->getStepRecurrence(SE) != One)
2548 return Cond;
2549
2550 assert(AR->getLoop() == L &&
2551 "Loop condition operand is an addrec in a different loop!");
2552
2553 // Check the right operand of the select, and remember it, as it will
2554 // be used in the new comparison instruction.
2555 Value *NewRHS = nullptr;
2556 if (ICmpInst::isTrueWhenEqual(Pred)) {
2557 // Look for n+1, and grab n.
2558 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2559 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2560 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2561 NewRHS = BO->getOperand(0);
2562 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2563 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2564 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2565 NewRHS = BO->getOperand(0);
2566 if (!NewRHS)
2567 return Cond;
2568 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2569 NewRHS = Sel->getOperand(1);
2570 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2571 NewRHS = Sel->getOperand(2);
2572 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2573 NewRHS = SU->getValue();
2574 else
2575 // Max doesn't match expected pattern.
2576 return Cond;
2577
2578 // Determine the new comparison opcode. It may be signed or unsigned,
2579 // and the original comparison may be either equality or inequality.
2580 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2581 Pred = CmpInst::getInversePredicate(Pred);
2582
2583 // Ok, everything looks ok to change the condition into an SLT or SGE and
2584 // delete the max calculation.
2585 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2586 Cond->getOperand(0), NewRHS, "scmp");
2587
2588 // Delete the max calculation instructions.
2589 NewCond->setDebugLoc(Cond->getDebugLoc());
2590 Cond->replaceAllUsesWith(NewCond);
2591 CondUse->setUser(NewCond);
2592 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2593 Cond->eraseFromParent();
2594 Sel->eraseFromParent();
2595 if (Cmp->use_empty())
2596 Cmp->eraseFromParent();
2597 return NewCond;
2598}
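// Illustrative before/after at the IR level (names hypothetical), matching the
// C-level example above:
//   %max  = select i1 %cmp, i64 1, i64 %n      ; max(n, 1)
//   %exit = icmp ne i64 %i.next, %max
// becomes
//   %exit = icmp slt i64 %i.next, %n
// and the now-dead select (and its compare, if otherwise unused) are erased.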
2599
2600/// Change loop terminating condition to use the postinc iv when possible.
2601void
2602LSRInstance::OptimizeLoopTermCond() {
2603 SmallPtrSet<Instruction *, 4> PostIncs;
2604
2605 // We need a different set of heuristics for rotated and non-rotated loops.
2606 // If a loop is rotated then the latch is also the backedge, so inserting
2607 // post-inc expressions just before the latch is ideal. To reduce live ranges
2608 // it also makes sense to rewrite terminating conditions to use post-inc
2609 // expressions.
2610 //
2611 // If the loop is not rotated then the latch is not a backedge; the latch
2612 // check is done in the loop head. Adding post-inc expressions before the
2613 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2614 // in the loop body. In this case we do *not* want to use post-inc expressions
2615 // in the latch check, and we want to insert post-inc expressions before
2616 // the backedge.
2617 BasicBlock *LatchBlock = L->getLoopLatch();
2618 SmallVector<BasicBlock*, 8> ExitingBlocks;
2619 L->getExitingBlocks(ExitingBlocks);
2620 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2621 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2622 IVIncInsertPos = LatchBlock->getTerminator();
2623 return;
2624 }
2625
2626 // Otherwise treat this as a rotated loop.
2627 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2628 // Get the terminating condition for the loop if possible. If we
2629 // can, we want to change it to use a post-incremented version of its
2630 // induction variable, to allow coalescing the live ranges for the IV into
2631 // one register value.
2632
2633 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2634 if (!TermBr)
2635 continue;
2636 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2637 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2638 continue;
2639
2640 // Search IVUsesByStride to find Cond's IVUse if there is one.
2641 IVStrideUse *CondUse = nullptr;
2642 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2643 if (!FindIVUserForCond(Cond, CondUse))
2644 continue;
2645
2646 // If the trip count is computed in terms of a max (due to ScalarEvolution
2647 // being unable to find a sufficient guard, for example), change the loop
2648 // comparison to use SLT or ULT instead of NE.
2649 // One consequence of doing this now is that it disrupts the count-down
2650 // optimization. That's not always a bad thing though, because in such
2651 // cases it may still be worthwhile to avoid a max.
2652 Cond = OptimizeMax(Cond, CondUse);
2653
2654 // If this exiting block dominates the latch block, it may also use
2655 // the post-inc value if it won't be shared with other uses.
2656 // Check for dominance.
2657 if (!DT.dominates(ExitingBlock, LatchBlock))
2658 continue;
2659
2660 // Conservatively avoid trying to use the post-inc value in non-latch
2661 // exits if there may be pre-inc users in intervening blocks.
2662 if (LatchBlock != ExitingBlock)
2663 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2664 // Test if the use is reachable from the exiting block. This dominator
2665 // query is a conservative approximation of reachability.
2666 if (&*UI != CondUse &&
2667 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2668 // Conservatively assume there may be reuse if the quotient of their
2669 // strides could be a legal scale.
2670 const SCEV *A = IU.getStride(*CondUse, L);
2671 const SCEV *B = IU.getStride(*UI, L);
2672 if (!A || !B) continue;
2673 if (SE.getTypeSizeInBits(A->getType()) !=
2674 SE.getTypeSizeInBits(B->getType())) {
2675 if (SE.getTypeSizeInBits(A->getType()) >
2676 SE.getTypeSizeInBits(B->getType()))
2677 B = SE.getSignExtendExpr(B, A->getType());
2678 else
2679 A = SE.getSignExtendExpr(A, B->getType());
2680 }
2681 if (const SCEVConstant *D =
2682 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2683 const ConstantInt *C = D->getValue();
2684 // Stride of one or negative one can have reuse with non-addresses.
2685 if (C->isOne() || C->isMinusOne())
2686 goto decline_post_inc;
2687 // Avoid weird situations.
2688 if (C->getValue().getSignificantBits() >= 64 ||
2689 C->getValue().isMinSignedValue())
2690 goto decline_post_inc;
2691 // Check for possible scaled-address reuse.
2692 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2693 MemAccessTy AccessTy = getAccessType(
2694 TTI, UI->getUser(), UI->getOperandValToReplace());
2695 int64_t Scale = C->getSExtValue();
2696 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2697 /*BaseOffset=*/0,
2698 /*HasBaseReg=*/true, Scale,
2699 AccessTy.AddrSpace))
2700 goto decline_post_inc;
2701 Scale = -Scale;
2702 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2703 /*BaseOffset=*/0,
2704 /*HasBaseReg=*/true, Scale,
2705 AccessTy.AddrSpace))
2706 goto decline_post_inc;
2707 }
2708 }
2709 }
2710
2711 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2712 << *Cond << '\n');
2713
2714 // It's possible for the setcc instruction to be anywhere in the loop, and
2715 // possible for it to have multiple users. If it is not immediately before
2716 // the exiting block branch, move it.
2717 if (Cond->getNextNonDebugInstruction() != TermBr) {
2718 if (Cond->hasOneUse()) {
2719 Cond->moveBefore(TermBr);
2720 } else {
2721 // Clone the terminating condition and insert it into the loop end.
2722 ICmpInst *OldCond = Cond;
2723 Cond = cast<ICmpInst>(Cond->clone());
2724 Cond->setName(L->getHeader()->getName() + ".termcond");
2725 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2726
2727 // Clone the IVUse, as the old use still exists!
2728 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2729 TermBr->replaceUsesOfWith(OldCond, Cond);
2730 }
2731 }
2732
2733 // If we get to here, we know that we can transform the setcc instruction to
2734 // use the post-incremented version of the IV, allowing us to coalesce the
2735 // live ranges for the IV correctly.
2736 CondUse->transformToPostInc(L);
2737 Changed = true;
2738
2739 PostIncs.insert(Cond);
2740 decline_post_inc:;
2741 }
2742
2743 // Determine an insertion point for the loop induction variable increment. It
2744 // must dominate all the post-inc comparisons we just set up, and it must
2745 // dominate the loop latch edge.
2746 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2747 for (Instruction *Inst : PostIncs)
2748 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2749}
2750
2751/// Determine if the given use can accommodate a fixup at the given offset and
2752/// other details. If so, update the use and return true.
2753bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2754 bool HasBaseReg, LSRUse::KindType Kind,
2755 MemAccessTy AccessTy) {
2756 Immediate NewMinOffset = LU.MinOffset;
2757 Immediate NewMaxOffset = LU.MaxOffset;
2758 MemAccessTy NewAccessTy = AccessTy;
2759
2760 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2761 // something conservative, however this can pessimize in the case that one of
2762 // the uses will have all its uses outside the loop, for example.
2763 if (LU.Kind != Kind)
2764 return false;
2765
2766 // Check for a mismatched access type, and fall back conservatively as needed.
2767 // TODO: Be less conservative when the type is similar and can use the same
2768 // addressing modes.
2769 if (Kind == LSRUse::Address) {
2770 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2771 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2772 AccessTy.AddrSpace);
2773 }
2774 }
2775
2776 // Conservatively assume HasBaseReg is true for now.
2777 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2778 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2779 LU.MaxOffset - NewOffset, HasBaseReg))
2780 return false;
2781 NewMinOffset = NewOffset;
2782 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2783 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2784 NewOffset - LU.MinOffset, HasBaseReg))
2785 return false;
2786 NewMaxOffset = NewOffset;
2787 }
2788
2789 // FIXME: We should be able to handle some level of scalable offset support
2790 // for 'void', but in order to get basic support up and running this is
2791 // being left out.
2792 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2793 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2794 return false;
2795
2796 // Update the use.
2797 LU.MinOffset = NewMinOffset;
2798 LU.MaxOffset = NewMaxOffset;
2799 LU.AccessTy = NewAccessTy;
2800 return true;
2801}
2802
2803/// Return an LSRUse index and an offset value for a fixup which needs the given
2804/// expression, with the given kind and optional access type. Either reuse an
2805/// existing use or create a new one, as needed.
2806std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2807 LSRUse::KindType Kind,
2808 MemAccessTy AccessTy) {
2809 const SCEV *Copy = Expr;
2810 Immediate Offset = ExtractImmediate(Expr, SE);
2811
2812 // Basic uses can't accept any offset, for example.
2813 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2814 Offset, /*HasBaseReg=*/ true)) {
2815 Expr = Copy;
2816 Offset = Immediate::getFixed(0);
2817 }
2818
2819 std::pair<UseMapTy::iterator, bool> P =
2820 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2821 if (!P.second) {
2822 // A use already existed with this base.
2823 size_t LUIdx = P.first->second;
2824 LSRUse &LU = Uses[LUIdx];
2825 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2826 // Reuse this use.
2827 return std::make_pair(LUIdx, Offset);
2828 }
2829
2830 // Create a new use.
2831 size_t LUIdx = Uses.size();
2832 P.first->second = LUIdx;
2833 Uses.push_back(LSRUse(Kind, AccessTy));
2834 LSRUse &LU = Uses[LUIdx];
2835
2836 LU.MinOffset = Offset;
2837 LU.MaxOffset = Offset;
2838 return std::make_pair(LUIdx, Offset);
2839}
2840
2841/// Delete the given use from the Uses list.
2842void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2843 if (&LU != &Uses.back())
2844 std::swap(LU, Uses.back());
2845 Uses.pop_back();
2846
2847 // Update RegUses.
2848 RegUses.swapAndDropUse(LUIdx, Uses.size());
2849}
2850
2851/// Look for a use distinct from OrigLU which has a formula with the same
2852/// registers as the given formula.
2853LSRUse *
2854LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2855 const LSRUse &OrigLU) {
2856 // Search all uses for the formula. This could be more clever.
2857 for (LSRUse &LU : Uses) {
2858 // Check whether this use is close enough to OrigLU, to see whether it's
2859 // worthwhile looking through its formulae.
2860 // Ignore ICmpZero uses because they may contain formulae generated by
2861 // GenerateICmpZeroScales, in which case adding fixup offsets may
2862 // be invalid.
2863 if (&LU != &OrigLU &&
2864 LU.Kind != LSRUse::ICmpZero &&
2865 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2866 LU.WidestFixupType == OrigLU.WidestFixupType &&
2867 LU.HasFormulaWithSameRegs(OrigF)) {
2868 // Scan through this use's formulae.
2869 for (const Formula &F : LU.Formulae) {
2870 // Check to see if this formula has the same registers and symbols
2871 // as OrigF.
2872 if (F.BaseRegs == OrigF.BaseRegs &&
2873 F.ScaledReg == OrigF.ScaledReg &&
2874 F.BaseGV == OrigF.BaseGV &&
2875 F.Scale == OrigF.Scale &&
2876 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2877 if (F.BaseOffset.isZero())
2878 return &LU;
2879 // This is the formula where all the registers and symbols matched;
2880 // there aren't going to be any others. Since we declined it, we
2881 // can skip the rest of the formulae and proceed to the next LSRUse.
2882 break;
2883 }
2884 }
2885 }
2886 }
2887
2888 // Nothing looked good.
2889 return nullptr;
2890}
2891
2892void LSRInstance::CollectInterestingTypesAndFactors() {
2893 SmallSetVector<const SCEV *, 4> Strides;
2894
2895 // Collect interesting types and strides.
2896 SmallVector<const SCEV *, 4> Worklist;
2897 for (const IVStrideUse &U : IU) {
2898 const SCEV *Expr = IU.getExpr(U);
2899 if (!Expr)
2900 continue;
2901
2902 // Collect interesting types.
2903 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2904
2905 // Add strides for mentioned loops.
2906 Worklist.push_back(Expr);
2907 do {
2908 const SCEV *S = Worklist.pop_back_val();
2909 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2910 if (AR->getLoop() == L)
2911 Strides.insert(AR->getStepRecurrence(SE));
2912 Worklist.push_back(AR->getStart());
2913 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2914 append_range(Worklist, Add->operands());
2915 }
2916 } while (!Worklist.empty());
2917 }
2918
2919 // Compute interesting factors from the set of interesting strides.
2920 for (SmallSetVector<const SCEV *, 4>::const_iterator
2921 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2922 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2923 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2924 const SCEV *OldStride = *I;
2925 const SCEV *NewStride = *NewStrideIter;
2926
2927 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2928 SE.getTypeSizeInBits(NewStride->getType())) {
2929 if (SE.getTypeSizeInBits(OldStride->getType()) >
2930 SE.getTypeSizeInBits(NewStride->getType()))
2931 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2932 else
2933 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2934 }
2935 if (const SCEVConstant *Factor =
2936 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2937 SE, true))) {
2938 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2939 Factors.insert(Factor->getAPInt().getSExtValue());
2940 } else if (const SCEVConstant *Factor =
2941 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2942 NewStride,
2943 SE, true))) {
2944 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2945 Factors.insert(Factor->getAPInt().getSExtValue());
2946 }
2947 }
2948
2949 // If all uses use the same type, don't bother looking for truncation-based
2950 // reuse.
2951 if (Types.size() == 1)
2952 Types.clear();
2953
2954 LLVM_DEBUG(print_factors_and_types(dbgs()));
2955}
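// Worked example (hypothetical strides): if the loop contains uses with strides
// 4 and 12, the exact quotient 12 / 4 = 3 is recorded as an interesting factor;
// strides 4 and 6 contribute nothing because neither divides the other exactly.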
2956
2957/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2958/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2959/// IVStrideUses, we could partially skip this.
2960static User::op_iterator
2961findIVOperand(User::op_iterator OI, User::op_iterator OE,
2962 Loop *L, ScalarEvolution &SE) {
2963 for(; OI != OE; ++OI) {
2964 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2965 if (!SE.isSCEVable(Oper->getType()))
2966 continue;
2967
2968 if (const SCEVAddRecExpr *AR =
2969 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2970 if (AR->getLoop() == L)
2971 break;
2972 }
2973 }
2974 }
2975 return OI;
2976}
2977
2978/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2979/// a convenient helper.
2980static Value *getWideOperand(Value *Oper) {
2981 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2982 return Trunc->getOperand(0);
2983 return Oper;
2984}
2985
2986/// Return an approximation of this SCEV expression's "base", or NULL for any
2987/// constant. Returning the expression itself is conservative. Returning a
2988/// deeper subexpression is more precise and valid as long as it isn't less
2989/// complex than another subexpression. For expressions involving multiple
2990/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2991/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2992/// IVInc==b-a.
2993///
2994/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2995/// SCEVUnknown, we simply return the rightmost SCEV operand.
2996static const SCEV *getExprBase(const SCEV *S) {
2997 switch (S->getSCEVType()) {
2998 default: // including scUnknown.
2999 return S;
3000 case scConstant:
3001 case scVScale:
3002 return nullptr;
3003 case scTruncate:
3004 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3005 case scZeroExtend:
3006 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3007 case scSignExtend:
3008 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3009 case scAddExpr: {
3010 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3011 // there's nothing more complex.
3012 // FIXME: not sure if we want to recognize negation.
3013 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3014 for (const SCEV *SubExpr : reverse(Add->operands())) {
3015 if (SubExpr->getSCEVType() == scAddExpr)
3016 return getExprBase(SubExpr);
3017
3018 if (SubExpr->getSCEVType() != scMulExpr)
3019 return SubExpr;
3020 }
3021 return S; // all operands are scaled, be conservative.
3022 }
3023 case scAddRecExpr:
3024 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3025 }
3026 llvm_unreachable("Unknown SCEV kind!");
3027}
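// Worked example (values hypothetical): for the addrec {(-4 + %a),+,8}<%L> the
// base is %a: the addrec is peeled to its start (-4 + %a), and walking that
// add's operands from the right returns the first one that is neither an add
// nor a mul, here the SCEVUnknown %a. For a plain constant the result is null.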
3028
3029/// Return true if the chain increment is profitable to expand into a loop
3030/// invariant value, which may require its own register. A profitable chain
3031/// increment will be an offset relative to the same base. We allow such offsets
3032/// to potentially be used as chain increments as long as they're not obviously
3033/// expensive to expand using real instructions.
3034bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3035 const SCEV *IncExpr,
3036 ScalarEvolution &SE) {
3037 // Aggressively form chains when -stress-ivchain.
3038 if (StressIVChain)
3039 return true;
3040
3041 // Do not replace a constant offset from IV head with a nonconstant IV
3042 // increment.
3043 if (!isa<SCEVConstant>(IncExpr)) {
3044 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3045 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3046 return false;
3047 }
3048
3049 SmallPtrSet<const SCEV*, 8> Processed;
3050 return !isHighCostExpansion(IncExpr, Processed, SE);
3051}
3052
3053/// Return true if the number of registers needed for the chain is estimated to
3054/// be less than the number required for the individual IV users. First prohibit
3055/// any IV users that keep the IV live across increments (the Users set should
3056/// be empty). Next count the number and type of increments in the chain.
3057///
3058/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3059/// effectively use postinc addressing modes. Only consider it profitable if the
3060/// increments can be computed in fewer registers when chained.
3061///
3062/// TODO: Consider IVInc free if it's already used in another chains.
3063static bool isProfitableChain(IVChain &Chain,
3064 SmallPtrSetImpl<Instruction*> &Users,
3065 ScalarEvolution &SE,
3066 const TargetTransformInfo &TTI) {
3067 if (StressIVChain)
3068 return true;
3069
3070 if (!Chain.hasIncs())
3071 return false;
3072
3073 if (!Users.empty()) {
3074 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3075 for (Instruction *Inst
3076 : Users) { dbgs() << " " << *Inst << "\n"; });
3077 return false;
3078 }
3079 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3080
3081 // The chain itself may require a register, so initialize cost to 1.
3082 int cost = 1;
3083
3084 // A complete chain likely eliminates the need for keeping the original IV in
3085 // a register. LSR does not currently know how to form a complete chain unless
3086 // the header phi already exists.
3087 if (isa<PHINode>(Chain.tailUserInst())
3088 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3089 --cost;
3090 }
3091 const SCEV *LastIncExpr = nullptr;
3092 unsigned NumConstIncrements = 0;
3093 unsigned NumVarIncrements = 0;
3094 unsigned NumReusedIncrements = 0;
3095
3096 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3097 return true;
3098
3099 for (const IVInc &Inc : Chain) {
3100 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3101 return true;
3102 if (Inc.IncExpr->isZero())
3103 continue;
3104
3105 // Incrementing by zero or some constant is neutral. We assume constants can
3106 // be folded into an addressing mode or an add's immediate operand.
3107 if (isa<SCEVConstant>(Inc.IncExpr)) {
3108 ++NumConstIncrements;
3109 continue;
3110 }
3111
3112 if (Inc.IncExpr == LastIncExpr)
3113 ++NumReusedIncrements;
3114 else
3115 ++NumVarIncrements;
3116
3117 LastIncExpr = Inc.IncExpr;
3118 }
3119 // An IV chain with a single increment is handled by LSR's postinc
3120 // uses. However, a chain with multiple increments requires keeping the IV's
3121 // value live longer than it needs to be if chained.
3122 if (NumConstIncrements > 1)
3123 --cost;
3124
3125 // Materializing increment expressions in the preheader that didn't exist in
3126 // the original code may cost a register. For example, sign-extended array
3127 // indices can produce ridiculous increments like this:
3128 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3129 cost += NumVarIncrements;
3130
3131 // Reusing variable increments likely saves a register to hold the multiple of
3132 // the stride.
3133 cost -= NumReusedIncrements;
3134
3135 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3136 << "\n");
3137
3138 return cost < 0;
3139}
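// Worked cost example (hypothetical chain): a chain of three constant
// increments that is completed by the loop-header phi starts at cost 1, loses 1
// because the phi completes the chain and 1 more because it has more than one
// constant increment, ending at -1 < 0, i.e. profitable. Replace one constant
// increment with a new loop-invariant expression and the cost rises back to 0,
// which is no longer profitable.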
3140
3141/// Add this IV user to an existing chain or make it the head of a new chain.
3142void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3143 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3144 // When IVs are used as types of varying widths, they are generally converted
3145 // to a wider type with some uses remaining narrow under a (free) trunc.
3146 Value *const NextIV = getWideOperand(IVOper);
3147 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3148 const SCEV *const OperExprBase = getExprBase(OperExpr);
3149
3150 // Visit all existing chains. Check if its IVOper can be computed as a
3151 // profitable loop invariant increment from the last link in the Chain.
3152 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3153 const SCEV *LastIncExpr = nullptr;
3154 for (; ChainIdx < NChains; ++ChainIdx) {
3155 IVChain &Chain = IVChainVec[ChainIdx];
3156
3157 // Prune the solution space aggressively by checking that both IV operands
3158 // are expressions that operate on the same unscaled SCEVUnknown. This
3159 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3160 // first avoids creating extra SCEV expressions.
3161 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3162 continue;
3163
3164 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3165 if (PrevIV->getType() != NextIV->getType())
3166 continue;
3167
3168 // A phi node terminates a chain.
3169 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3170 continue;
3171
3172 // The increment must be loop-invariant so it can be kept in a register.
3173 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3174 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3175 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3176 continue;
3177
3178 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3179 LastIncExpr = IncExpr;
3180 break;
3181 }
3182 }
3183 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3184 // bother for phi nodes, because they must be last in the chain.
3185 if (ChainIdx == NChains) {
3186 if (isa<PHINode>(UserInst))
3187 return;
3188 if (NChains >= MaxChains && !StressIVChain) {
3189 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3190 return;
3191 }
3192 LastIncExpr = OperExpr;
3193 // IVUsers may have skipped over sign/zero extensions. We don't currently
3194 // attempt to form chains involving extensions unless they can be hoisted
3195 // into this loop's AddRec.
3196 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3197 return;
3198 ++NChains;
3199 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3200 OperExprBase));
3201 ChainUsersVec.resize(NChains);
3202 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3203 << ") IV=" << *LastIncExpr << "\n");
3204 } else {
3205 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3206 << ") IV+" << *LastIncExpr << "\n");
3207 // Add this IV user to the end of the chain.
3208 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3209 }
3210 IVChain &Chain = IVChainVec[ChainIdx];
3211
3212 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3213 // This chain's NearUsers become FarUsers.
3214 if (!LastIncExpr->isZero()) {
3215 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3216 NearUsers.end());
3217 NearUsers.clear();
3218 }
3219
3220 // All other uses of IVOperand become near uses of the chain.
3221 // We currently ignore intermediate values within SCEV expressions, assuming
3222 // they will eventually be used by the current chain, or can be computed
3223 // from one of the chain increments. To be more precise we could
3224 // transitively follow its users and only add leaf IV users to the set.
3225 for (User *U : IVOper->users()) {
3226 Instruction *OtherUse = dyn_cast<Instruction>(U);
3227 if (!OtherUse)
3228 continue;
3229 // Uses in the chain will no longer be uses if the chain is formed.
3230 // Include the head of the chain in this iteration (not Chain.begin()).
3231 IVChain::const_iterator IncIter = Chain.Incs.begin();
3232 IVChain::const_iterator IncEnd = Chain.Incs.end();
3233 for( ; IncIter != IncEnd; ++IncIter) {
3234 if (IncIter->UserInst == OtherUse)
3235 break;
3236 }
3237 if (IncIter != IncEnd)
3238 continue;
3239
3240 if (SE.isSCEVable(OtherUse->getType())
3241 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3242 && IU.isIVUserOrOperand(OtherUse)) {
3243 continue;
3244 }
3245 NearUsers.insert(OtherUse);
3246 }
3247
3248 // Since this user is part of the chain, it's no longer considered a use
3249 // of the chain.
3250 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3251}
3252
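// [Editor's illustrative sketch; not part of the LLVM source.] Example of how
// a user is appended to a chain by ChainInstruction, with hypothetical SCEVs:
// if the chain's last IV operand has SCEV {%base,+,4}<%L> and the new user's
// operand has SCEV {(16 + %base),+,4}<%L>, then
//   IncExpr = getMinusSCEV(OperExpr, PrevExpr) = 16,
// which is loop-invariant, so the user is appended as IVInc(UserInst, IVOper, 16).
// Other users of IVOper are remembered as NearUsers until a later non-zero
// increment demotes them to FarUsers.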
3253/// Populate the vector of Chains.
3254///
3255/// This decreases ILP at the architecture level. Targets with ample registers,
3256/// multiple memory ports, and no register renaming probably don't want
3257/// this. However, such targets should probably disable LSR altogether.
3258///
3259/// The job of LSR is to make a reasonable choice of induction variables across
3260/// the loop. Subsequent passes can easily "unchain" computation exposing more
3261/// ILP *within the loop* if the target wants it.
3262///
3263/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3264/// will not reorder memory operations, it will recognize this as a chain, but
3265/// will generate redundant IV increments. Ideally this would be corrected later
3266/// by a smart scheduler:
3267/// = A[i]
3268/// = A[i+x]
3269/// A[i] =
3270/// A[i+x] =
3271///
3272/// TODO: Walk the entire domtree within this loop, not just the path to the
3273/// loop latch. This will discover chains on side paths, but requires
3274/// maintaining multiple copies of the Chains state.
3275void LSRInstance::CollectChains() {
3276 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3277 SmallVector<ChainUsers, 8> ChainUsersVec;
3278
3279 SmallVector<BasicBlock *,8> LatchPath;
3280 BasicBlock *LoopHeader = L->getHeader();
3281 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3282 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3283 LatchPath.push_back(Rung->getBlock());
3284 }
3285 LatchPath.push_back(LoopHeader);
3286
3287 // Walk the instruction stream from the loop header to the loop latch.
3288 for (BasicBlock *BB : reverse(LatchPath)) {
3289 for (Instruction &I : *BB) {
3290 // Skip instructions that weren't seen by IVUsers analysis.
3291 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3292 continue;
3293
3294 // Ignore users that are part of a SCEV expression. This way we only
3295 // consider leaf IV Users. This effectively rediscovers a portion of
3296 // IVUsers analysis but in program order this time.
3297 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3298 continue;
3299
3300 // Remove this instruction from any NearUsers set it may be in.
3301 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3302 ChainIdx < NChains; ++ChainIdx) {
3303 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3304 }
3305 // Search for operands that can be chained.
3306 SmallPtrSet<Instruction*, 4> UniqueOperands;
3307 User::op_iterator IVOpEnd = I.op_end();
3308 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3309 while (IVOpIter != IVOpEnd) {
3310 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3311 if (UniqueOperands.insert(IVOpInst).second)
3312 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3313 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3314 }
3315 } // Continue walking down the instructions.
3316 } // Continue walking down the domtree.
3317 // Visit phi backedges to determine if the chain can generate the IV postinc.
3318 for (PHINode &PN : L->getHeader()->phis()) {
3319 if (!SE.isSCEVable(PN.getType()))
3320 continue;
3321
3322 Instruction *IncV =
3323 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3324 if (IncV)
3325 ChainInstruction(&PN, IncV, ChainUsersVec);
3326 }
3327 // Remove any unprofitable chains.
3328 unsigned ChainIdx = 0;
3329 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3330 UsersIdx < NChains; ++UsersIdx) {
3331 if (!isProfitableChain(IVChainVec[UsersIdx],
3332 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3333 continue;
3334 // Preserve the chain at UsersIdx.
3335 if (ChainIdx != UsersIdx)
3336 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3337 FinalizeChain(IVChainVec[ChainIdx]);
3338 ++ChainIdx;
3339 }
3340 IVChainVec.resize(ChainIdx);
3341}
3342
3343void LSRInstance::FinalizeChain(IVChain &Chain) {
3344 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3345 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3346
3347 for (const IVInc &Inc : Chain) {
3348 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3349 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3350 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3351 IVIncSet.insert(UseI);
3352 }
3353}
3354
3355/// Return true if the IVInc can be folded into an addressing mode.
3356static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3357 Value *Operand, const TargetTransformInfo &TTI) {
3358 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3359 Immediate IncOffset = Immediate::getZero();
3360 if (IncConst) {
3361 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3362 return false;
3363 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3364 } else {
3365 // Look for mul(vscale, constant), to detect a scalable offset.
3366 auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3367 if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3368 !isa<SCEVVScale>(IncVScale->getOperand(1)))
3369 return false;
3370 auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3371 if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3372 return false;
3373 IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
3374 }
3375
3376 if (!isAddressUse(TTI, UserInst, Operand))
3377 return false;
3378
3379 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3380 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3381 IncOffset, /*HasBaseReg=*/false))
3382 return false;
3383
3384 return true;
3385}
3386
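// [Editor's illustrative sketch; not part of the LLVM source.] The check above
// accepts two increment shapes, for example:
//   IncExpr = 8              -> Immediate::getFixed(8)
//   IncExpr = (16 * vscale)  -> Immediate::getScalable(16)
// and then asks, via isAlwaysFoldable, whether that immediate can be folded
// into an addressing mode for the user's access type. Anything else, such as a
// non-constant increment or one wider than 64 bits, is rejected.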
3387/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3388/// user's operand from the previous IV user's operand.
3389 void LSRInstance::GenerateIVChain(const IVChain &Chain,
3390                                   SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3391 // Find the new IVOperand for the head of the chain. It may have been replaced
3392 // by LSR.
3393 const IVInc &Head = Chain.Incs[0];
3394 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3395 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3396 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3397 IVOpEnd, L, SE);
3398 Value *IVSrc = nullptr;
3399 while (IVOpIter != IVOpEnd) {
3400 IVSrc = getWideOperand(*IVOpIter);
3401
3402 // If this operand computes the expression that the chain needs, we may use
3403 // it. (Check this after setting IVSrc which is used below.)
3404 //
3405 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3406 // narrow for the chain, so we can no longer use it. We do allow using a
3407 // wider phi, assuming the LSR checked for free truncation. In that case we
3408 // should already have a truncate on this operand such that
3409 // getSCEV(IVSrc) == IncExpr.
3410 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3411 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3412 break;
3413 }
3414 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3415 }
3416 if (IVOpIter == IVOpEnd) {
3417 // Gracefully give up on this chain.
3418 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3419 return;
3420 }
3421 assert(IVSrc && "Failed to find IV chain source");
3422
3423 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3424 Type *IVTy = IVSrc->getType();
3425 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3426 const SCEV *LeftOverExpr = nullptr;
3427 const SCEV *Accum = SE.getZero(IntTy);
3428 SmallVector<std::pair<const SCEV *, Value *>> Bases;
3429 Bases.emplace_back(Accum, IVSrc);
3430
3431 for (const IVInc &Inc : Chain) {
3432 Instruction *InsertPt = Inc.UserInst;
3433 if (isa<PHINode>(InsertPt))
3434 InsertPt = L->getLoopLatch()->getTerminator();
3435
3436 // IVOper will replace the current IV User's operand. IVSrc is the IV
3437 // value currently held in a register.
3438 Value *IVOper = IVSrc;
3439 if (!Inc.IncExpr->isZero()) {
3440 // IncExpr was the result of subtraction of two narrow values, so must
3441 // be signed.
3442 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3443 Accum = SE.getAddExpr(Accum, IncExpr);
3444 LeftOverExpr = LeftOverExpr ?
3445 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3446 }
3447
3448 // Look through each base to see if any can produce a nice addressing mode.
3449 bool FoundBase = false;
3450 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3451 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3452 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3453 if (!Remainder->isZero()) {
3454 Rewriter.clearPostInc();
3455 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3456 const SCEV *IVOperExpr =
3457 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3458 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3459 } else {
3460 IVOper = MapIVOper;
3461 }
3462
3463 FoundBase = true;
3464 break;
3465 }
3466 }
3467 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3468 // Expand the IV increment.
3469 Rewriter.clearPostInc();
3470 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3471 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3472 SE.getUnknown(IncV));
3473 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3474
3475 // If an IV increment can't be folded, use it as the next IV value.
3476 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3477 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3478 Bases.emplace_back(Accum, IVOper);
3479 IVSrc = IVOper;
3480 LeftOverExpr = nullptr;
3481 }
3482 }
3483 Type *OperTy = Inc.IVOperand->getType();
3484 if (IVTy != OperTy) {
3485 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3486 "cannot extend a chained IV");
3487 IRBuilder<> Builder(InsertPt);
3488 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3489 }
3490 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3491 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3492 DeadInsts.emplace_back(OperandIsInstr);
3493 }
3494 // If LSR created a new, wider phi, we may also replace its postinc. We only
3495 // do this if we also found a wide value for the head of the chain.
3496 if (isa<PHINode>(Chain.tailUserInst())) {
3497 for (PHINode &Phi : L->getHeader()->phis()) {
3498 if (Phi.getType() != IVSrc->getType())
3499 continue;
3500 Instruction *PostIncV = dyn_cast<Instruction>(
3501 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3502 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3503 continue;
3504 Value *IVOper = IVSrc;
3505 Type *PostIncTy = PostIncV->getType();
3506 if (IVTy != PostIncTy) {
3507 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3508 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3509 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3510 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3511 }
3512 Phi.replaceUsesOfWith(PostIncV, IVOper);
3513 DeadInsts.emplace_back(PostIncV);
3514 }
3515 }
3516}
3517
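// [Editor's illustrative sketch; not part of the LLVM source.] Conceptually,
// GenerateIVChain rewrites each link so the user's IV operand is computed from
// the previously materialized chain value rather than from its own expression.
// With hypothetical increments {0, +16, +%s}:
//   - a zero increment simply reuses the current IVSrc;
//   - a +16 increment accepted by canFoldIVIncExpr is expanded as
//     "chain value + 16" and left for the addressing mode to absorb;
//   - a +%s increment that cannot be folded is expanded once, and the
//     resulting value becomes the new IVSrc that carries the rest of the chain.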
3518void LSRInstance::CollectFixupsAndInitialFormulae() {
3519 BranchInst *ExitBranch = nullptr;
3520 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3521
3522 // For calculating baseline cost
3523 SmallPtrSet<const SCEV *, 16> Regs;
3524 DenseSet<const SCEV *> VisitedRegs;
3525 DenseSet<size_t> VisitedLSRUse;
3526
3527 for (const IVStrideUse &U : IU) {
3528 Instruction *UserInst = U.getUser();
3529 // Skip IV users that are part of profitable IV Chains.
3530 User::op_iterator UseI =
3531 find(UserInst->operands(), U.getOperandValToReplace());
3532 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3533 if (IVIncSet.count(UseI)) {
3534 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3535 continue;
3536 }
3537
3538 LSRUse::KindType Kind = LSRUse::Basic;
3539 MemAccessTy AccessTy;
3540 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3541 Kind = LSRUse::Address;
3542 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3543 }
3544
3545 const SCEV *S = IU.getExpr(U);
3546 if (!S)
3547 continue;
3548 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3549
3550 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3551 // (N - i == 0), and this allows (N - i) to be the expression that we work
3552 // with rather than just N or i, so we can consider the register
3553 // requirements for both N and i at the same time. Limiting this code to
3554 // equality icmps is not a problem because all interesting loops use
3555 // equality icmps, thanks to IndVarSimplify.
3556 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3557 // If CI can be saved in some target, like replaced inside hardware loop
3558 // in PowerPC, no need to generate initial formulae for it.
3559 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3560 continue;
3561 if (CI->isEquality()) {
3562 // Swap the operands if needed to put the OperandValToReplace on the
3563 // left, for consistency.
3564 Value *NV = CI->getOperand(1);
3565 if (NV == U.getOperandValToReplace()) {
3566 CI->setOperand(1, CI->getOperand(0));
3567 CI->setOperand(0, NV);
3568 NV = CI->getOperand(1);
3569 Changed = true;
3570 }
3571
3572 // x == y --> x - y == 0
3573 const SCEV *N = SE.getSCEV(NV);
3574 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3575 (!NV->getType()->isPointerTy() ||
3576 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3577 // S is normalized, so normalize N before folding it into S
3578 // to keep the result normalized.
3579 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3580 if (!N)
3581 continue;
3582 Kind = LSRUse::ICmpZero;
3583 S = SE.getMinusSCEV(N, S);
3584 } else if (L->isLoopInvariant(NV) &&
3585 (!isa<Instruction>(NV) ||
3586 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3587 !NV->getType()->isPointerTy()) {
3588 // If we can't generally expand the expression (e.g. it contains
3589 // a divide), but it is already at a loop invariant point before the
3590 // loop, wrap it in an unknown (to prevent the expander from trying
3591 // to re-expand in a potentially unsafe way.) The restriction to
3592 // integer types is required because the unknown hides the base, and
3593 // SCEV can't compute the difference of two unknown pointers.
3594 N = SE.getUnknown(NV);
3595 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3596 if (!N)
3597 continue;
3598 Kind = LSRUse::ICmpZero;
3599 S = SE.getMinusSCEV(N, S);
3600 assert(!isa<SCEVCouldNotCompute>(S));
3601 }
3602
3603 // -1 and the negations of all interesting strides (except the negation
3604 // of -1) are now also interesting.
3605 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3606 if (Factors[i] != -1)
3607 Factors.insert(-(uint64_t)Factors[i]);
3608 Factors.insert(-1);
3609 }
3610 }
3611
3612 // Get or create an LSRUse.
3613 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3614 size_t LUIdx = P.first;
3615 Immediate Offset = P.second;
3616 LSRUse &LU = Uses[LUIdx];
3617
3618 // Record the fixup.
3619 LSRFixup &LF = LU.getNewFixup();
3620 LF.UserInst = UserInst;
3621 LF.OperandValToReplace = U.getOperandValToReplace();
3622 LF.PostIncLoops = TmpPostIncLoops;
3623 LF.Offset = Offset;
3624 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3625
3626 // Create SCEV as Formula for calculating baseline cost
3627 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3628 Formula F;
3629 F.initialMatch(S, L, SE);
3630 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3631 VisitedLSRUse.insert(LUIdx);
3632 }
3633
3634 if (!LU.WidestFixupType ||
3635 SE.getTypeSizeInBits(LU.WidestFixupType) <
3636 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3637 LU.WidestFixupType = LF.OperandValToReplace->getType();
3638
3639 // If this is the first use of this LSRUse, give it a formula.
3640 if (LU.Formulae.empty()) {
3641 InsertInitialFormula(S, LU, LUIdx);
3642 CountRegisters(LU.Formulae.back(), LUIdx);
3643 }
3644 }
3645
3646 LLVM_DEBUG(print_fixups(dbgs()));
3647}
3648
3649/// Insert a formula for the given expression into the given use, separating out
3650/// loop-variant portions from loop-invariant and loop-computable portions.
3651void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3652 size_t LUIdx) {
3653 // Mark uses whose expressions cannot be expanded.
3654 if (!Rewriter.isSafeToExpand(S))
3655 LU.RigidFormula = true;
3656
3657 Formula F;
3658 F.initialMatch(S, L, SE);
3659 bool Inserted = InsertFormula(LU, LUIdx, F);
3660 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3661}
3662
3663/// Insert a simple single-register formula for the given expression into the
3664/// given use.
3665void
3666LSRInstance::InsertSupplementalFormula(const SCEV *S,
3667 LSRUse &LU, size_t LUIdx) {
3668 Formula F;
3669 F.BaseRegs.push_back(S);
3670 F.HasBaseReg = true;
3671 bool Inserted = InsertFormula(LU, LUIdx, F);
3672 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3673}
3674
3675/// Note which registers are used by the given formula, updating RegUses.
3676void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3677 if (F.ScaledReg)
3678 RegUses.countRegister(F.ScaledReg, LUIdx);
3679 for (const SCEV *BaseReg : F.BaseRegs)
3680 RegUses.countRegister(BaseReg, LUIdx);
3681}
3682
3683/// If the given formula has not yet been inserted, add it to the list, and
3684/// return true. Return false otherwise.
3685bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3686 // Do not insert formula that we will not be able to expand.
3687 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3688 "Formula is illegal");
3689
3690 if (!LU.InsertFormula(F, *L))
3691 return false;
3692
3693 CountRegisters(F, LUIdx);
3694 return true;
3695}
3696
3697/// Check for other uses of loop-invariant values which we're tracking. These
3698/// other uses will pin these values in registers, making them less profitable
3699/// for elimination.
3700/// TODO: This currently misses non-constant addrec step registers.
3701/// TODO: Should this give more weight to users inside the loop?
3702void
3703LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3704 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3705 SmallPtrSet<const SCEV *, 32> Visited;
3706
3707 // Don't collect outside uses if we are favoring postinc - the instructions in
3708 // the loop are more important than the ones outside of it.
3709 if (AMK == TTI::AMK_PostIndexed)
3710 return;
3711
3712 while (!Worklist.empty()) {
3713 const SCEV *S = Worklist.pop_back_val();
3714
3715 // Don't process the same SCEV twice
3716 if (!Visited.insert(S).second)
3717 continue;
3718
3719 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3720 append_range(Worklist, N->operands());
3721 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3722 Worklist.push_back(C->getOperand());
3723 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3724 Worklist.push_back(D->getLHS());
3725 Worklist.push_back(D->getRHS());
3726 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3727 const Value *V = US->getValue();
3728 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3729 // Look for instructions defined outside the loop.
3730 if (L->contains(Inst)) continue;
3731 } else if (isa<Constant>(V))
3732 // Constants can be re-materialized.
3733 continue;
3734 for (const Use &U : V->uses()) {
3735 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3736 // Ignore non-instructions.
3737 if (!UserInst)
3738 continue;
3739 // Don't bother if the instruction is an EHPad.
3740 if (UserInst->isEHPad())
3741 continue;
3742 // Ignore instructions in other functions (as can happen with
3743 // Constants).
3744 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3745 continue;
3746 // Ignore instructions not dominated by the loop.
3747 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3748 UserInst->getParent() :
3749 cast<PHINode>(UserInst)->getIncomingBlock(
3750 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3751 if (!DT.dominates(L->getHeader(), UseBB))
3752 continue;
3753 // Don't bother if the instruction is in a BB which ends in an EHPad.
3754 if (UseBB->getTerminator()->isEHPad())
3755 continue;
3756
3757 // Ignore cases in which the currently-examined value could come from
3758 // a basic block terminated with an EHPad. This checks all incoming
3759 // blocks of the phi node since it is possible that the same incoming
3760 // value comes from multiple basic blocks, only some of which may end
3761 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3762 // pass would try to insert instructions into an EHPad, hitting an
3763 // assertion.
3764 if (isa<PHINode>(UserInst)) {
3765 const auto *PhiNode = cast<PHINode>(UserInst);
3766 bool HasIncompatibleEHPTerminatedBlock = false;
3767 llvm::Value *ExpectedValue = U;
3768 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3769 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3770 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3771 HasIncompatibleEHPTerminatedBlock = true;
3772 break;
3773 }
3774 }
3775 }
3776 if (HasIncompatibleEHPTerminatedBlock) {
3777 continue;
3778 }
3779 }
3780
3781 // Don't bother rewriting PHIs in catchswitch blocks.
3782 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3783 continue;
3784 // Ignore uses which are part of other SCEV expressions, to avoid
3785 // analyzing them multiple times.
3786 if (SE.isSCEVable(UserInst->getType())) {
3787 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3788 // If the user is a no-op, look through to its uses.
3789 if (!isa<SCEVUnknown>(UserS))
3790 continue;
3791 if (UserS == US) {
3792 Worklist.push_back(
3793 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3794 continue;
3795 }
3796 }
3797 // Ignore icmp instructions which are already being analyzed.
3798 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3799 unsigned OtherIdx = !U.getOperandNo();
3800 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3801 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3802 continue;
3803 }
3804
3805 std::pair<size_t, Immediate> P =
3806 getUse(S, LSRUse::Basic, MemAccessTy());
3807 size_t LUIdx = P.first;
3808 Immediate Offset = P.second;
3809 LSRUse &LU = Uses[LUIdx];
3810 LSRFixup &LF = LU.getNewFixup();
3811 LF.UserInst = const_cast<Instruction *>(UserInst);
3812 LF.OperandValToReplace = U;
3813 LF.Offset = Offset;
3814 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3815 if (!LU.WidestFixupType ||
3816 SE.getTypeSizeInBits(LU.WidestFixupType) <
3817 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3818 LU.WidestFixupType = LF.OperandValToReplace->getType();
3819 InsertSupplementalFormula(US, LU, LUIdx);
3820 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3821 break;
3822 }
3823 }
3824 }
3825}
3826
3827/// Split S into subexpressions which can be pulled out into separate
3828/// registers. If C is non-null, multiply each subexpression by C.
3829///
3830/// Return remainder expression after factoring the subexpressions captured by
3831/// Ops. If Ops is complete, return NULL.
3832static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3833 SmallVectorImpl<const SCEV *> &Ops,
3834 const Loop *L,
3835 ScalarEvolution &SE,
3836 unsigned Depth = 0) {
3837 // Arbitrarily cap recursion to protect compile time.
3838 if (Depth >= 3)
3839 return S;
3840
3841 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3842 // Break out add operands.
3843 for (const SCEV *S : Add->operands()) {
3844 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3845 if (Remainder)
3846 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3847 }
3848 return nullptr;
3849 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3850 // Split a non-zero base out of an addrec.
3851 if (AR->getStart()->isZero() || !AR->isAffine())
3852 return S;
3853
3854 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3855 C, Ops, L, SE, Depth+1);
3856 // Split the non-zero AddRec unless it is part of a nested recurrence that
3857 // does not pertain to this loop.
3858 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3859 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3860 Remainder = nullptr;
3861 }
3862 if (Remainder != AR->getStart()) {
3863 if (!Remainder)
3864 Remainder = SE.getConstant(AR->getType(), 0);
3865 return SE.getAddRecExpr(Remainder,
3866 AR->getStepRecurrence(SE),
3867 AR->getLoop(),
3868 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3869 SCEV::FlagAnyWrap);
3870 }
3871 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3872 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3873 if (Mul->getNumOperands() != 2)
3874 return S;
3875 if (const SCEVConstant *Op0 =
3876 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3877 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3878 const SCEV *Remainder =
3879 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3880 if (Remainder)
3881 Ops.push_back(SE.getMulExpr(C, Remainder));
3882 return nullptr;
3883 }
3884 }
3885 return S;
3886}
3887
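// [Editor's illustrative sketch; not part of the LLVM source.] Example of the
// splitting performed by CollectSubexprs, with hypothetical SCEVs: for
//   S = {(%a + 4),+,8}<%L>   (C == nullptr)
// the non-zero start (%a + 4) is decomposed, Ops receives %a and 4, and the
// function returns {0,+,8}<%L> as the remainder, which the caller then treats
// as a register of its own. A multiply such as (2 * (%a + 4)) is distributed
// through its operands instead, yielding (2 * %a) and 8.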
3888/// Return true if the SCEV represents a value that may end up as a
3889/// post-increment operation.
3890static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3891                              LSRUse &LU, const SCEV *S, const Loop *L,
3892 ScalarEvolution &SE) {
3893 if (LU.Kind != LSRUse::Address ||
3894 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3895 return false;
3896 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3897 if (!AR)
3898 return false;
3899 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3900 if (!isa<SCEVConstant>(LoopStep))
3901 return false;
3902 // Check if a post-indexed load/store can be used.
3903 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3904     TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3905 const SCEV *LoopStart = AR->getStart();
3906 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3907 return true;
3908 }
3909 return false;
3910}
3911
3912/// Helper function for LSRInstance::GenerateReassociations.
3913void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3914 const Formula &Base,
3915 unsigned Depth, size_t Idx,
3916 bool IsScaledReg) {
3917 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3918 // Don't generate reassociations for the base register of a value that
3919 // may generate a post-increment operator. The reason is that the
3920 // reassociations cause extra base+register formulae to be created,
3921 // and possibly chosen, but the post-increment is more efficient.
3922 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3923 return;
3924 SmallVector<const SCEV *, 8> AddOps;
3925 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3926 if (Remainder)
3927 AddOps.push_back(Remainder);
3928
3929 if (AddOps.size() == 1)
3930 return;
3931
3932 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3933                                                    JE = AddOps.end();
3934 J != JE; ++J) {
3935 // Loop-variant "unknown" values are uninteresting; we won't be able to
3936 // do anything meaningful with them.
3937 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3938 continue;
3939
3940 // Don't pull a constant into a register if the constant could be folded
3941 // into an immediate field.
3942 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3943 LU.AccessTy, *J, Base.getNumRegs() > 1))
3944 continue;
3945
3946 // Collect all operands except *J.
3947 SmallVector<const SCEV *, 8> InnerAddOps(
3948 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3949 InnerAddOps.append(std::next(J),
3950 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3951
3952 // Don't leave just a constant behind in a register if the constant could
3953 // be folded into an immediate field.
3954 if (InnerAddOps.size() == 1 &&
3955 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3956 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3957 continue;
3958
3959 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3960 if (InnerSum->isZero())
3961 continue;
3962 Formula F = Base;
3963
3964 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3965 continue;
3966
3967 // Add the remaining pieces of the add back into the new formula.
3968 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3969 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3970 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3971 InnerSumSC->getValue()->getZExtValue())) {
3972 F.UnfoldedOffset =
3973 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3974 InnerSumSC->getValue()->getZExtValue());
3975 if (IsScaledReg)
3976 F.ScaledReg = nullptr;
3977 else
3978 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3979 } else if (IsScaledReg)
3980 F.ScaledReg = InnerSum;
3981 else
3982 F.BaseRegs[Idx] = InnerSum;
3983
3984 // Add J as its own register, or an unfolded immediate.
3985 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3986 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3987 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3988 SC->getValue()->getZExtValue()))
3989 F.UnfoldedOffset =
3990 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3991 SC->getValue()->getZExtValue());
3992 else
3993 F.BaseRegs.push_back(*J);
3994 // We may have changed the number of register in base regs, adjust the
3995 // formula accordingly.
3996 F.canonicalize(*L);
3997
3998 if (InsertFormula(LU, LUIdx, F))
3999 // If that formula hadn't been seen before, recurse to find more like
4000 // it.
4001 // Add a check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2 -
4002 // because Depth alone is not enough to bound compile time.
4003 // This means that every time AddOps.size() is greater than 16^x we will
4004 // add x to Depth.
4005 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4006 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4007 }
4008}
4009
4010/// Split out subexpressions from adds and the bases of addrecs.
4011void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4012 Formula Base, unsigned Depth) {
4013 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4014 // Arbitrarily cap recursion to protect compile time.
4015 if (Depth >= 3)
4016 return;
4017
4018 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4019 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4020
4021 if (Base.Scale == 1)
4022 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4023 /* Idx */ -1, /* IsScaledReg */ true);
4024}
4025
4026/// Generate a formula consisting of all of the loop-dominating registers added
4027/// into a single register.
4028void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4029 Formula Base) {
4030 // This method is only interesting on a plurality of registers.
4031 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4032 (Base.UnfoldedOffset.isNonZero()) <=
4033 1)
4034 return;
4035
4036 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4037 // processing the formula.
4038 Base.unscale();
4039 SmallVector<const SCEV *, 4> Ops;
4040 Formula NewBase = Base;
4041 NewBase.BaseRegs.clear();
4042 Type *CombinedIntegerType = nullptr;
4043 for (const SCEV *BaseReg : Base.BaseRegs) {
4044 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4045 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4046 if (!CombinedIntegerType)
4047 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4048 Ops.push_back(BaseReg);
4049 }
4050 else
4051 NewBase.BaseRegs.push_back(BaseReg);
4052 }
4053
4054 // If no register is relevant, we're done.
4055 if (Ops.size() == 0)
4056 return;
4057
4058 // Utility function for generating the required variants of the combined
4059 // registers.
4060 auto GenerateFormula = [&](const SCEV *Sum) {
4061 Formula F = NewBase;
4062
4063 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4064 // opportunity to fold something. For now, just ignore such cases
4065 // rather than proceed with zero in a register.
4066 if (Sum->isZero())
4067 return;
4068
4069 F.BaseRegs.push_back(Sum);
4070 F.canonicalize(*L);
4071 (void)InsertFormula(LU, LUIdx, F);
4072 };
4073
4074 // If we collected at least two registers, generate a formula combining them.
4075 if (Ops.size() > 1) {
4076 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4077 GenerateFormula(SE.getAddExpr(OpsCopy));
4078 }
4079
4080 // If we have an unfolded offset, generate a formula combining it with the
4081 // registers collected.
4082 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4083 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4084 Ops.push_back(SE.getConstant(CombinedIntegerType,
4085 NewBase.UnfoldedOffset.getFixedValue(), true));
4086 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4087 GenerateFormula(SE.getAddExpr(Ops));
4088 }
4089}
4090
4091/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4092void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4093 const Formula &Base, size_t Idx,
4094 bool IsScaledReg) {
4095 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4096 GlobalValue *GV = ExtractSymbol(G, SE);
4097 if (G->isZero() || !GV)
4098 return;
4099 Formula F = Base;
4100 F.BaseGV = GV;
4101 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4102 return;
4103 if (IsScaledReg)
4104 F.ScaledReg = G;
4105 else
4106 F.BaseRegs[Idx] = G;
4107 (void)InsertFormula(LU, LUIdx, F);
4108}
4109
4110/// Generate reuse formulae using symbolic offsets.
4111void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4112 Formula Base) {
4113 // We can't add a symbolic offset if the address already contains one.
4114 if (Base.BaseGV) return;
4115
4116 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4117 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4118 if (Base.Scale == 1)
4119 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4120 /* IsScaledReg */ true);
4121}
4122
4123/// Helper function for LSRInstance::GenerateConstantOffsets.
4124void LSRInstance::GenerateConstantOffsetsImpl(
4125 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4126 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4127
4128 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4129 Formula F = Base;
4130 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4131 return;
4132 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4133
4134 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4135 // Add the offset to the base register.
4136 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4137 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4138 // If it cancelled out, drop the base register, otherwise update it.
4139 if (NewG->isZero()) {
4140 if (IsScaledReg) {
4141 F.Scale = 0;
4142 F.ScaledReg = nullptr;
4143 } else
4144 F.deleteBaseReg(F.BaseRegs[Idx]);
4145 F.canonicalize(*L);
4146 } else if (IsScaledReg)
4147 F.ScaledReg = NewG;
4148 else
4149 F.BaseRegs[Idx] = NewG;
4150
4151 (void)InsertFormula(LU, LUIdx, F);
4152 }
4153 };
4154
4155 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4156
4157 // With constant offsets and constant steps, we can generate pre-inc
4158 // accesses by having the offset equal the step. So, for access #0 with a
4159 // step of 8, we generate a G - 8 base which would require the first access
4160 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4161 // for itself and hopefully becomes the base for other accesses. This
4162 // means that a single pre-indexed access can be generated to become the new
4163 // base pointer for each iteration of the loop, resulting in no extra add/sub
4164 // instructions for pointer updating.
4165 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4166 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
4167 if (auto *StepRec =
4168 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
4169 const APInt &StepInt = StepRec->getAPInt();
4170 int64_t Step = StepInt.isNegative() ?
4171 StepInt.getSExtValue() : StepInt.getZExtValue();
4172
4173 for (Immediate Offset : Worklist) {
4174 if (Offset.isFixed()) {
4175 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4176 GenerateOffset(G, Offset);
4177 }
4178 }
4179 }
4180 }
4181 }
4182 for (Immediate Offset : Worklist)
4183 GenerateOffset(G, Offset);
4184
4185 Immediate Imm = ExtractImmediate(G, SE);
4186 if (G->isZero() || Imm.isZero() ||
4187 !Base.BaseOffset.isCompatibleImmediate(Imm))
4188 return;
4189 Formula F = Base;
4190 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4191 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4192 return;
4193 if (IsScaledReg) {
4194 F.ScaledReg = G;
4195 } else {
4196 F.BaseRegs[Idx] = G;
4197 // We may generate non canonical Formula if G is a recurrent expr reg
4198 // related with current loop while F.ScaledReg is not.
4199 F.canonicalize(*L);
4200 }
4201 (void)InsertFormula(LU, LUIdx, F);
4202}
4203
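// [Editor's illustrative sketch; not part of the LLVM source.] For the
// AMK_PreIndexed path above, take a hypothetical address register G whose
// addrec steps by 8 and a worklist offset of 0. The generated candidate uses
// offset -8, so the base register becomes (G - 8) and the formula's immediate
// grows by +8: the first access computes ((G - 8) + 8), and later accesses can
// be formed by pre-indexed updates of that same pointer, as the comment above
// describes.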
4204/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4205void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4206 Formula Base) {
4207 // TODO: For now, just add the min and max offset, because it usually isn't
4208 // worthwhile looking at everything in between.
4209 SmallVector<Immediate, 2> Worklist;
4210 Worklist.push_back(LU.MinOffset);
4211 if (LU.MaxOffset != LU.MinOffset)
4212 Worklist.push_back(LU.MaxOffset);
4213
4214 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4215 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4216 if (Base.Scale == 1)
4217 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4218 /* IsScaledReg */ true);
4219}
4220
4221/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4222/// == y -> x*c == y*c.
4223void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4224 Formula Base) {
4225 if (LU.Kind != LSRUse::ICmpZero) return;
4226
4227 // Determine the integer type for the base formula.
4228 Type *IntTy = Base.getType();
4229 if (!IntTy) return;
4230 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4231
4232 // Don't do this if there is more than one offset.
4233 if (LU.MinOffset != LU.MaxOffset) return;
4234
4235 // Check if transformation is valid. It is illegal to multiply pointer.
4236 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4237 return;
4238 for (const SCEV *BaseReg : Base.BaseRegs)
4239 if (BaseReg->getType()->isPointerTy())
4240 return;
4241 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4242
4243 // Check each interesting stride.
4244 for (int64_t Factor : Factors) {
4245 // Check that Factor can be represented by IntTy
4246 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4247 continue;
4248 // Check that the multiplication doesn't overflow.
4249 if (Base.BaseOffset.isMin() && Factor == -1)
4250 continue;
4251 // Not supporting scalable immediates.
4252 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4253 continue;
4254 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4255 assert(Factor != 0 && "Zero factor not expected!");
4256 if (NewBaseOffset.getFixedValue() / Factor !=
4257 Base.BaseOffset.getFixedValue())
4258 continue;
4259 // If the offset will be truncated at this use, check that it is in bounds.
4260 if (!IntTy->isPointerTy() &&
4261 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4262 continue;
4263
4264 // Check that multiplying with the use offset doesn't overflow.
4265 Immediate Offset = LU.MinOffset;
4266 if (Offset.isMin() && Factor == -1)
4267 continue;
4268 Offset = Offset.mulUnsigned(Factor);
4269 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4270 continue;
4271 // If the offset will be truncated at this use, check that it is in bounds.
4272 if (!IntTy->isPointerTy() &&
4273 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4274 continue;
4275
4276 Formula F = Base;
4277 F.BaseOffset = NewBaseOffset;
4278
4279 // Check that this scale is legal.
4280 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4281 continue;
4282
4283 // Compensate for the use having MinOffset built into it.
4284 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4285
4286 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4287
4288 // Check that multiplying with each base register doesn't overflow.
4289 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4290 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4291 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4292 goto next;
4293 }
4294
4295 // Check that multiplying with the scaled register doesn't overflow.
4296 if (F.ScaledReg) {
4297 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4298 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4299 continue;
4300 }
4301
4302 // Check that multiplying with the unfolded offset doesn't overflow.
4303 if (F.UnfoldedOffset.isNonZero()) {
4304 if (F.UnfoldedOffset.isMin() && Factor == -1)
4305 continue;
4306 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4307 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4308 Base.UnfoldedOffset.getFixedValue())
4309 continue;
4310 // If the offset will be truncated, check that it is in bounds.
4311 if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
4312         IntTy, F.UnfoldedOffset.getFixedValue()))
4313 continue;
4314 }
4315
4316 // If we make it here and it's legal, add it.
4317 (void)InsertFormula(LU, LUIdx, F);
4318 next:;
4319 }
4320}
4321
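// [Editor's illustrative sketch; not part of the LLVM source.] For an ICmpZero
// use, the scaling above multiplies the whole compared expression by one of
// the interesting factors. With hypothetical values, if the use compares
//   ({0,+,1}<%L> - %n) == 0   and   Factor == 4,
// the new candidate compares ({0,+,4}<%L> - 4*%n) == 0, letting the loop reuse
// an IV that already strides by 4; every base register, offset, and unfolded
// offset is multiplied by the factor, with each step checked for overflow.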
4322/// Generate stride factor reuse formulae by making use of scaled-offset address
4323/// modes, for example.
4324void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4325 // Determine the integer type for the base formula.
4326 Type *IntTy = Base.getType();
4327 if (!IntTy) return;
4328
4329 // If this Formula already has a scaled register, we can't add another one.
4330 // Try to unscale the formula to generate a better scale.
4331 if (Base.Scale != 0 && !Base.unscale())
4332 return;
4333
4334 assert(Base.Scale == 0 && "unscale did not do its job!");
4335
4336 // Check each interesting stride.
4337 for (int64_t Factor : Factors) {
4338 Base.Scale = Factor;
4339 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4340 // Check whether this scale is going to be legal.
4341 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4342 Base)) {
4343 // As a special case, handle out-of-loop Basic users specially.
4344 // TODO: Reconsider this special case.
4345 if (LU.Kind == LSRUse::Basic &&
4346 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4347 LU.AccessTy, Base) &&
4348 LU.AllFixupsOutsideLoop)
4349 LU.Kind = LSRUse::Special;
4350 else
4351 continue;
4352 }
4353 // For an ICmpZero, negating a solitary base register won't lead to
4354 // new solutions.
4355 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4356 Base.BaseOffset.isZero() && !Base.BaseGV)
4357 continue;
4358 // For each addrec base reg, if its loop is current loop, apply the scale.
4359 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4360 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4361 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4362 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4363 if (FactorS->isZero())
4364 continue;
4365 // Divide out the factor, ignoring high bits, since we'll be
4366 // scaling the value back up in the end.
4367 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4368 if (!Quotient->isZero()) {
4369 // TODO: This could be optimized to avoid all the copying.
4370 Formula F = Base;
4371 F.ScaledReg = Quotient;
4372 F.deleteBaseReg(F.BaseRegs[i]);
4373 // The canonical representation of 1*reg is reg, which is already in
4374 // Base. In that case, do not try to insert the formula, it will be
4375 // rejected anyway.
4376 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4377 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4378 continue;
4379 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4380 // non canonical Formula with ScaledReg's loop not being L.
4381 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4382 F.canonicalize(*L);
4383 (void)InsertFormula(LU, LUIdx, F);
4384 }
4385 }
4386 }
4387 }
4388}
4389
4390/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4391/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4392/// perform the extension/truncate and normalize again, as the normalized form
4393/// can result in folds that are not valid in the post-inc use contexts. The
4394/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4395static const SCEV *
4396getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4397                                   const SCEV *Expr, Type *ToTy,
4398 ScalarEvolution &SE) {
4399 const SCEV *Result = nullptr;
4400 for (auto &L : Loops) {
4401 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4402 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4403 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4404 if (!New || (Result && New != Result))
4405 return nullptr;
4406 Result = New;
4407 }
4408
4409 assert(Result && "failed to create expression");
4410 return Result;
4411}
4412
4413/// Generate reuse formulae from different IV types.
4414void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4415 // Don't bother truncating symbolic values.
4416 if (Base.BaseGV) return;
4417
4418 // Determine the integer type for the base formula.
4419 Type *DstTy = Base.getType();
4420 if (!DstTy) return;
4421 if (DstTy->isPointerTy())
4422 return;
4423
4424 // It is invalid to extend a pointer type so exit early if ScaledReg or
4425 // any of the BaseRegs are pointers.
4426 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4427 return;
4428 if (any_of(Base.BaseRegs,
4429 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4430 return;
4431
4432 SmallVector<PostIncLoopSet> Loops;
4433 for (auto &LF : LU.Fixups)
4434 Loops.push_back(LF.PostIncLoops);
4435
4436 for (Type *SrcTy : Types) {
4437 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4438 Formula F = Base;
4439
4440 // Sometimes SCEV is able to prove zero during ext transform. It may
4441 // happen if SCEV did not do all possible transforms while creating the
4442 // initial node (maybe due to depth limitations), but it can do them while
4443 // taking ext.
4444 if (F.ScaledReg) {
4445 const SCEV *NewScaledReg =
4446 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4447 if (!NewScaledReg || NewScaledReg->isZero())
4448 continue;
4449 F.ScaledReg = NewScaledReg;
4450 }
4451 bool HasZeroBaseReg = false;
4452 for (const SCEV *&BaseReg : F.BaseRegs) {
4453 const SCEV *NewBaseReg =
4454 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4455 if (!NewBaseReg || NewBaseReg->isZero()) {
4456 HasZeroBaseReg = true;
4457 break;
4458 }
4459 BaseReg = NewBaseReg;
4460 }
4461 if (HasZeroBaseReg)
4462 continue;
4463
4464 // TODO: This assumes we've done basic processing on all uses and
4465 // have an idea what the register usage is.
4466 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4467 continue;
4468
4469 F.canonicalize(*L);
4470 (void)InsertFormula(LU, LUIdx, F);
4471 }
4472 }
4473}
4474
4475namespace {
4476
4477/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4478/// modifications so that the search phase doesn't have to worry about the data
4479/// structures moving underneath it.
4480struct WorkItem {
4481 size_t LUIdx;
4482 Immediate Imm;
4483 const SCEV *OrigReg;
4484
4485 WorkItem(size_t LI, Immediate I, const SCEV *R)
4486 : LUIdx(LI), Imm(I), OrigReg(R) {}
4487
4488 void print(raw_ostream &OS) const;
4489 void dump() const;
4490};
4491
4492} // end anonymous namespace
4493
4494#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4495void WorkItem::print(raw_ostream &OS) const {
4496 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4497 << " , add offset " << Imm;
4498}
4499
4500LLVM_DUMP_METHOD void WorkItem::dump() const {
4501 print(errs()); errs() << '\n';
4502}
4503#endif
4504
4505/// Look for registers which are a constant distance apart and try to form reuse
4506/// opportunities between them.
4507void LSRInstance::GenerateCrossUseConstantOffsets() {
4508 // Group the registers by their value without any added constant offset.
4509 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4510
4511 DenseMap<const SCEV *, ImmMapTy> Map;
4512 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4513 SmallVector<const SCEV *, 8> Sequence;
4514 for (const SCEV *Use : RegUses) {
4515 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4516 Immediate Imm = ExtractImmediate(Reg, SE);
4517 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4518 if (Pair.second)
4519 Sequence.push_back(Reg);
4520 Pair.first->second.insert(std::make_pair(Imm, Use));
4521 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4522 }
4523
4524 // Now examine each set of registers with the same base value. Build up
4525 // a list of work to do and do the work in a separate step so that we're
4526 // not adding formulae and register counts while we're searching.
4527 SmallVector<WorkItem, 32> WorkItems;
4528 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4529 UniqueItems;
4530 for (const SCEV *Reg : Sequence) {
4531 const ImmMapTy &Imms = Map.find(Reg)->second;
4532
4533 // It's not worthwhile looking for reuse if there's only one offset.
4534 if (Imms.size() == 1)
4535 continue;
4536
4537 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4538 for (const auto &Entry
4539 : Imms) dbgs()
4540 << ' ' << Entry.first;
4541 dbgs() << '\n');
4542
4543 // Examine each offset.
4544 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4545 J != JE; ++J) {
4546 const SCEV *OrigReg = J->second;
4547
4548 Immediate JImm = J->first;
4549 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4550
4551 if (!isa<SCEVConstant>(OrigReg) &&
4552 UsedByIndicesMap[Reg].count() == 1) {
4553 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4554 << '\n');
4555 continue;
4556 }
4557
4558 // Conservatively examine offsets between this orig reg and a few selected
4559 // other orig regs.
4560 Immediate First = Imms.begin()->first;
4561 Immediate Last = std::prev(Imms.end())->first;
4562 if (!First.isCompatibleImmediate(Last)) {
4563 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4564 << "\n");
4565 continue;
4566 }
4567 // Only scalable if both terms are scalable, or if one is scalable and
4568 // the other is 0.
4569 bool Scalable = First.isScalable() || Last.isScalable();
4570 int64_t FI = First.getKnownMinValue();
4571 int64_t LI = Last.getKnownMinValue();
4572 // Compute (First + Last) / 2 without overflow using the fact that
4573 // First + Last = 2 * (First & Last) + (First ^ Last).
4574 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4575 // If the result is negative and FI is odd and LI even (or vice versa),
4576 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4577 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4578 ImmMapTy::const_iterator OtherImms[] = {
4579 Imms.begin(), std::prev(Imms.end()),
4580 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4581 for (const auto &M : OtherImms) {
4582 if (M == J || M == JE) continue;
4583 if (!JImm.isCompatibleImmediate(M->first))
4584 continue;
4585
4586 // Compute the difference between the two.
4587 Immediate Imm = JImm.subUnsigned(M->first);
4588 for (unsigned LUIdx : UsedByIndices.set_bits())
4589 // Make a memo of this use, offset, and register tuple.
4590 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4591 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4592 }
4593 }
4594 }
4595
4596 Map.clear();
4597 Sequence.clear();
4598 UsedByIndicesMap.clear();
4599 UniqueItems.clear();
4600
4601 // Now iterate through the worklist and add new formulae.
4602 for (const WorkItem &WI : WorkItems) {
4603 size_t LUIdx = WI.LUIdx;
4604 LSRUse &LU = Uses[LUIdx];
4605 Immediate Imm = WI.Imm;
4606 const SCEV *OrigReg = WI.OrigReg;
4607
4608 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4609 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4610 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4611
4612 // TODO: Use a more targeted data structure.
4613 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4614 Formula F = LU.Formulae[L];
4615 // FIXME: The code for the scaled and unscaled registers looks
4616 // very similar but slightly different. Investigate if they
4617 // could be merged. That way, we would not have to unscale the
4618 // Formula.
4619 F.unscale();
4620 // Use the immediate in the scaled register.
4621 if (F.ScaledReg == OrigReg) {
4622 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4623 continue;
4624 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4625 // Don't create 50 + reg(-50).
4626 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4627 if (F.referencesReg(S))
4628 continue;
4629 Formula NewF = F;
4630 NewF.BaseOffset = Offset;
4631 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4632 NewF))
4633 continue;
4634 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4635
4636 // If the new scale is a constant in a register, and adding the constant
4637 // value to the immediate would produce a value closer to zero than the
4638 // immediate itself, then the formula isn't worthwhile.
4639 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4640 // FIXME: Do we need to do something for scalable immediates here?
4641 // A scalable SCEV won't be constant, but we might still have
4642 // something in the offset? Bail out for now to be safe.
4643 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4644 continue;
4645 if (C->getValue()->isNegative() !=
4646 (NewF.BaseOffset.isLessThanZero()) &&
4647 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4648 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4649 continue;
4650 }
4651
4652 // OK, looks good.
4653 NewF.canonicalize(*this->L);
4654 (void)InsertFormula(LU, LUIdx, NewF);
4655 } else {
4656 // Use the immediate in a base register.
4657 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4658 const SCEV *BaseReg = F.BaseRegs[N];
4659 if (BaseReg != OrigReg)
4660 continue;
4661 Formula NewF = F;
4662 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4663 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4664 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4665 continue;
4666 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4667 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4668 LU.Kind, LU.AccessTy, NewF)) {
4669 if (AMK == TTI::AMK_PostIndexed &&
4670 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4671 continue;
4672 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4673 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4674 continue;
4675 NewF = F;
4676 NewF.UnfoldedOffset = NewUnfoldedOffset;
4677 }
4678 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4679
4680 // If the new formula has a constant in a register, and adding the
4681 // constant value to the immediate would produce a value closer to
4682 // zero than the immediate itself, then the formula isn't worthwhile.
4683 for (const SCEV *NewReg : NewF.BaseRegs)
4684 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4685 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4686 goto skip_formula;
4687 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4688 .abs()
4689 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4690 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4691 .countr_zero() >=
4692 (unsigned)llvm::countr_zero<uint64_t>(
4693 NewF.BaseOffset.getFixedValue()))
4694 goto skip_formula;
4695 }
4696
4697 // Ok, looks good.
4698 NewF.canonicalize(*this->L);
4699 (void)InsertFormula(LU, LUIdx, NewF);
4700 break;
4701 skip_formula:;
4702 }
4703 }
4704 }
4705 }
4706}
4707
4708/// Generate formulae for each use.
4709void
4710LSRInstance::GenerateAllReuseFormulae() {
4711 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4712 // queries are more precise.
4713 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4714 LSRUse &LU = Uses[LUIdx];
4715 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4716 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4717 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4718 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4719 }
4720 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4721 LSRUse &LU = Uses[LUIdx];
4722 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4723 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4724 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4725 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4726 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4727 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4728 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4730 }
4731 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4732 LSRUse &LU = Uses[LUIdx];
4733 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4734 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4735 }
4736
4737 GenerateCrossUseConstantOffsets();
4738
4739 LLVM_DEBUG(dbgs() << "\n"
4740 "After generating reuse formulae:\n";
4741 print_uses(dbgs()));
4742}
4743
4744/// If there are multiple formulae with the same set of registers used
4745/// by other uses, pick the best one and delete the others.
4746void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4747 DenseSet<const SCEV *> VisitedRegs;
4748 SmallPtrSet<const SCEV *, 16> Regs;
4749 SmallPtrSet<const SCEV *, 16> LoserRegs;
4750#ifndef NDEBUG
4751 bool ChangedFormulae = false;
4752#endif
4753
4754 // Collect the best formula for each unique set of shared registers. This
4755 // is reset for each use.
4756 using BestFormulaeTy =
4757 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4758
4759 BestFormulaeTy BestFormulae;
4760
4761 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4762 LSRUse &LU = Uses[LUIdx];
4763 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4764 dbgs() << '\n');
4765
4766 bool Any = false;
4767 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4768 FIdx != NumForms; ++FIdx) {
4769 Formula &F = LU.Formulae[FIdx];
4770
4771 // Some formulas are instant losers. For example, they may depend on
4772 // nonexistent AddRecs from other loops. These need to be filtered
4773 // immediately, otherwise heuristics could choose them over others leading
4774 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4775 // avoids the need to recompute this information across formulae using the
4776 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4777 // the corresponding bad register from the Regs set.
4778 Cost CostF(L, SE, TTI, AMK);
4779 Regs.clear();
4780 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4781 if (CostF.isLoser()) {
4782 // During initial formula generation, undesirable formulae are generated
4783 // by uses within other loops that have some non-trivial address mode or
4784 // use the postinc form of the IV. LSR needs to provide these formulae
4785 // as the basis of rediscovering the desired formula that uses an AddRec
4786 // corresponding to the existing phi. Once all formulae have been
4787 // generated, these initial losers may be pruned.
4788 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4789 dbgs() << "\n");
4790 }
4791 else {
4792 SmallVector<const SCEV *, 4> Key;
4793 for (const SCEV *Reg : F.BaseRegs) {
4794 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4795 Key.push_back(Reg);
4796 }
4797 if (F.ScaledReg &&
4798 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4799 Key.push_back(F.ScaledReg);
4800 // Unstable sort by host order ok, because this is only used for
4801 // uniquifying.
4802 llvm::sort(Key);
4803
4804 std::pair<BestFormulaeTy::const_iterator, bool> P =
4805 BestFormulae.insert(std::make_pair(Key, FIdx));
4806 if (P.second)
4807 continue;
4808
4809 Formula &Best = LU.Formulae[P.first->second];
4810
4811 Cost CostBest(L, SE, TTI, AMK);
4812 Regs.clear();
4813 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4814 if (CostF.isLess(CostBest))
4815 std::swap(F, Best);
4816 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4817 dbgs() << "\n"
4818 " in favor of formula ";
4819 Best.print(dbgs()); dbgs() << '\n');
4820 }
4821#ifndef NDEBUG
4822 ChangedFormulae = true;
4823#endif
4824 LU.DeleteFormula(F);
4825 --FIdx;
4826 --NumForms;
4827 Any = true;
4828 }
4829
4830 // Now that we've filtered out some formulae, recompute the Regs set.
4831 if (Any)
4832 LU.RecomputeRegs(LUIdx, RegUses);
4833
4834 // Reset this to prepare for the next use.
4835 BestFormulae.clear();
4836 }
4837
4838 LLVM_DEBUG(if (ChangedFormulae) {
4839 dbgs() << "\n"
4840 "After filtering out undesirable candidates:\n";
4841 print_uses(dbgs());
4842 });
4843}
4844
4845/// Estimate the worst-case number of solutions the solver might have to
4846 /// consider. It almost never considers this many solutions because it prunes the
4847/// search space, but the pruning isn't always sufficient.
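/// For example, with three uses that have 3, 4, and 5 formulae respectively,
/// the worst case is 3 * 4 * 5 = 60 candidate solutions; the estimate stops
/// growing once it reaches ComplexityLimit.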
4848size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4849 size_t Power = 1;
4850 for (const LSRUse &LU : Uses) {
4851 size_t FSize = LU.Formulae.size();
4852 if (FSize >= ComplexityLimit) {
4853 Power = ComplexityLimit;
4854 break;
4855 }
4856 Power *= FSize;
4857 if (Power >= ComplexityLimit)
4858 break;
4859 }
4860 return Power;
4861}
4862
4863/// When one formula uses a superset of the registers of another formula, it
4864/// won't help reduce register pressure (though it may not necessarily hurt
4865/// register pressure); remove it to simplify the system.
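/// For example, if one formula for a use keeps both reg(x) and a register
/// holding the constant 42, while another formula for the same use keeps only
/// reg(x) and folds the 42 into its immediate offset, the first is a register
/// superset of the second and can be dropped.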
4866void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4867 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4868 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4869
4870 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4871 "which use a superset of registers used by other "
4872 "formulae.\n");
4873
4874 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4875 LSRUse &LU = Uses[LUIdx];
4876 bool Any = false;
4877 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4878 Formula &F = LU.Formulae[i];
4879 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4880 continue;
4881 // Look for a formula with a constant or GV in a register. If the use
4882 // also has a formula with that same value in an immediate field,
4883 // delete the one that uses a register.
4884 for (SmallVectorImpl<const SCEV *>::const_iterator
4885 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4886 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4887 Formula NewF = F;
4888 //FIXME: Formulas should store bitwidth to do wrapping properly.
4889 // See PR41034.
4890 NewF.BaseOffset =
4891 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4892 (uint64_t)C->getValue()->getSExtValue());
4893 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4894 (I - F.BaseRegs.begin()));
4895 if (LU.HasFormulaWithSameRegs(NewF)) {
4896 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4897 dbgs() << '\n');
4898 LU.DeleteFormula(F);
4899 --i;
4900 --e;
4901 Any = true;
4902 break;
4903 }
4904 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4905 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4906 if (!F.BaseGV) {
4907 Formula NewF = F;
4908 NewF.BaseGV = GV;
4909 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4910 (I - F.BaseRegs.begin()));
4911 if (LU.HasFormulaWithSameRegs(NewF)) {
4912 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4913 dbgs() << '\n');
4914 LU.DeleteFormula(F);
4915 --i;
4916 --e;
4917 Any = true;
4918 break;
4919 }
4920 }
4921 }
4922 }
4923 }
4924 if (Any)
4925 LU.RecomputeRegs(LUIdx, RegUses);
4926 }
4927
4928 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4929 }
4930}
4931
4932/// When there are many registers for expressions like A, A+1, A+2, etc.,
4933/// allocate a single register for them.
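/// For example, after 4x unrolling, addresses of the form A, A+1, A+2, and A+3
/// can usually all be served by one register holding A, with the +1/+2/+3
/// becoming immediate offsets on the merged use's fixups.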
4934void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4935 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4936 return;
4937
4938 LLVM_DEBUG(
4939 dbgs() << "The search space is too complex.\n"
4940 "Narrowing the search space by assuming that uses separated "
4941 "by a constant offset will use the same registers.\n");
4942
4943 // This is especially useful for unrolled loops.
4944
4945 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4946 LSRUse &LU = Uses[LUIdx];
4947 for (const Formula &F : LU.Formulae) {
4948 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4949 continue;
4950
4951 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4952 if (!LUThatHas)
4953 continue;
4954
4955 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4956 LU.Kind, LU.AccessTy))
4957 continue;
4958
4959 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4960
4961 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4962
4963 // Transfer the fixups of LU to LUThatHas.
4964 for (LSRFixup &Fixup : LU.Fixups) {
4965 Fixup.Offset += F.BaseOffset;
4966 LUThatHas->pushFixup(Fixup);
4967 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4968 }
4969
4970 // Delete formulae from the new use which are no longer legal.
4971 bool Any = false;
4972 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4973 Formula &F = LUThatHas->Formulae[i];
4974 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4975 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4976 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4977 LUThatHas->DeleteFormula(F);
4978 --i;
4979 --e;
4980 Any = true;
4981 }
4982 }
4983
4984 if (Any)
4985 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4986
4987 // Delete the old use.
4988 DeleteUse(LU, LUIdx);
4989 --LUIdx;
4990 --NumUses;
4991 break;
4992 }
4993 }
4994
4995 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4996}
4997
4998/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4999/// we've done more filtering, as it may be able to find more formulae to
5000/// eliminate.
5001void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5002 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5003 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5004
5005 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5006 "undesirable dedicated registers.\n");
5007
5008 FilterOutUndesirableDedicatedRegisters();
5009
5010 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5011 }
5012}
5013
5014 /// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5015 /// pick the best one and delete the others.
5016 /// This narrowing heuristic keeps as many formulae with different
5017 /// Scale and ScaledReg pairs as possible while narrowing the search space.
5018 /// The benefit is that it is more likely to find a better solution
5019 /// from a formulae set with more Scale and ScaledReg variations than
5020 /// from a formulae set with the same Scale and ScaledReg. The winner-picking
5021 /// reg heuristic will often keep the formulae with the same Scale and
5022 /// ScaledReg and filter out the others, and we want to avoid that if possible.
5023void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5024 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5025 return;
5026
5027 LLVM_DEBUG(
5028 dbgs() << "The search space is too complex.\n"
5029 "Narrowing the search space by choosing the best Formula "
5030 "from the Formulae with the same Scale and ScaledReg.\n");
5031
5032 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5033 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5034
5035 BestFormulaeTy BestFormulae;
5036#ifndef NDEBUG
5037 bool ChangedFormulae = false;
5038#endif
5039 DenseSet<const SCEV *> VisitedRegs;
5040 SmallPtrSet<const SCEV *, 16> Regs;
5041
5042 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5043 LSRUse &LU = Uses[LUIdx];
5044 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5045 dbgs() << '\n');
5046
5047 // Return true if Formula FA is better than Formula FB.
5048 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5049 // First we will try to choose the Formula with fewer new registers.
5050 // For a register used by current Formula, the more the register is
5051 // shared among LSRUses, the less we increase the register number
5052 // counter of the formula.
5053 size_t FARegNum = 0;
5054 for (const SCEV *Reg : FA.BaseRegs) {
5055 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5056 FARegNum += (NumUses - UsedByIndices.count() + 1);
5057 }
5058 size_t FBRegNum = 0;
5059 for (const SCEV *Reg : FB.BaseRegs) {
5060 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5061 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5062 }
5063 if (FARegNum != FBRegNum)
5064 return FARegNum < FBRegNum;
5065
5066 // If the new register numbers are the same, choose the Formula with
5067 // less Cost.
5068 Cost CostFA(L, SE, TTI, AMK);
5069 Cost CostFB(L, SE, TTI, AMK);
5070 Regs.clear();
5071 CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
5072 Regs.clear();
5073 CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
5074 return CostFA.isLess(CostFB);
5075 };
5076
5077 bool Any = false;
5078 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5079 ++FIdx) {
5080 Formula &F = LU.Formulae[FIdx];
5081 if (!F.ScaledReg)
5082 continue;
5083 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5084 if (P.second)
5085 continue;
5086
5087 Formula &Best = LU.Formulae[P.first->second];
5088 if (IsBetterThan(F, Best))
5089 std::swap(F, Best);
5090 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5091 dbgs() << "\n"
5092 " in favor of formula ";
5093 Best.print(dbgs()); dbgs() << '\n');
5094#ifndef NDEBUG
5095 ChangedFormulae = true;
5096#endif
5097 LU.DeleteFormula(F);
5098 --FIdx;
5099 --NumForms;
5100 Any = true;
5101 }
5102 if (Any)
5103 LU.RecomputeRegs(LUIdx, RegUses);
5104
5105 // Reset this to prepare for the next use.
5106 BestFormulae.clear();
5107 }
5108
5109 LLVM_DEBUG(if (ChangedFormulae) {
5110 dbgs() << "\n"
5111 "After filtering out undesirable candidates:\n";
5112 print_uses(dbgs());
5113 });
5114}
5115
5116 /// If we are over the complexity limit, filter any post-inc-preferring
5117 /// variables down to only post-inc values.
5118void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5119 if (AMK != TTI::AMK_PostIndexed)
5120 return;
5121 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5122 return;
5123
5124 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5125 "Narrowing the search space by choosing the lowest "
5126 "register Formula for PostInc Uses.\n");
5127
5128 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5129 LSRUse &LU = Uses[LUIdx];
5130
5131 if (LU.Kind != LSRUse::Address)
5132 continue;
5133 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5134 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5135 continue;
5136
5137 size_t MinRegs = std::numeric_limits<size_t>::max();
5138 for (const Formula &F : LU.Formulae)
5139 MinRegs = std::min(F.getNumRegs(), MinRegs);
5140
5141 bool Any = false;
5142 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5143 ++FIdx) {
5144 Formula &F = LU.Formulae[FIdx];
5145 if (F.getNumRegs() > MinRegs) {
5146 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5147 dbgs() << "\n");
5148 LU.DeleteFormula(F);
5149 --FIdx;
5150 --NumForms;
5151 Any = true;
5152 }
5153 }
5154 if (Any)
5155 LU.RecomputeRegs(LUIdx, RegUses);
5156
5157 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5158 break;
5159 }
5160
5161 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5162}
5163
5164 /// This function deletes formulas with a high expected number of registers.
5165 /// Assuming we don't know the value of each formula (we have already deleted
5166 /// all inefficient ones), compute the probability of not being selected for
5167 /// each register.
5168/// For example,
5169/// Use1:
5170/// reg(a) + reg({0,+,1})
5171/// reg(a) + reg({-1,+,1}) + 1
5172/// reg({a,+,1})
5173/// Use2:
5174/// reg(b) + reg({0,+,1})
5175/// reg(b) + reg({-1,+,1}) + 1
5176/// reg({b,+,1})
5177/// Use3:
5178/// reg(c) + reg(b) + reg({0,+,1})
5179/// reg(c) + reg({b,+,1})
5180///
5181/// Probability of not selecting
5182/// Use1 Use2 Use3
5183/// reg(a) (1/3) * 1 * 1
5184/// reg(b) 1 * (1/3) * (1/2)
5185/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5186/// reg({-1,+,1}) (2/3) * (2/3) * 1
5187/// reg({a,+,1}) (2/3) * 1 * 1
5188/// reg({b,+,1}) 1 * (2/3) * (2/3)
5189/// reg(c) 1 * 1 * 0
5190///
5191 /// Now compute the expected number of registers for each formula.
5192 /// Note that for each use we exclude the probability of not selecting for that
5193 /// use. For example, for Use1 the probability for reg(a) is just 1 * 1
5194 /// (excluding the probability 1/3 of not selecting for Use1).
5195/// Use1:
5196/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5197/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5198/// reg({a,+,1}) 1
5199/// Use2:
5200/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5201/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5202/// reg({b,+,1}) 2/3
5203/// Use3:
5204/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5205/// reg(c) + reg({b,+,1}) 1 + 2/3
5206void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5207 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5208 return;
5209 // Ok, we have too many formulae on our hands to conveniently handle.
5210 // Use a rough heuristic to thin out the list.
5211
5212 // Set of Regs which will be 100% used in the final solution.
5213 // Used in each formula of a solution (in the example above this is reg(c)).
5214 // We can skip them in calculations.
5215 SmallPtrSet<const SCEV *, 4> UniqRegs;
5216 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5217
5218 // Map each register to the probability of it not being selected.
5219 DenseMap <const SCEV *, float> RegNumMap;
5220 for (const SCEV *Reg : RegUses) {
5221 if (UniqRegs.count(Reg))
5222 continue;
5223 float PNotSel = 1;
5224 for (const LSRUse &LU : Uses) {
5225 if (!LU.Regs.count(Reg))
5226 continue;
5227 float P = LU.getNotSelectedProbability(Reg);
5228 if (P != 0.0)
5229 PNotSel *= P;
5230 else
5231 UniqRegs.insert(Reg);
5232 }
5233 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5234 }
5235
5236 LLVM_DEBUG(
5237 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5238
5239 // Delete formulas whose expected number of registers is high.
5240 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5241 LSRUse &LU = Uses[LUIdx];
5242 // If nothing to delete - continue.
5243 if (LU.Formulae.size() < 2)
5244 continue;
5245 // This is a temporary solution to test performance. Float should be
5246 // replaced with a rounding-independent type (based on integers) to avoid
5247 // different results for different target builds.
5248 float FMinRegNum = LU.Formulae[0].getNumRegs();
5249 float FMinARegNum = LU.Formulae[0].getNumRegs();
5250 size_t MinIdx = 0;
5251 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5252 Formula &F = LU.Formulae[i];
5253 float FRegNum = 0;
5254 float FARegNum = 0;
5255 for (const SCEV *BaseReg : F.BaseRegs) {
5256 if (UniqRegs.count(BaseReg))
5257 continue;
5258 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5259 if (isa<SCEVAddRecExpr>(BaseReg))
5260 FARegNum +=
5261 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5262 }
5263 if (const SCEV *ScaledReg = F.ScaledReg) {
5264 if (!UniqRegs.count(ScaledReg)) {
5265 FRegNum +=
5266 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5267 if (isa<SCEVAddRecExpr>(ScaledReg))
5268 FARegNum +=
5269 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5270 }
5271 }
5272 if (FMinRegNum > FRegNum ||
5273 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5274 FMinRegNum = FRegNum;
5275 FMinARegNum = FARegNum;
5276 MinIdx = i;
5277 }
5278 }
5279 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5280 dbgs() << " with min reg num " << FMinRegNum << '\n');
5281 if (MinIdx != 0)
5282 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5283 while (LU.Formulae.size() != 1) {
5284 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5285 dbgs() << '\n');
5286 LU.Formulae.pop_back();
5287 }
5288 LU.RecomputeRegs(LUIdx, RegUses);
5289 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5290 Formula &F = LU.Formulae[0];
5291 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5292 // When we choose the formula, the regs become unique.
5293 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5294 if (F.ScaledReg)
5295 UniqRegs.insert(F.ScaledReg);
5296 }
5297 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5298}
5299
5300 // Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5301 // whether the addressing offset +C would be legal where the negative offset -C
5302 // is not.
5303 static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5304 ScalarEvolution &SE, const SCEV *Best,
5305 const SCEV *Reg,
5306 MemAccessTy AccessType) {
5307 if (Best->getType() != Reg->getType() ||
5308 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5309 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5310 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5311 return false;
5312 const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Best, Reg));
5313 if (!Diff)
5314 return false;
5315
5316 return TTI.isLegalAddressingMode(
5317 AccessType.MemTy, /*BaseGV=*/nullptr,
5318 /*BaseOffset=*/Diff->getAPInt().getSExtValue(),
5319 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5320 !TTI.isLegalAddressingMode(
5321 AccessType.MemTy, /*BaseGV=*/nullptr,
5322 /*BaseOffset=*/-Diff->getAPInt().getSExtValue(),
5323 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5324}
5325
5326/// Pick a register which seems likely to be profitable, and then in any use
5327/// which has any reference to that register, delete all formulae which do not
5328/// reference that register.
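/// For example, if reg({x,+,1}) is referenced by eight of ten uses, assume it
/// will be part of the final solution: mark it as taken and, in each of those
/// eight uses, delete every formula that does not reference it.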
5329void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5330 // With all other options exhausted, loop until the system is simple
5331 // enough to handle.
5332 SmallPtrSet<const SCEV *, 4> Taken;
5333 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5334 // Ok, we have too many formulae on our hands to conveniently handle.
5335 // Use a rough heuristic to thin out the list.
5336 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5337
5338 // Pick the register which is used by the most LSRUses, which is likely
5339 // to be a good reuse register candidate.
5340 const SCEV *Best = nullptr;
5341 unsigned BestNum = 0;
5342 for (const SCEV *Reg : RegUses) {
5343 if (Taken.count(Reg))
5344 continue;
5345 if (!Best) {
5346 Best = Reg;
5347 BestNum = RegUses.getUsedByIndices(Reg).count();
5348 } else {
5349 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5350 if (Count > BestNum) {
5351 Best = Reg;
5352 BestNum = Count;
5353 }
5354
5355 // If the scores are the same, but the Reg is simpler for the target
5356 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5357 // handle +C but not -C), opt for the simpler formula.
5358 if (Count == BestNum) {
5359 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5360 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5361 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5362 Uses[LUIdx].AccessTy)) {
5363 Best = Reg;
5364 BestNum = Count;
5365 }
5366 }
5367 }
5368 }
5369 assert(Best && "Failed to find best LSRUse candidate");
5370
5371 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5372 << " will yield profitable reuse.\n");
5373 Taken.insert(Best);
5374
5375 // In any use with formulae which reference this register, delete the
5376 // formulae which don't reference it.
5377 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5378 LSRUse &LU = Uses[LUIdx];
5379 if (!LU.Regs.count(Best)) continue;
5380
5381 bool Any = false;
5382 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5383 Formula &F = LU.Formulae[i];
5384 if (!F.referencesReg(Best)) {
5385 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5386 LU.DeleteFormula(F);
5387 --e;
5388 --i;
5389 Any = true;
5390 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5391 continue;
5392 }
5393 }
5394
5395 if (Any)
5396 LU.RecomputeRegs(LUIdx, RegUses);
5397 }
5398
5399 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5400 }
5401}
5402
5403/// If there are an extraordinary number of formulae to choose from, use some
5404/// rough heuristics to prune down the number of formulae. This keeps the main
5405/// solver from taking an extraordinary amount of time in some worst-case
5406/// scenarios.
5407void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5408 NarrowSearchSpaceByDetectingSupersets();
5409 NarrowSearchSpaceByCollapsingUnrolledCode();
5410 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5412 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5413 NarrowSearchSpaceByFilterPostInc();
5414 if (LSRExpNarrow)
5415 NarrowSearchSpaceByDeletingCostlyFormulas();
5416 else
5417 NarrowSearchSpaceByPickingWinnerRegs();
5418}
5419
5420/// This is the recursive solver.
5421void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5422 Cost &SolutionCost,
5423 SmallVectorImpl<const Formula *> &Workspace,
5424 const Cost &CurCost,
5425 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5426 DenseSet<const SCEV *> &VisitedRegs) const {
5427 // Some ideas:
5428 // - prune more:
5429 // - use more aggressive filtering
5430 // - sort the formula so that the most profitable solutions are found first
5431 // - sort the uses too
5432 // - search faster:
5433 // - don't compute a cost, and then compare. compare while computing a cost
5434 // and bail early.
5435 // - track register sets with SmallBitVector
5436
5437 const LSRUse &LU = Uses[Workspace.size()];
5438
5439 // If this use references any register that's already a part of the
5440 // in-progress solution, consider it a requirement that a formula must
5441 // reference that register in order to be considered. This prunes out
5442 // unprofitable searching.
5443 SmallSetVector<const SCEV *, 4> ReqRegs;
5444 for (const SCEV *S : CurRegs)
5445 if (LU.Regs.count(S))
5446 ReqRegs.insert(S);
5447
5448 SmallPtrSet<const SCEV *, 16> NewRegs;
5449 Cost NewCost(L, SE, TTI, AMK);
5450 for (const Formula &F : LU.Formulae) {
5451 // Ignore formulae which may not be ideal in terms of register reuse of
5452 // ReqRegs. The formula should use all required registers before
5453 // introducing new ones.
5454 // This can sometimes (notably when trying to favour postinc) lead to
5455 // sub-optimal decisions. In those cases it is best left to the cost
5456 // modelling to get right.
5457 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5458 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5459 for (const SCEV *Reg : ReqRegs) {
5460 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5461 is_contained(F.BaseRegs, Reg)) {
5462 --NumReqRegsToFind;
5463 if (NumReqRegsToFind == 0)
5464 break;
5465 }
5466 }
5467 if (NumReqRegsToFind != 0) {
5468 // If none of the formulae satisfied the required registers, then we could
5469 // clear ReqRegs and try again. Currently, we simply give up in this case.
5470 continue;
5471 }
5472 }
5473
5474 // Evaluate the cost of the current formula. If it's already worse than
5475 // the current best, prune the search at that point.
5476 NewCost = CurCost;
5477 NewRegs = CurRegs;
5478 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5479 if (NewCost.isLess(SolutionCost)) {
5480 Workspace.push_back(&F);
5481 if (Workspace.size() != Uses.size()) {
5482 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5483 NewRegs, VisitedRegs);
5484 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5485 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5486 } else {
5487 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5488 dbgs() << ".\nRegs:\n";
5489 for (const SCEV *S : NewRegs) dbgs()
5490 << "- " << *S << "\n";
5491 dbgs() << '\n');
5492
5493 SolutionCost = NewCost;
5494 Solution = Workspace;
5495 }
5496 Workspace.pop_back();
5497 }
5498 }
5499}
5500
5501/// Choose one formula from each use. Return the results in the given Solution
5502/// vector.
5503void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5504 SmallVector<const Formula *, 8> Workspace;
5505 Cost SolutionCost(L, SE, TTI, AMK);
5506 SolutionCost.Lose();
5507 Cost CurCost(L, SE, TTI, AMK);
5508 SmallPtrSet<const SCEV *, 16> CurRegs;
5509 DenseSet<const SCEV *> VisitedRegs;
5510 Workspace.reserve(Uses.size());
5511
5512 // SolveRecurse does all the work.
5513 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5514 CurRegs, VisitedRegs);
5515 if (Solution.empty()) {
5516 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5517 return;
5518 }
5519
5520 // Ok, we've now made all our decisions.
5521 LLVM_DEBUG(dbgs() << "\n"
5522 "The chosen solution requires ";
5523 SolutionCost.print(dbgs()); dbgs() << ":\n";
5524 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5525 dbgs() << " ";
5526 Uses[i].print(dbgs());
5527 dbgs() << "\n"
5528 " ";
5529 Solution[i]->print(dbgs());
5530 dbgs() << '\n';
5531 });
5532
5533 assert(Solution.size() == Uses.size() && "Malformed solution!");
5534
5535 const bool EnableDropUnprofitableSolution = [&] {
5536 switch (AllowDropSolutionIfLessProfitable) {
5537 case cl::BOU_TRUE:
5538 return true;
5539 case cl::BOU_FALSE:
5540 return false;
5541 case cl::BOU_UNSET:
5542 return TTI.shouldDropLSRSolutionIfLessProfitable();
5543 }
5544 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5545 }();
5546
5547 if (BaselineCost.isLess(SolutionCost)) {
5548 if (!EnableDropUnprofitableSolution)
5549 LLVM_DEBUG(
5550 dbgs() << "Baseline is more profitable than chosen solution, "
5551 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5552 else {
5553 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5554 "solution, dropping LSR solution.\n";);
5555 Solution.clear();
5556 }
5557 }
5558}
5559
5560 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far as
5561/// we can go while still being dominated by the input positions. This helps
5562/// canonicalize the insert position, which encourages sharing.
5563 BasicBlock::iterator
5564 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5565 const SmallVectorImpl<Instruction *> &Inputs)
5566 const {
5567 Instruction *Tentative = &*IP;
5568 while (true) {
5569 bool AllDominate = true;
5570 Instruction *BetterPos = nullptr;
5571 // Don't bother attempting to insert before a catchswitch; its basic block
5572 // cannot have other non-PHI instructions.
5573 if (isa<CatchSwitchInst>(Tentative))
5574 return IP;
5575
5576 for (Instruction *Inst : Inputs) {
5577 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5578 AllDominate = false;
5579 break;
5580 }
5581 // Attempt to find an insert position in the middle of the block,
5582 // instead of at the end, so that it can be used for other expansions.
5583 if (Tentative->getParent() == Inst->getParent() &&
5584 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5585 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5586 }
5587 if (!AllDominate)
5588 break;
5589 if (BetterPos)
5590 IP = BetterPos->getIterator();
5591 else
5592 IP = Tentative->getIterator();
5593
5594 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5595 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5596
5597 BasicBlock *IDom;
5598 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5599 if (!Rung) return IP;
5600 Rung = Rung->getIDom();
5601 if (!Rung) return IP;
5602 IDom = Rung->getBlock();
5603
5604 // Don't climb into a loop though.
5605 const Loop *IDomLoop = LI.getLoopFor(IDom);
5606 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5607 if (IDomDepth <= IPLoopDepth &&
5608 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5609 break;
5610 }
5611
5612 Tentative = IDom->getTerminator();
5613 }
5614
5615 return IP;
5616}
5617
5618/// Determine an input position which will be dominated by the operands and
5619/// which will dominate the result.
5620BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5621 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5622 // Collect some instructions which must be dominated by the
5623 // expanding replacement. These must be dominated by any operands that
5624 // will be required in the expansion.
5625 SmallVector<Instruction *, 4> Inputs;
5626 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5627 Inputs.push_back(I);
5628 if (LU.Kind == LSRUse::ICmpZero)
5629 if (Instruction *I =
5630 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5631 Inputs.push_back(I);
5632 if (LF.PostIncLoops.count(L)) {
5633 if (LF.isUseFullyOutsideLoop(L))
5634 Inputs.push_back(L->getLoopLatch()->getTerminator());
5635 else
5636 Inputs.push_back(IVIncInsertPos);
5637 }
5638 // The expansion must also be dominated by the increment positions of any
5639 // loops for which it is using post-inc mode.
5640 for (const Loop *PIL : LF.PostIncLoops) {
5641 if (PIL == L) continue;
5642
5643 // Be dominated by the loop exit.
5644 SmallVector<BasicBlock *, 4> ExitingBlocks;
5645 PIL->getExitingBlocks(ExitingBlocks);
5646 if (!ExitingBlocks.empty()) {
5647 BasicBlock *BB = ExitingBlocks[0];
5648 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5649 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5650 Inputs.push_back(BB->getTerminator());
5651 }
5652 }
5653
5654 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5655 && !isa<DbgInfoIntrinsic>(LowestIP) &&
5656 "Insertion point must be a normal instruction");
5657
5658 // Then, climb up the immediate dominator tree as far as we can go while
5659 // still being dominated by the input positions.
5660 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5661
5662 // Don't insert instructions before PHI nodes.
5663 while (isa<PHINode>(IP)) ++IP;
5664
5665 // Ignore landingpad instructions.
5666 while (IP->isEHPad()) ++IP;
5667
5668 // Ignore debug intrinsics.
5669 while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5670
5671 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5672 // IP consistent across expansions and allows the previously inserted
5673 // instructions to be reused by subsequent expansion.
5674 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5675 ++IP;
5676
5677 return IP;
5678}
5679
5680/// Emit instructions for the leading candidate expression for this LSRUse (this
5681/// is called "expanding").
5682Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5683 const Formula &F, BasicBlock::iterator IP,
5684 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5685 if (LU.RigidFormula)
5686 return LF.OperandValToReplace;
5687
5688 // Determine an input position which will be dominated by the operands and
5689 // which will dominate the result.
5690 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5691 Rewriter.setInsertPoint(&*IP);
5692
5693 // Inform the Rewriter if we have a post-increment use, so that it can
5694 // perform an advantageous expansion.
5695 Rewriter.setPostInc(LF.PostIncLoops);
5696
5697 // This is the type that the user actually needs.
5698 Type *OpTy = LF.OperandValToReplace->getType();
5699 // This will be the type that we'll initially expand to.
5700 Type *Ty = F.getType();
5701 if (!Ty)
5702 // No type known; just expand directly to the ultimate type.
5703 Ty = OpTy;
5704 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5705 // Expand directly to the ultimate type if it's the right size.
5706 Ty = OpTy;
5707 // This is the type to do integer arithmetic in.
5708 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5709
5710 // Build up a list of operands to add together to form the full base.
5711 SmallVector<const SCEV *, 8> Ops;
5712
5713 // Expand the BaseRegs portion.
5714 for (const SCEV *Reg : F.BaseRegs) {
5715 assert(!Reg->isZero() && "Zero allocated in a base register!");
5716
5717 // If we're expanding for a post-inc user, make the post-inc adjustment.
5718 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5719 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5720 }
5721
5722 // Expand the ScaledReg portion.
5723 Value *ICmpScaledV = nullptr;
5724 if (F.Scale != 0) {
5725 const SCEV *ScaledS = F.ScaledReg;
5726
5727 // If we're expanding for a post-inc user, make the post-inc adjustment.
5728 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5729 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5730
5731 if (LU.Kind == LSRUse::ICmpZero) {
5732 // Expand ScaledReg as if it were part of the base regs.
5733 if (F.Scale == 1)
5734 Ops.push_back(
5735 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5736 else {
5737 // An interesting way of "folding" with an icmp is to use a negated
5738 // scale, which we'll implement by inserting it into the other operand
5739 // of the icmp.
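          // For example, instead of materializing (A - B) and comparing the
          // result against zero, the -1-scaled term B is expanded on its own
          // here and later installed as the icmp's second operand, so the
          // compare effectively becomes "icmp A, B".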
5740 assert(F.Scale == -1 &&
5741 "The only scale supported by ICmpZero uses is -1!");
5742 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5743 }
5744 } else {
5745 // Otherwise just expand the scaled register and an explicit scale,
5746 // which is expected to be matched as part of the address.
5747
5748 // Flush the operand list to suppress SCEVExpander hoisting of address
5749 // modes, unless the addressing mode will not be folded.
5750 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5751 isAMCompletelyFolded(TTI, LU, F)) {
5752 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5753 Ops.clear();
5754 Ops.push_back(SE.getUnknown(FullV));
5755 }
5756 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5757 if (F.Scale != 1)
5758 ScaledS =
5759 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5760 Ops.push_back(ScaledS);
5761 }
5762 }
5763
5764 // Expand the GV portion.
5765 if (F.BaseGV) {
5766 // Flush the operand list to suppress SCEVExpander hoisting.
5767 if (!Ops.empty()) {
5768 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5769 Ops.clear();
5770 Ops.push_back(SE.getUnknown(FullV));
5771 }
5772 Ops.push_back(SE.getUnknown(F.BaseGV));
5773 }
5774
5775 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5776 // unfolded offsets. LSR assumes they both live next to their uses.
5777 if (!Ops.empty()) {
5778 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5779 Ops.clear();
5780 Ops.push_back(SE.getUnknown(FullV));
5781 }
5782
5783 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5784 // out at this point, or should we generate a SCEV adding together mixed
5785 // offsets?
5786 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5787 "Expanding mismatched offsets\n");
5788 // Expand the immediate portion.
5789 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5790 if (Offset.isNonZero()) {
5791 if (LU.Kind == LSRUse::ICmpZero) {
5792 // The other interesting way of "folding" with an ICmpZero is to use a
5793 // negated immediate.
5794 if (!ICmpScaledV)
5795 ICmpScaledV =
5796 ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5797 else {
5798 Ops.push_back(SE.getUnknown(ICmpScaledV));
5799 ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5800 }
5801 } else {
5802 // Just add the immediate values. These again are expected to be matched
5803 // as part of the address.
5804 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5805 }
5806 }
5807
5808 // Expand the unfolded offset portion.
5809 Immediate UnfoldedOffset = F.UnfoldedOffset;
5810 if (UnfoldedOffset.isNonZero()) {
5811 // Just add the immediate values.
5812 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5813 }
5814
5815 // Emit instructions summing all the operands.
5816 const SCEV *FullS = Ops.empty() ?
5817 SE.getConstant(IntTy, 0) :
5818 SE.getAddExpr(Ops);
5819 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5820
5821 // We're done expanding now, so reset the rewriter.
5822 Rewriter.clearPostInc();
5823
5824 // An ICmpZero Formula represents an ICmp which we're handling as a
5825 // comparison against zero. Now that we've expanded an expression for that
5826 // form, update the ICmp's other operand.
5827 if (LU.Kind == LSRUse::ICmpZero) {
5828 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5829 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5830 DeadInsts.emplace_back(OperandIsInstr);
5831 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5832 "a scale at the same time!");
5833 if (F.Scale == -1) {
5834 if (ICmpScaledV->getType() != OpTy) {
5835 Instruction *Cast = CastInst::Create(
5836 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5837 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5838 ICmpScaledV = Cast;
5839 }
5840 CI->setOperand(1, ICmpScaledV);
5841 } else {
5842 // A scale of 1 means that the scale has been expanded as part of the
5843 // base regs.
5844 assert((F.Scale == 0 || F.Scale == 1) &&
5845 "ICmp does not support folding a global value and "
5846 "a scale at the same time!");
5848 -(uint64_t)Offset.getFixedValue());
5849 if (C->getType() != OpTy) {
5850 C = ConstantFoldCastOperand(
5851 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5852 CI->getDataLayout());
5853 assert(C && "Cast of ConstantInt should have folded");
5854 }
5855
5856 CI->setOperand(1, C);
5857 }
5858 }
5859
5860 return FullV;
5861}
5862
5863/// Helper for Rewrite. PHI nodes are special because the use of their operands
5864/// effectively happens in their predecessor blocks, so the expression may need
5865/// to be expanded in multiple places.
5866void LSRInstance::RewriteForPHI(
5867 PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5868 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5869 DenseMap<BasicBlock *, Value *> Inserted;
5870
5871 // Inserting instructions in the loop and using them as PHI's input could
5872 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
5873 // corresponding incoming block is not loop exiting). So collect all such
5874 // instructions to form LCSSA for them later.
5875 SmallVector<Instruction *, 4> InsertedNonLCSSAInsts;
5876
5877 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5878 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5879 bool needUpdateFixups = false;
5880 BasicBlock *BB = PN->getIncomingBlock(i);
5881
5882 // If this is a critical edge, split the edge so that we do not insert
5883 // the code on all predecessor/successor paths. We do this unless this
5884 // is the canonical backedge for this loop, which complicates post-inc
5885 // users.
5886 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5887 !isa<IndirectBrInst>(BB->getTerminator()) &&
5888 !isa<CatchSwitchInst>(BB->getTerminator())) {
5889 BasicBlock *Parent = PN->getParent();
5890 Loop *PNLoop = LI.getLoopFor(Parent);
5891 if (!PNLoop || Parent != PNLoop->getHeader()) {
5892 // Split the critical edge.
5893 BasicBlock *NewBB = nullptr;
5894 if (!Parent->isLandingPad()) {
5895 NewBB =
5896 SplitCriticalEdge(BB, Parent,
5897 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5898 .setMergeIdenticalEdges()
5899 .setKeepOneInputPHIs());
5900 } else {
5901 SmallVector<BasicBlock *, 2> NewBBs;
5902 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5903 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5904 NewBB = NewBBs[0];
5905 }
5906 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5907 // phi predecessors are identical. The simple thing to do is skip
5908 // splitting in this case rather than complicate the API.
5909 if (NewBB) {
5910 // If PN is outside of the loop and BB is in the loop, we want to
5911 // move the block to be immediately before the PHI block, not
5912 // immediately after BB.
5913 if (L->contains(BB) && !L->contains(PN))
5914 NewBB->moveBefore(PN->getParent());
5915
5916 // Splitting the edge can reduce the number of PHI entries we have.
5917 e = PN->getNumIncomingValues();
5918 BB = NewBB;
5919 i = PN->getBasicBlockIndex(BB);
5920
5921 needUpdateFixups = true;
5922 }
5923 }
5924 }
5925
5926 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5927 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5928 if (!Pair.second)
5929 PN->setIncomingValue(i, Pair.first->second);
5930 else {
5931 Value *FullV =
5932 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5933
5934 // If this is reuse-by-noop-cast, insert the noop cast.
5935 Type *OpTy = LF.OperandValToReplace->getType();
5936 if (FullV->getType() != OpTy)
5937 FullV = CastInst::Create(
5938 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5939 LF.OperandValToReplace->getType(), "tmp",
5940 BB->getTerminator()->getIterator());
5941
5942 // If the incoming block for this value is not in the loop, it means the
5943 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5944 // the inserted value.
5945 if (auto *I = dyn_cast<Instruction>(FullV))
5946 if (L->contains(I) && !L->contains(BB))
5947 InsertedNonLCSSAInsts.push_back(I);
5948
5949 PN->setIncomingValue(i, FullV);
5950 Pair.first->second = FullV;
5951 }
5952
5953 // If LSR splits a critical edge and the phi node has other pending
5954 // fixup operands, we need to update those pending fixups. Otherwise
5955 // formulae will not be implemented completely and some instructions
5956 // will not be eliminated.
5957 if (needUpdateFixups) {
5958 for (LSRUse &LU : Uses)
5959 for (LSRFixup &Fixup : LU.Fixups)
5960 // If a fixup is supposed to rewrite some operand in the phi
5961 // that was just updated, it may already have been moved to
5962 // another phi node. Such a fixup requires an update.
5963 if (Fixup.UserInst == PN) {
5964 // Check if the operand we try to replace still exists in the
5965 // original phi.
5966 bool foundInOriginalPHI = false;
5967 for (const auto &val : PN->incoming_values())
5968 if (val == Fixup.OperandValToReplace) {
5969 foundInOriginalPHI = true;
5970 break;
5971 }
5972
5973 // If fixup operand found in original PHI - nothing to do.
5974 if (foundInOriginalPHI)
5975 continue;
5976
5977 // Otherwise it might be moved to another PHI and requires update.
5978 // If fixup operand not found in any of the incoming blocks that
5979 // means we have already rewritten it - nothing to do.
5980 for (const auto &Block : PN->blocks())
5981 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5982 ++I) {
5983 PHINode *NewPN = cast<PHINode>(I);
5984 for (const auto &val : NewPN->incoming_values())
5985 if (val == Fixup.OperandValToReplace)
5986 Fixup.UserInst = NewPN;
5987 }
5988 }
5989 }
5990 }
5991
5992 formLCSSAForInstructions(InsertedNonLCSSAInsts, DT, LI, &SE);
5993}
5994
5995/// Emit instructions for the leading candidate expression for this LSRUse (this
5996/// is called "expanding"), and update the UserInst to reference the newly
5997/// expanded value.
5998void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
5999 const Formula &F,
6000 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
6001 // First, find an insertion point that dominates UserInst. For PHI nodes,
6002 // find the nearest block which dominates all the relevant uses.
6003 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6004 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6005 } else {
6006 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6007
6008 // If this is reuse-by-noop-cast, insert the noop cast.
6009 Type *OpTy = LF.OperandValToReplace->getType();
6010 if (FullV->getType() != OpTy) {
6011 Instruction *Cast =
6012 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6013 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6014 FullV = Cast;
6015 }
6016
6017 // Update the user. ICmpZero is handled specially here (for now) because
6018 // Expand may have updated one of the operands of the icmp already, and
6019 // its new value may happen to be equal to LF.OperandValToReplace, in
6020 // which case doing replaceUsesOfWith leads to replacing both operands
6021 // with the same value. TODO: Reorganize this.
6022 if (LU.Kind == LSRUse::ICmpZero)
6023 LF.UserInst->setOperand(0, FullV);
6024 else
6025 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6026 }
6027
6028 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6029 DeadInsts.emplace_back(OperandIsInstr);
6030}
6031
6032 // Try to hoist the IVInc to the loop header if all IVInc users are in
6033 // the loop header. This helps the backend generate post-index loads/stores
6034 // when the latch block is different from the loop header block.
6035static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6036 const LSRUse &LU, Instruction *IVIncInsertPos,
6037 Loop *L) {
6038 if (LU.Kind != LSRUse::Address)
6039 return false;
6040
6041 // For now this code does the conservative optimization and only works for
6042 // the header block. Later we can hoist the IVInc to a block that
6043 // post-dominates all users.
6044 BasicBlock *LHeader = L->getHeader();
6045 if (IVIncInsertPos->getParent() == LHeader)
6046 return false;
6047
6048 if (!Fixup.OperandValToReplace ||
6049 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6050 Instruction *UI = cast<Instruction>(U);
6051 return UI->getParent() != LHeader;
6052 }))
6053 return false;
6054
6055 Instruction *I = Fixup.UserInst;
6056 Type *Ty = I->getType();
6057 return Ty->isIntegerTy() &&
6058 ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6059 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
6060}
6061
6062/// Rewrite all the fixup locations with new values, following the chosen
6063/// solution.
6064void LSRInstance::ImplementSolution(
6065 const SmallVectorImpl<const Formula *> &Solution) {
6066 // Keep track of instructions we may have made dead, so that
6067 // we can remove them after we are done working.
6068 SmallVector<WeakTrackingVH, 16> DeadInsts;
6069
6070 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6071 for (const IVChain &Chain : IVChainVec) {
6072 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6073 Rewriter.setChainedPhi(PN);
6074 }
6075
6076 // Expand the new value definitions and update the users.
6077 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6078 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6079 Instruction *InsertPos =
6080 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6081 ? L->getHeader()->getTerminator()
6082 : IVIncInsertPos;
6083 Rewriter.setIVIncInsertPos(L, InsertPos);
6084 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6085 Changed = true;
6086 }
6087
6088 for (const IVChain &Chain : IVChainVec) {
6089 GenerateIVChain(Chain, DeadInsts);
6090 Changed = true;
6091 }
6092
6093 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6094 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6095 ScalarEvolutionIVs.push_back(IV);
6096
6097 // Clean up after ourselves. This must be done before deleting any
6098 // instructions.
6099 Rewriter.clear();
6100
6101 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6102 &TLI, MSSAU);
6103
6104 // In our cost analysis above, we assume that each addrec consumes exactly
6105 // one register, and arrange to have increments inserted just before the
6106 // latch to maximize the chance this is true. However, if we reused
6107 // existing IVs, we now need to move the increments to match our
6108 // expectations. Otherwise, our cost modeling results in us having
6109 // chosen a non-optimal result for the actual schedule. (And yes, this
6110 // scheduling decision does impact later codegen.)
6111 for (PHINode &PN : L->getHeader()->phis()) {
6112 BinaryOperator *BO = nullptr;
6113 Value *Start = nullptr, *Step = nullptr;
6114 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6115 continue;
6116
6117 switch (BO->getOpcode()) {
6118 case Instruction::Sub:
6119 if (BO->getOperand(0) != &PN)
6120 // sub is non-commutative - match handling elsewhere in LSR
6121 continue;
6122 break;
6123 case Instruction::Add:
6124 break;
6125 default:
6126 continue;
6127 };
6128
6129 if (!isa<Constant>(Step))
6130 // If not a constant step, might increase register pressure
6131 // (We assume constants have been canonicalized to RHS)
6132 continue;
6133
6134 if (BO->getParent() == IVIncInsertPos->getParent())
6135 // Only bother moving across blocks. Isel can handle block local case.
6136 continue;
6137
6138 // Can we legally schedule inc at the desired point?
6139 if (!llvm::all_of(BO->uses(),
6140 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6141 continue;
6142 BO->moveBefore(IVIncInsertPos);
6143 Changed = true;
6144 }
6145
6146
6147}
6148
6149LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6150 DominatorTree &DT, LoopInfo &LI,
6151 const TargetTransformInfo &TTI, AssumptionCache &AC,
6152 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6153 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6154 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6155 ? PreferredAddresingMode
6156 : TTI.getPreferredAddressingMode(L, &SE)),
6157 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6158 BaselineCost(L, SE, TTI, AMK) {
6159 // If LoopSimplify form is not available, stay out of trouble.
6160 if (!L->isLoopSimplifyForm())
6161 return;
6162
6163 // If there's no interesting work to be done, bail early.
6164 if (IU.empty()) return;
6165
6166 // If there's too much analysis to be done, bail early. We won't be able to
6167 // model the problem anyway.
6168 unsigned NumUsers = 0;
6169 for (const IVStrideUse &U : IU) {
6170 if (++NumUsers > MaxIVUsers) {
6171 (void)U;
6172 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6173 << "\n");
6174 return;
6175 }
6176 // Bail out if we have a PHI on an EHPad that gets a value from a
6177 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6178 // no good place to stick any instructions.
6179 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6180 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
6181 if (isa<FuncletPadInst>(FirstNonPHI) ||
6182 isa<CatchSwitchInst>(FirstNonPHI))
6183 for (BasicBlock *PredBB : PN->blocks())
6184 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
6185 return;
6186 }
6187 }
6188
6189 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6190 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6191 dbgs() << ":\n");
6192
6193 // Configure SCEVExpander already now, so the correct mode is used for
6194 // isSafeToExpand() checks.
6195#ifndef NDEBUG
6196 Rewriter.setDebugType(DEBUG_TYPE);
6197#endif
6198 Rewriter.disableCanonicalMode();
6199 Rewriter.enableLSRMode();
6200
6201 // First, perform some low-level loop optimizations.
6202 OptimizeShadowIV();
6203 OptimizeLoopTermCond();
6204
6205 // If loop preparation eliminates all interesting IV users, bail.
6206 if (IU.empty()) return;
6207
6208 // Skip nested loops until we can model them better with formulae.
6209 if (!L->isInnermost()) {
6210 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6211 return;
6212 }
6213
6214 // Start collecting data and preparing for the solver.
6215 // If number of registers is not the major cost, we cannot benefit from the
6216 // current profitable chain optimization which is based on number of
6217 // registers.
6218 // FIXME: add profitable chain optimization for other kinds of major cost, for
6219 // example number of instructions.
6221 CollectChains();
6222 CollectInterestingTypesAndFactors();
6223 CollectFixupsAndInitialFormulae();
6224 CollectLoopInvariantFixupsAndFormulae();
6225
6226 if (Uses.empty())
6227 return;
6228
6229 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6230 print_uses(dbgs()));
6231 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6232 BaselineCost.print(dbgs()); dbgs() << "\n");
6233
6234 // Now use the reuse data to generate a bunch of interesting ways
6235 // to formulate the values needed for the uses.
6236 GenerateAllReuseFormulae();
6237
6238 FilterOutUndesirableDedicatedRegisters();
6239 NarrowSearchSpaceUsingHeuristics();
6240
6242 Solve(Solution);
6243
6244 // Release memory that is no longer needed.
6245 Factors.clear();
6246 Types.clear();
6247 RegUses.clear();
6248
6249 if (Solution.empty())
6250 return;
6251
6252#ifndef NDEBUG
6253 // Formulae should be legal.
6254 for (const LSRUse &LU : Uses) {
6255 for (const Formula &F : LU.Formulae)
6256 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6257 F) && "Illegal formula generated!");
6258 };
6259#endif
6260
6261 // Now that we've decided what we want, make it so.
6262 ImplementSolution(Solution);
6263}
6264
6265#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6266void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6267 if (Factors.empty() && Types.empty()) return;
6268
6269 OS << "LSR has identified the following interesting factors and types: ";
6270 bool First = true;
6271
6272 for (int64_t Factor : Factors) {
6273 if (!First) OS << ", ";
6274 First = false;
6275 OS << '*' << Factor;
6276 }
6277
6278 for (Type *Ty : Types) {
6279 if (!First) OS << ", ";
6280 First = false;
6281 OS << '(' << *Ty << ')';
6282 }
6283 OS << '\n';
6284}
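// Illustrative output of the routine above, assuming two factors and one
// interesting type were collected:
//
//   LSR has identified the following interesting factors and types: *2, *4, (i64)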
6285
6286void LSRInstance::print_fixups(raw_ostream &OS) const {
6287 OS << "LSR is examining the following fixup sites:\n";
6288 for (const LSRUse &LU : Uses)
6289 for (const LSRFixup &LF : LU.Fixups) {
6290 dbgs() << " ";
6291 LF.print(OS);
6292 OS << '\n';
6293 }
6294}
6295
6296void LSRInstance::print_uses(raw_ostream &OS) const {
6297 OS << "LSR is examining the following uses:\n";
6298 for (const LSRUse &LU : Uses) {
6299 dbgs() << " ";
6300 LU.print(OS);
6301 OS << '\n';
6302 for (const Formula &F : LU.Formulae) {
6303 OS << " ";
6304 F.print(OS);
6305 OS << '\n';
6306 }
6307 }
6308}
6309
6310void LSRInstance::print(raw_ostream &OS) const {
6311 print_factors_and_types(OS);
6312 print_fixups(OS);
6313 print_uses(OS);
6314}
6315
6316LLVM_DUMP_METHOD void LSRInstance::dump() const {
6317 print(errs()); errs() << '\n';
6318}
6319#endif
6320
6321namespace {
6322
6323class LoopStrengthReduce : public LoopPass {
6324public:
6325 static char ID; // Pass ID, replacement for typeid
6326
6327 LoopStrengthReduce();
6328
6329private:
6330 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6331 void getAnalysisUsage(AnalysisUsage &AU) const override;
6332};
6333
6334} // end anonymous namespace
6335
6336LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6338}
6339
6340void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6341 // We split critical edges, so we change the CFG. However, we do update
6342 // many analyses if they are around.
6344
6354 // Requiring LoopSimplify a second time here prevents IVUsers from running
6355 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6361}
6362
6363namespace {
6364
6365/// Enables more convenient iteration over a DWARF expression vector.
6367ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6372 return {Begin, End};
6373}
6374
6375struct SCEVDbgValueBuilder {
6376 SCEVDbgValueBuilder() = default;
6377 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6378
6379 void clone(const SCEVDbgValueBuilder &Base) {
6380 LocationOps = Base.LocationOps;
6381 Expr = Base.Expr;
6382 }
6383
6384 void clear() {
6385 LocationOps.clear();
6386 Expr.clear();
6387 }
6388
6389 /// The DIExpression as we translate the SCEV.
6391 /// The location ops of the DIExpression.
6392 SmallVector<Value *, 2> LocationOps;
6393
6394 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6395 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6396
6397 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6398 /// in the set of values referenced by the expression.
6399 void pushLocation(llvm::Value *V) {
6401 auto *It = llvm::find(LocationOps, V);
6402 unsigned ArgIndex = 0;
6403 if (It != LocationOps.end()) {
6404 ArgIndex = std::distance(LocationOps.begin(), It);
6405 } else {
6406 ArgIndex = LocationOps.size();
6407 LocationOps.push_back(V);
6408 }
6409 Expr.push_back(ArgIndex);
6410 }
6411
6412 void pushValue(const SCEVUnknown *U) {
6413 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6414 pushLocation(V);
6415 }
6416
6417 bool pushConst(const SCEVConstant *C) {
6418 if (C->getAPInt().getSignificantBits() > 64)
6419 return false;
6420 Expr.push_back(llvm::dwarf::DW_OP_consts);
6421 Expr.push_back(C->getAPInt().getSExtValue());
6422 return true;
6423 }
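  // For example, a SCEVConstant of 42 is lowered by pushConst to the two words
  // { DW_OP_consts, 42 }; constants wider than 64 significant bits are
  // rejected so the caller can abandon the translation.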
6424
6425 // Iterating the expression as DWARF ops is convenient when updating
6426 // DW_OP_LLVM_arg operands.
6428 return ToDwarfOpIter(Expr);
6429 }
6430
6431 /// Several SCEV types are sequences of the same arithmetic operator applied
6432 /// to constants and values that may be extended or truncated.
6433 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6434 uint64_t DwarfOp) {
6435 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6436 "Expected arithmetic SCEV type");
6437 bool Success = true;
6438 unsigned EmitOperator = 0;
6439 for (const auto &Op : CommExpr->operands()) {
6440 Success &= pushSCEV(Op);
6441
6442 if (EmitOperator >= 1)
6443 pushOperator(DwarfOp);
6444 ++EmitOperator;
6445 }
6446 return Success;
6447 }
6448
6449 // TODO: Identify and omit noop casts.
6450 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6451 const llvm::SCEV *Inner = C->getOperand(0);
6452 const llvm::Type *Type = C->getType();
6453 uint64_t ToWidth = Type->getIntegerBitWidth();
6454 bool Success = pushSCEV(Inner);
6455 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6456 IsSigned ? llvm::dwarf::DW_ATE_signed
6457 : llvm::dwarf::DW_ATE_unsigned};
6458 for (const auto &Op : CastOps)
6459 pushOperator(Op);
6460 return Success;
6461 }
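  // For example, a sign extension to i64 appends { DW_OP_LLVM_convert, 64,
  // DW_ATE_signed } after the operators produced for the inner SCEV.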
6462
6463 // TODO: MinMax - although these haven't been encountered in the test suite.
6464 bool pushSCEV(const llvm::SCEV *S) {
6465 bool Success = true;
6466 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6467 Success &= pushConst(StartInt);
6468
6469 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6470 if (!U->getValue())
6471 return false;
6472 pushLocation(U->getValue());
6473
6474 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6475 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6476
6477 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6478 Success &= pushSCEV(UDiv->getLHS());
6479 Success &= pushSCEV(UDiv->getRHS());
6480 pushOperator(llvm::dwarf::DW_OP_div);
6481
6482 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6483 // Assert if a new and unknown SCEVCastExpr type is encountered.
6484 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6485 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6486 "Unexpected cast type in SCEV.");
6487 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6488
6489 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6490 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6491
6492 } else if (isa<SCEVAddRecExpr>(S)) {
6493 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6494 // unsupported.
6495 return false;
6496
6497 } else {
6498 return false;
6499 }
6500 return Success;
6501 }
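  // Illustrative trace, assuming the constant operand of the add is visited
  // first: pushSCEV on the SCEV (4 + %a) emits
  //   DW_OP_consts 4, DW_OP_LLVM_arg 0, DW_OP_plus
  // with %a recorded as location operand 0.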
6502
6503 /// Return true if the combination of arithmetic operator and underlying
6504 /// SCEV constant value is an identity function.
6505 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6506 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6507 if (C->getAPInt().getSignificantBits() > 64)
6508 return false;
6509 int64_t I = C->getAPInt().getSExtValue();
6510 switch (Op) {
6511 case llvm::dwarf::DW_OP_plus:
6512 case llvm::dwarf::DW_OP_minus:
6513 return I == 0;
6514 case llvm::dwarf::DW_OP_mul:
6515 case llvm::dwarf::DW_OP_div:
6516 return I == 1;
6517 }
6518 }
6519 return false;
6520 }
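  // E.g. adding or subtracting 0 and multiplying or dividing by 1 are
  // identities, so the translation routines below skip emitting those
  // components entirely.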
6521
6522 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6523 /// builder's expression stack. The stack should already contain an
6524 /// expression for the iteration count, so that it can be multiplied by
6525 /// the stride and added to the start.
6526 /// Components of the expression are omitted if they are an identity function.
6527 /// Chain (non-affine) SCEVs are not supported.
6528 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6529 assert(SAR.isAffine() && "Expected affine SCEV");
6530 // TODO: Is this check needed?
6531 if (isa<SCEVAddRecExpr>(SAR.getStart()))
6532 return false;
6533
6534 const SCEV *Start = SAR.getStart();
6535 const SCEV *Stride = SAR.getStepRecurrence(SE);
6536
6537 // Skip pushing arithmetic noops.
6538 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6539 if (!pushSCEV(Stride))
6540 return false;
6541 pushOperator(llvm::dwarf::DW_OP_mul);
6542 }
6543 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6544 if (!pushSCEV(Start))
6545 return false;
6546 pushOperator(llvm::dwarf::DW_OP_plus);
6547 }
6548 return true;
6549 }
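  // Illustrative result for the affine SCEV {%start,+,4}, with the iteration
  // count already on the expression stack: the routine appends
  //   DW_OP_consts 4, DW_OP_mul, DW_OP_LLVM_arg <start>, DW_OP_plus
  // so the recovered value is start + iteration_count * 4.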
6550
6551 /// Create an expression that is an offset from a value (usually the IV).
6552 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6553 pushLocation(OffsetValue);
6555 LLVM_DEBUG(
6556 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6557 << std::to_string(Offset) << "\n");
6558 }
6559
6560 /// Combine a translation of the SCEV and the IV to create an expression that
6561 /// recovers a location's value.
6562 /// returns true if an expression was created.
6563 bool createIterCountExpr(const SCEV *S,
6564 const SCEVDbgValueBuilder &IterationCount,
6565 ScalarEvolution &SE) {
6566 // SCEVs for SSA values are most frequently of the form
6567 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6568 // This is because %a is a PHI node that is not the IV. However, these
6569 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6570 // so it's not expected that this point will be reached.
6571 if (!isa<SCEVAddRecExpr>(S))
6572 return false;
6573
6574 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6575 << '\n');
6576
6577 const auto *Rec = cast<SCEVAddRecExpr>(S);
6578 if (!Rec->isAffine())
6579 return false;
6580
6582 return false;
6583
6584 // Initialise a new builder with the iteration count expression. In
6585 // combination with the value's SCEV this enables recovery.
6586 clone(IterationCount);
6587 if (!SCEVToValueExpr(*Rec, SE))
6588 return false;
6589
6590 return true;
6591 }
6592
6593 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6594 /// builder's expression stack. The stack should already contain an
6595 /// expression for the iteration count, so that it can be multiplied by
6596 /// the stride and added to the start.
6597 /// Components of the expression are omitted if they are an identity function.
6598 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6599 ScalarEvolution &SE) {
6600 assert(SAR.isAffine() && "Expected affine SCEV");
6601 if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6602 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6603 << SAR << '\n');
6604 return false;
6605 }
6606 const SCEV *Start = SAR.getStart();
6607 const SCEV *Stride = SAR.getStepRecurrence(SE);
6608
6609 // Skip pushing arithmetic noops.
6610 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6611 if (!pushSCEV(Start))
6612 return false;
6613 pushOperator(llvm::dwarf::DW_OP_minus);
6614 }
6615 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6616 if (!pushSCEV(Stride))
6617 return false;
6618 pushOperator(llvm::dwarf::DW_OP_div);
6619 }
6620 return true;
6621 }
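  // Illustrative result for an IV whose SCEV is {%start,+,4}, with the IV
  // location already pushed by the caller: the routine appends
  //   DW_OP_LLVM_arg <start>, DW_OP_minus, DW_OP_consts 4, DW_OP_div
  // which reconstructs the iteration count as (IV - start) / 4.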
6622
6623 // Append the current expression and locations to a location list and an
6624 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6625 // the locations already present in the destination list.
6626 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6627 SmallVectorImpl<Value *> &DestLocations) {
6628 assert(!DestLocations.empty() &&
6629 "Expected the locations vector to contain the IV");
6630 // The DW_OP_LLVM_arg arguments of the expression being appended must be
6631 // modified to account for the locations already in the destination vector.
6632 // All builders contain the IV as the first location op.
6633 assert(!LocationOps.empty() &&
6634 "Expected the location ops to contain the IV.");
6635 // DestIndexMap[n] contains the index in DestLocations for the nth
6636 // location in this SCEVDbgValueBuilder.
6637 SmallVector<uint64_t, 2> DestIndexMap;
6638 for (const auto &Op : LocationOps) {
6639 auto It = find(DestLocations, Op);
6640 if (It != DestLocations.end()) {
6641 // Location already exists in DestLocations, reuse existing ArgIndex.
6642 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6643 continue;
6644 }
6645 // Location is not in DestLocations, add it.
6646 DestIndexMap.push_back(DestLocations.size());
6647 DestLocations.push_back(Op);
6648 }
6649
6650 for (const auto &Op : expr_ops()) {
6651 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6652 Op.appendToVector(DestExpr);
6653 continue;
6654 }
6655
6657 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6658 // DestIndexMap[n] contains its new index in DestLocations.
6659 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6660 DestExpr.push_back(NewIndex);
6661 }
6662 }
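  // Remapping example: if DestLocations already holds [%iv, %b] and this
  // builder's LocationOps is [%iv, %a], then DW_OP_LLVM_arg 0 keeps index 0
  // (reusing %iv) while DW_OP_LLVM_arg 1 is rewritten to index 2 and %a is
  // appended to DestLocations. The value names are illustrative only.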
6663};
6664
6665/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6666/// and DIExpression.
6667struct DVIRecoveryRec {
6668 DVIRecoveryRec(DbgValueInst *DbgValue)
6669 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6670 HadLocationArgList(false) {}
6671 DVIRecoveryRec(DbgVariableRecord *DVR)
6672 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6673
6675 DIExpression *Expr;
6676 bool HadLocationArgList;
6677 SmallVector<WeakVH, 2> LocationOps;
6680
6681 void clear() {
6682 for (auto &RE : RecoveryExprs)
6683 RE.reset();
6684 RecoveryExprs.clear();
6685 }
6686
6687 ~DVIRecoveryRec() { clear(); }
6688};
6689} // namespace
6690
6691/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6692/// This helps in determining if a DIArglist is necessary or can be omitted from
6693/// the dbg.value.
6695 auto expr_ops = ToDwarfOpIter(Expr);
6696 unsigned Count = 0;
6697 for (auto Op : expr_ops)
6698 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6699 Count++;
6700 return Count;
6701}
6702
6703/// Overwrites DVI with the location and Ops as the DIExpression. This will
6704/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6705/// because a DIArglist is not created for the first argument of the dbg.value.
6706template <typename T>
6707static void updateDVIWithLocation(T &DbgVal, Value *Location,
6709 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6710 "contain any DW_OP_llvm_arg operands.");
6711 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6712 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6713 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6714}
6715
6716/// Overwrite DVI with locations placed into a DIArglist.
6717template <typename T>
6718static void updateDVIWithLocations(T &DbgVal,
6719 SmallVectorImpl<Value *> &Locations,
6721 assert(numLLVMArgOps(Ops) != 0 &&
6722 "Expected expression that references DIArglist locations using "
6723 "DW_OP_llvm_arg operands.");
6725 for (Value *V : Locations)
6726 MetadataLocs.push_back(ValueAsMetadata::get(V));
6727 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6728 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6729 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6730}
6731
6732/// Write the new expression and new location ops for the dbg.value. If
6733/// possible, reduce the size of the dbg.value intrinsic by omitting the
6734/// DIArglist. The DIArglist can be omitted if:
6735/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6736/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
6737static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6738 SmallVectorImpl<Value *> &NewLocationOps,
6740 auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6741 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6742 if (NumLLVMArgs == 0) {
6743 // Location assumed to be on the stack.
6744 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6745 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6746 // There is only a single DW_OP_llvm_arg at the start of the expression,
6747 // so it can be omitted along with DIArglist.
6748 assert(NewExpr[1] == 0 &&
6749 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6751 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6752 } else {
6753 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6754 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6755 }
6756
6757 // If the DIExpression was previously empty then add the stack terminator.
6758 // Non-empty expressions have only had elements inserted into them and so
6759 // the terminator should already be present e.g. stack_value or fragment.
6760 DIExpression *SalvageExpr = DbgVal->getExpression();
6761 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6762 SalvageExpr =
6763 DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6764 DbgVal->setExpression(SalvageExpr);
6765 }
6766 };
6767 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6768 UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6769 else
6770 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6771}
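// For instance, a NewExpr of { DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 4 } with a
// single location falls into the middle case above: the leading arg pair is
// dropped and the lone location is attached directly, avoiding a DIArgList.
// Two or more DW_OP_llvm_arg references keep the DIArgList form.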
6772
6773/// Cached location ops may be erased during LSR, in which case a poison is
6774/// required when restoring from the cache. The type of that location is no
6775/// longer available, so just use int8. The poison will be replaced by one or
6776/// more locations later when a SCEVDbgValueBuilder selects alternative
6777/// locations to use for the salvage.
6779 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6780}
6781
6782/// Restore the DVI's pre-LSR arguments. Substitute poison for any erased values.
6783static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6784 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6785 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6786 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6787 assert(DVIRec.Expr && "Expected an expression");
6788 DbgVal->setExpression(DVIRec.Expr);
6789
6790 // Even a single location-op may be inside a DIArgList and referenced with
6791 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6792 if (!DVIRec.HadLocationArgList) {
6793 assert(DVIRec.LocationOps.size() == 1 &&
6794 "Unexpected number of location ops.");
6795 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6796 // this case was not present before, so force the location back to a
6797 // single uncontained Value.
6798 Value *CachedValue =
6799 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6800 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6801 } else {
6803 for (WeakVH VH : DVIRec.LocationOps) {
6804 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6805 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6806 }
6807 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6808 DbgVal->setRawLocation(
6809 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6810 }
6811 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6812 };
6813 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6814 RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6815 else
6816 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6817}
6818
6820 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6821 const SCEV *SCEVInductionVar,
6822 SCEVDbgValueBuilder IterCountExpr) {
6823
6824 if (isa<DbgValueInst *>(DVIRec.DbgRef)
6825 ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6826 : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6827 return false;
6828
6829 // LSR may have caused several changes to the dbg.value in the failed salvage
6830 // attempt. So restore the DIExpression, the location ops and also the
6831 // location ops format, which is always DIArglist for multiple ops, but only
6832 // sometimes for a single op.
6834
6835 // LocationOpIndexMap[i] will store the post-LSR location index of
6836 // the non-optimised out location at pre-LSR index i.
6837 SmallVector<int64_t, 2> LocationOpIndexMap;
6838 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6839 SmallVector<Value *, 2> NewLocationOps;
6840 NewLocationOps.push_back(LSRInductionVar);
6841
6842 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6843 WeakVH VH = DVIRec.LocationOps[i];
6844 // Place the locations not optimised out in the list first, avoiding
6845 // inserts later. The map is used to update the DIExpression's
6846 // DW_OP_LLVM_arg arguments as the expression is updated.
6847 if (VH && !isa<UndefValue>(VH)) {
6848 NewLocationOps.push_back(VH);
6849 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6850 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6851 << " now at index " << LocationOpIndexMap[i] << "\n");
6852 continue;
6853 }
6854
6855 // It's possible that a value referred to in the SCEV may have been
6856 // optimised out by LSR.
6857 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6858 SE.containsUndefs(DVIRec.SCEVs[i])) {
6859 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6860 << " refers to a location that is now undef or erased. "
6861 "Salvage abandoned.\n");
6862 return false;
6863 }
6864
6865 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6866 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6867
6868 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6869 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6870
6871 // Create an offset-based salvage expression if possible, as it requires
6872 // fewer DWARF ops than an iteration count-based expression.
6873 if (std::optional<APInt> Offset =
6874 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6875 if (Offset->getSignificantBits() <= 64)
6876 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6877 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6878 SE))
6879 return false;
6880 }
6881
6882 // Merge the DbgValueBuilder generated expressions and the original
6883 // DIExpression, place the result into a new vector.
6885 if (DVIRec.Expr->getNumElements() == 0) {
6886 assert(DVIRec.RecoveryExprs.size() == 1 &&
6887 "Expected only a single recovery expression for an empty "
6888 "DIExpression.");
6889 assert(DVIRec.RecoveryExprs[0] &&
6890 "Expected a SCEVDbgSalvageBuilder for location 0");
6891 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6892 B->appendToVectors(NewExpr, NewLocationOps);
6893 }
6894 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6895 // Most Ops needn't be updated.
6896 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6897 Op.appendToVector(NewExpr);
6898 continue;
6899 }
6900
6901 uint64_t LocationArgIndex = Op.getArg(0);
6902 SCEVDbgValueBuilder *DbgBuilder =
6903 DVIRec.RecoveryExprs[LocationArgIndex].get();
6904 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6905 // optimise it away; just translate the argument to the updated
6906 // location index.
6907 if (!DbgBuilder) {
6908 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6909 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6910 "Expected a positive index for the location-op position.");
6911 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6912 continue;
6913 }
6914 // The location has a recovery expression.
6915 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6916 }
6917
6918 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6919 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6920 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6921 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6922 else
6923 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6924 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6925 return true;
6926}
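// Salvage example (hypothetical SCEVs): if a dead location's SCEV is
// {(4 + %start),+,4} and the surviving IV's SCEV is {%start,+,4},
// computeConstantDifference yields 4 and a cheap offset expression from the
// surviving IV is emitted; when no constant difference exists, the
// iteration-count based rewrite built above is attempted instead.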
6927
6928/// Obtain an expression for the iteration count, then attempt to salvage the
6929/// dbg.value intrinsics.
6931 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6932 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6933 if (DVIToUpdate.empty())
6934 return;
6935
6936 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6937 assert(SCEVInductionVar &&
6938 "Anticipated a SCEV for the post-LSR induction variable");
6939
6940 if (const SCEVAddRecExpr *IVAddRec =
6941 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6942 if (!IVAddRec->isAffine())
6943 return;
6944
6945 // Prevent translation using excessive resources.
6946 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6947 return;
6948
6949 // The iteration count is required to recover location values.
6950 SCEVDbgValueBuilder IterCountExpr;
6951 IterCountExpr.pushLocation(LSRInductionVar);
6952 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6953 return;
6954
6955 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6956 << '\n');
6957
6958 for (auto &DVIRec : DVIToUpdate) {
6959 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6960 IterCountExpr);
6961 }
6962 }
6963}
6964
6965/// Identify and cache salvageable DVI locations and expressions along with the
6966/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6967/// caching and salvaging.
6969 Loop *L, ScalarEvolution &SE,
6970 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6971 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6972 for (const auto &B : L->getBlocks()) {
6973 for (auto &I : *B) {
6974 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6975 // Ensure that if any location op is undef, the dbg.value is not
6976 // cached.
6977 if (DbgVal->isKillLocation())
6978 return false;
6979
6980 // Check that the location op SCEVs are suitable for translation to
6981 // DIExpression.
6982 const auto &HasTranslatableLocationOps =
6983 [&](const auto *DbgValToTranslate) -> bool {
6984 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6985 if (!LocOp)
6986 return false;
6987
6988 if (!SE.isSCEVable(LocOp->getType()))
6989 return false;
6990
6991 const SCEV *S = SE.getSCEV(LocOp);
6992 if (SE.containsUndefs(S))
6993 return false;
6994 }
6995 return true;
6996 };
6997
6998 if (!HasTranslatableLocationOps(DbgVal))
6999 return false;
7000
7001 std::unique_ptr<DVIRecoveryRec> NewRec =
7002 std::make_unique<DVIRecoveryRec>(DbgVal);
7003 // Each location Op may need a SCEVDbgValueBuilder in order to recover
7004 // it. Pre-allocating a vector will enable quick lookups of the builder
7005 // later during the salvage.
7006 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
7007 for (const auto LocOp : DbgVal->location_ops()) {
7008 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7009 NewRec->LocationOps.push_back(LocOp);
7010 NewRec->HadLocationArgList = DbgVal->hasArgList();
7011 }
7012 SalvageableDVISCEVs.push_back(std::move(NewRec));
7013 return true;
7014 };
7015 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
7016 if (DVR.isDbgValue() || DVR.isDbgAssign())
7017 ProcessDbgValue(&DVR);
7018 }
7019 auto DVI = dyn_cast<DbgValueInst>(&I);
7020 if (!DVI)
7021 continue;
7022 if (ProcessDbgValue(DVI))
7023 DVIHandles.insert(DVI);
7024 }
7025 }
7026}
7027
7028/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback,
7029/// any PHI from the loop header is usable, but it may have less chance of
7030/// surviving subsequent transforms.
7032 const LSRInstance &LSR) {
7033
7034 auto IsSuitableIV = [&](PHINode *P) {
7035 if (!SE.isSCEVable(P->getType()))
7036 return false;
7037 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7038 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7039 return false;
7040 };
7041
7042 // For now, just pick the first IV that was generated and inserted by
7043 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7044 // by subsequent transforms.
7045 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7046 if (!IV)
7047 continue;
7048
7049 // There should only be PHI node IVs.
7050 PHINode *P = cast<PHINode>(&*IV);
7051
7052 if (IsSuitableIV(P))
7053 return P;
7054 }
7055
7056 for (PHINode &P : L.getHeader()->phis()) {
7057 if (IsSuitableIV(&P))
7058 return &P;
7059 }
7060 return nullptr;
7061}
7062
7063static std::optional<std::tuple<PHINode *, PHINode *, const SCEV *, bool>>
7065 const LoopInfo &LI, const TargetTransformInfo &TTI) {
7066 if (!L->isInnermost()) {
7067 LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n");
7068 return std::nullopt;
7069 }
7070 // Only inspect on simple loop structure
7071 if (!L->isLoopSimplifyForm()) {
7072 LLVM_DEBUG(dbgs() << "Cannot fold on non-simple loop\n");
7073 return std::nullopt;
7074 }
7075
7077 LLVM_DEBUG(dbgs() << "Cannot fold on backedge that is loop variant\n");
7078 return std::nullopt;
7079 }
7080
7081 BasicBlock *LoopLatch = L->getLoopLatch();
7082 BranchInst *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
7083 if (!BI || BI->isUnconditional())
7084 return std::nullopt;
7085 auto *TermCond = dyn_cast<ICmpInst>(BI->getCondition());
7086 if (!TermCond) {
7087 LLVM_DEBUG(
7088 dbgs() << "Cannot fold on branching condition that is not an ICmpInst");
7089 return std::nullopt;
7090 }
7091 if (!TermCond->hasOneUse()) {
7092 LLVM_DEBUG(
7093 dbgs()
7094 << "Cannot replace terminating condition with more than one use\n");
7095 return std::nullopt;
7096 }
7097
7098 BinaryOperator *LHS = dyn_cast<BinaryOperator>(TermCond->getOperand(0));
7099 Value *RHS = TermCond->getOperand(1);
7100 if (!LHS || !L->isLoopInvariant(RHS))
7101 // We could pattern match the inverse form of the icmp, but that is
7102 // non-canonical, and this pass is running *very* late in the pipeline.
7103 return std::nullopt;
7104
7105 // Find the IV used by the current exit condition.
7106 PHINode *ToFold;
7107 Value *ToFoldStart, *ToFoldStep;
7108 if (!matchSimpleRecurrence(LHS, ToFold, ToFoldStart, ToFoldStep))
7109 return std::nullopt;
7110
7111 // Ensure the simple recurrence is a part of the current loop.
7112 if (ToFold->getParent() != L->getHeader())
7113 return std::nullopt;
7114
7115 // If that IV isn't dead after we rewrite the exit condition in terms of
7116 // another IV, there's no point in doing the transform.
7117 if (!isAlmostDeadIV(ToFold, LoopLatch, TermCond))
7118 return std::nullopt;
7119
7120 // Inserting instructions in the preheader has a runtime cost; scale
7121 // the allowed cost with the loop's trip count as best we can.
7122 const unsigned ExpansionBudget = [&]() {
7123 unsigned Budget = 2 * SCEVCheapExpansionBudget;
7124 if (unsigned SmallTC = SE.getSmallConstantMaxTripCount(L))
7125 return std::min(Budget, SmallTC);
7126 if (std::optional<unsigned> SmallTC = getLoopEstimatedTripCount(L))
7127 return std::min(Budget, *SmallTC);
7128 // Unknown trip count, assume long running by default.
7129 return Budget;
7130 }();
7131
7132 const SCEV *BECount = SE.getBackedgeTakenCount(L);
7133 const DataLayout &DL = L->getHeader()->getDataLayout();
7134 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7135
7136 PHINode *ToHelpFold = nullptr;
7137 const SCEV *TermValueS = nullptr;
7138 bool MustDropPoison = false;
7139 auto InsertPt = L->getLoopPreheader()->getTerminator();
7140 for (PHINode &PN : L->getHeader()->phis()) {
7141 if (ToFold == &PN)
7142 continue;
7143
7144 if (!SE.isSCEVable(PN.getType())) {
7145 LLVM_DEBUG(dbgs() << "IV of phi '" << PN
7146 << "' is not SCEV-able, not qualified for the "
7147 "terminating condition folding.\n");
7148 continue;
7149 }
7150 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
7151 // Only speculate on affine AddRec
7152 if (!AddRec || !AddRec->isAffine()) {
7153 LLVM_DEBUG(dbgs() << "SCEV of phi '" << PN
7154 << "' is not an affine add recursion, not qualified "
7155 "for the terminating condition folding.\n");
7156 continue;
7157 }
7158
7159 // Check that we can compute the value of AddRec on the exiting iteration
7160 // without soundness problems. evaluateAtIteration internally needs
7161 // to multiply the stride by the iteration number - which may wrap around.
7162 // The issue here is subtle because computing the result accounting for
7163 // wrap is insufficient. In order to use the result in an exit test, we
7164 // must also know that AddRec doesn't take the same value on any previous
7165 // iteration. The simplest case to consider is a candidate IV which is
7166 // narrower than the trip count (and thus original IV), but this can
7167 // also happen due to non-unit strides on the candidate IVs.
7168 if (!AddRec->hasNoSelfWrap() ||
7169 !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
7170 continue;
7171
7172 const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);
7173 const SCEV *TermValueSLocal = PostInc->evaluateAtIteration(BECount, SE);
7174 if (!Expander.isSafeToExpand(TermValueSLocal)) {
7175 LLVM_DEBUG(
7176 dbgs() << "Is not safe to expand terminating value for phi node" << PN
7177 << "\n");
7178 continue;
7179 }
7180
7181 if (Expander.isHighCostExpansion(TermValueSLocal, L, ExpansionBudget,
7182 &TTI, InsertPt)) {
7183 LLVM_DEBUG(
7184 dbgs() << "Is too expensive to expand terminating value for phi node"
7185 << PN << "\n");
7186 continue;
7187 }
7188
7189 // The candidate IV may have been otherwise dead and poison from the
7190 // very first iteration. If we can't disprove that, we can't use the IV.
7191 if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) {
7192 LLVM_DEBUG(dbgs() << "Can not prove poison safety for IV "
7193 << PN << "\n");
7194 continue;
7195 }
7196
7197 // The candidate IV may become poison on the last iteration. If this
7198 // value is not branched on, this is a well defined program. We're
7199 // about to add a new use to this IV, and we have to ensure we don't
7200 // insert UB which didn't previously exist.
7201 bool MustDropPoisonLocal = false;
7202 Instruction *PostIncV =
7203 cast<Instruction>(PN.getIncomingValueForBlock(LoopLatch));
7204 if (!mustExecuteUBIfPoisonOnPathTo(PostIncV, LoopLatch->getTerminator(),
7205 &DT)) {
7206 LLVM_DEBUG(dbgs() << "Can not prove poison safety to insert use"
7207 << PN << "\n");
7208
7209 // If this is a complex recurrence with multiple instructions computing
7210 // the backedge value, we might need to strip poison flags from all of
7211 // them.
7212 if (PostIncV->getOperand(0) != &PN)
7213 continue;
7214
7215 // In order to perform the transform, we need to drop the poison generating
7216 // flags on this instruction (if any).
7217 MustDropPoisonLocal = PostIncV->hasPoisonGeneratingFlags();
7218 }
7219
7220 // We pick the last legal alternate IV. We could explore choosing an optimal
7221 // alternate IV if we had a decent heuristic to do so.
7222 ToHelpFold = &PN;
7223 TermValueS = TermValueSLocal;
7224 MustDropPoison = MustDropPoisonLocal;
7225 }
7226
7227 LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs()
7228 << "Cannot find other AddRec IV to help folding\n";);
7229
7230 LLVM_DEBUG(if (ToFold && ToHelpFold) dbgs()
7231 << "\nFound loop that can fold terminating condition\n"
7232 << " BECount (SCEV): " << *SE.getBackedgeTakenCount(L) << "\n"
7233 << " TermCond: " << *TermCond << "\n"
7234 << " BrandInst: " << *BI << "\n"
7235 << " ToFold: " << *ToFold << "\n"
7236 << " ToHelpFold: " << *ToHelpFold << "\n");
7237
7238 if (!ToFold || !ToHelpFold)
7239 return std::nullopt;
7240 return std::make_tuple(ToFold, ToHelpFold, TermValueS, MustDropPoison);
7241}
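// A minimal sketch of the fold this enables (value names are illustrative):
// the original exit test on an otherwise-dead counter
//
//   %i.next = add i32 %i, 1
//   %cmp = icmp ne i32 %i.next, %n
//
// is replaced by an equality test on another affine IV against a terminating
// value expanded once in the preheader:
//
//   %cmp = icmp eq ptr %ptr.next, %term.value
//
// after which the dead counter's PHI can be deleted.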
7242
7244 DominatorTree &DT, LoopInfo &LI,
7245 const TargetTransformInfo &TTI,
7247 MemorySSA *MSSA) {
7248
7249 // Debug preservation - before we start removing anything, identify which DVIs
7250 // meet the salvageable criteria and store their DIExpression and SCEVs.
7251 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7253 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7254
7255 bool Changed = false;
7256 std::unique_ptr<MemorySSAUpdater> MSSAU;
7257 if (MSSA)
7258 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7259
7260 // Run the main LSR transformation.
7261 const LSRInstance &Reducer =
7262 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7263 Changed |= Reducer.getChanged();
7264
7265 // Remove any extra phis created by processing inner loops.
7266 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7267 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7269 const DataLayout &DL = L->getHeader()->getDataLayout();
7270 SCEVExpander Rewriter(SE, DL, "lsr", false);
7271#ifndef NDEBUG
7272 Rewriter.setDebugType(DEBUG_TYPE);
7273#endif
7274 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7275 Rewriter.clear();
7276 if (numFolded) {
7277 Changed = true;
7279 MSSAU.get());
7280 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7281 }
7282 }
7283 // LSR may at times remove all uses of an induction variable from a loop.
7284 // The only remaining use is the PHI in the exit block.
7285 // When this is the case, if the exit value of the IV can be calculated using
7286 // SCEV, we can replace the exit block PHI with the final value of the IV and
7287 // skip the updates in each loop iteration.
7288 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7290 const DataLayout &DL = L->getHeader()->getDataLayout();
7291 SCEVExpander Rewriter(SE, DL, "lsr", true);
7292 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7293 UnusedIndVarInLoop, DeadInsts);
7294 Rewriter.clear();
7295 if (Rewrites) {
7296 Changed = true;
7298 MSSAU.get());
7299 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7300 }
7301 }
7302
7303 const bool EnableFormTerm = [&] {
7305 case cl::BOU_TRUE:
7306 return true;
7307 case cl::BOU_FALSE:
7308 return false;
7309 case cl::BOU_UNSET:
7311 }
7312 llvm_unreachable("Unhandled cl::boolOrDefault enum");
7313 }();
7314
7315 if (EnableFormTerm) {
7316 if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI, TTI)) {
7317 auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
7318
7319 Changed = true;
7320 NumTermFold++;
7321
7322 BasicBlock *LoopPreheader = L->getLoopPreheader();
7323 BasicBlock *LoopLatch = L->getLoopLatch();
7324
7325 (void)ToFold;
7326 LLVM_DEBUG(dbgs() << "To fold phi-node:\n"
7327 << *ToFold << "\n"
7328 << "New term-cond phi-node:\n"
7329 << *ToHelpFold << "\n");
7330
7331 Value *StartValue = ToHelpFold->getIncomingValueForBlock(LoopPreheader);
7332 (void)StartValue;
7333 Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch);
7334
7335 // See comment in canFoldTermCondOfLoop on why this is sufficient.
7336 if (MustDrop)
7337 cast<Instruction>(LoopValue)->dropPoisonGeneratingFlags();
7338
7339 // SCEVExpander for both use in preheader and latch
7340 const DataLayout &DL = L->getHeader()->getDataLayout();
7341 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7342
7343 assert(Expander.isSafeToExpand(TermValueS) &&
7344 "Terminating value was checked safe in canFoldTerminatingCondition");
7345
7346 // Create new terminating value at loop preheader
7347 Value *TermValue = Expander.expandCodeFor(TermValueS, ToHelpFold->getType(),
7348 LoopPreheader->getTerminator());
7349
7350 LLVM_DEBUG(dbgs() << "Start value of new term-cond phi-node:\n"
7351 << *StartValue << "\n"
7352 << "Terminating value of new term-cond phi-node:\n"
7353 << *TermValue << "\n");
7354
7355 // Create new terminating condition at loop latch
7356 BranchInst *BI = cast<BranchInst>(LoopLatch->getTerminator());
7357 ICmpInst *OldTermCond = cast<ICmpInst>(BI->getCondition());
7358 IRBuilder<> LatchBuilder(LoopLatch->getTerminator());
7359 Value *NewTermCond =
7360 LatchBuilder.CreateICmp(CmpInst::ICMP_EQ, LoopValue, TermValue,
7361 "lsr_fold_term_cond.replaced_term_cond");
7362 // Swap successors to exit the loop body if the IV equals the new TermValue
7363 if (BI->getSuccessor(0) == L->getHeader())
7364 BI->swapSuccessors();
7365
7366 LLVM_DEBUG(dbgs() << "Old term-cond:\n"
7367 << *OldTermCond << "\n"
7368 << "New term-cond:\n" << *NewTermCond << "\n");
7369
7370 BI->setCondition(NewTermCond);
7371
7372 Expander.clear();
7373 OldTermCond->eraseFromParent();
7374 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7375 }
7376 }
7377
7378 if (SalvageableDVIRecords.empty())
7379 return Changed;
7380
7381 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7382 // expressions composed using the derived iteration count.
7383 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7384 for (const auto &L : LI) {
7385 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7386 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7387 else {
7388 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7389 "could not be identified.\n");
7390 }
7391 }
7392
7393 for (auto &Rec : SalvageableDVIRecords)
7394 Rec->clear();
7395 SalvageableDVIRecords.clear();
7396 DVIHandles.clear();
7397 return Changed;
7398}
7399
7400bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7401 if (skipLoop(L))
7402 return false;
7403
7404 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7405 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7406 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7407 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7408 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7409 *L->getHeader()->getParent());
7410 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7411 *L->getHeader()->getParent());
7412 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7413 *L->getHeader()->getParent());
7414 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7415 MemorySSA *MSSA = nullptr;
7416 if (MSSAAnalysis)
7417 MSSA = &MSSAAnalysis->getMSSA();
7418 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7419}
7420
7423 LPMUpdater &) {
7424 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7425 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7426 return PreservedAnalyses::all();
7427
7428 auto PA = getLoopPassPreservedAnalyses();
7429 if (AR.MSSA)
7430 PA.preserve<MemorySSAAnalysis>();
7431 return PA;
7432}
7433
7434char LoopStrengthReduce::ID = 0;
7435
7436INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7437 "Loop Strength Reduction", false, false)
7443INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7444INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7445 "Loop Strength Reduction", false, false)
7446
7447Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:148
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1294
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:512
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static std::optional< std::tuple< PHINode *, PHINode *, const SCEV *, bool > > canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, const LoopInfo &LI, const TargetTransformInfo &TTI)
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static cl::opt< cl::boolOrDefault > AllowTerminatingConditionFoldingAfterLSR("lsr-term-fold", cl::Hidden, cl::desc("Attempt to replace primary IV with other IV."))
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs, SmallSet< AssertingVH< DbgValueInst >, 2 > &DVIHandles)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI optimize exec mask operations pre RA
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
Definition: VirtRegMap.cpp:237
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class recording the (high level) value of a variable.
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1500
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:309
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1614
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1491
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1706
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
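A minimal usage sketch (not part of this file; values and the helper name are illustrative) showing the APInt arithmetic queries listed above:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Divide a signed 64-bit immediate by a stride and inspect the pieces.
static void splitImmediate(int64_t Imm, int64_t Stride) {
  APInt A(64, Imm, /*isSigned=*/true);
  APInt B(64, Stride, /*isSigned=*/true);
  APInt Quot = A.sdiv(B);                  // signed quotient
  APInt Rem = A.srem(B);                   // signed remainder
  bool Neg = A.isNegative();               // sign of the original value
  unsigned Bits = A.getSignificantBits();  // minimum signed bit width
  int64_t Q = Quot.getSExtValue();         // back to a plain integer
  (void)Rem; (void)Neg; (void)Bits; (void)Q;
}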
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:283
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Value handle that asserts if the Value is deleted.
Definition: ValueHandle.h:264
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:507
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:366
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:677
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
void swapSuccessors()
Swap the successors of this branch instruction.
BasicBlock * getSuccessor(unsigned i) const
bool isUnconditional() const
Value * getCondition() const
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1575
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
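A hedged sketch of the ConstantInt helpers above; the wrapper function is hypothetical:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

// Materialize a signed immediate as a ConstantInt if the type can hold it.
static ConstantInt *makeImm(IntegerType *Ty, int64_t Imm) {
  if (!ConstantInt::isValueValidForType(Ty, Imm))
    return nullptr;
  ConstantInt *C = ConstantInt::getSigned(Ty, Imm);
  assert(C->getSExtValue() == Imm && "value round-trips through the constant");
  return C;
}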
This is an important base class in LLVM.
Definition: Constant.h:42
static DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
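The DIExpression helpers above are the building blocks used by the dbg.value rewriting entries near the top of this page (e.g. UpdateDbgValueInst). A minimal sketch, with a hypothetical wrapper name, of appending a constant byte offset to an existing expression:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

// Append "+ Offset" to an existing debug expression.
static DIExpression *addOffset(const DIExpression *Expr, int64_t Offset) {
  SmallVector<uint64_t, 4> Ops;
  DIExpression::appendOffset(Ops, Offset); // ops that apply the offset
  return DIExpression::append(Expr, Ops);  // new, uniqued expression
}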
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
This represents the llvm.dbg.value instruction.
Record of a variable value-assignment, aka a non-instruction representation of the dbg....
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
NodeT * getBlock() const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
This instruction compares its operands according to the predicate given to the constructor.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2356
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
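A hedged sketch of CreateICmp together with predicate inversion; the insertion point, operands, and name are illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Emit "LHS != RHS" right before an existing instruction.
static Value *emitNotEqual(Instruction *InsertPt, Value *LHS, Value *RHS) {
  IRBuilder<> Builder(InsertPt);
  // Invert EQ to NE rather than spelling the predicate twice.
  CmpInst::Predicate P = CmpInst::getInversePredicate(CmpInst::ICMP_EQ);
  return Builder.CreateICmp(P, LHS, RHS, "lsr.cmp");
}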
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:367
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:142
bool empty() const
Definition: IVUsers.h:147
void print(raw_ostream &OS) const
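A minimal sketch (assuming an already-populated IVUsers analysis) of walking the recorded strided uses:

#include "llvm/Analysis/IVUsers.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Dump the operand each IV use would replace, then the whole analysis.
static void dumpIVUses(IVUsers &IU) {
  if (IU.empty())
    return;
  for (const IVStrideUse &U : IU)
    errs() << "operand to replace: " << *U.getOperandValToReplace() << "\n";
  IU.print(errs());
}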
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:824
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
bool hasPoisonGeneratingFlags() const LLVM_READONLY
Return true if this operator has flags which may cause this instruction to evaluate to poison despite...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:174
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:598
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:985
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
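A hedged sketch of building a two-input induction-variable phi with the PHINode API above; the block and value names are hypothetical:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Build an IV phi with one incoming value per predecessor of the header.
static PHINode *buildIVPhi(Type *Ty, BasicBlock *Header, BasicBlock *Preheader,
                           BasicBlock *Latch, Value *Start, Value *Next) {
  PHINode *IV = PHINode::Create(Ty, /*NumReservedValues=*/2, "lsr.iv",
                                Header->begin());
  IV->addIncoming(Start, Preheader); // value on entry to the loop
  IV->addIncoming(Next, Latch);      // value coming around the backedge
  return IV;
}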
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PointerIntPair - This class implements a pair of a pointer and small integer.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
const SCEVAddRecExpr * getPostIncExpr(ScalarEvolution &SE) const
Return an expression representing the value of this expression one iteration of the loop ahead.
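A hedged sketch of inspecting an affine add-recurrence, the shape the post-increment handling described above cares about; the helper name is illustrative:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// If S is an affine {Start,+,Step} recurrence, return its step; else null.
static const SCEV *getAffineStep(const SCEV *S, ScalarEvolution &SE) {
  const auto *AR = dyn_cast<SCEVAddRecExpr>(S);
  if (!AR || !AR->isAffine())
    return nullptr;
  // {Start+Step,+,Step} is the value post-increment users observe.
  const SCEVAddRecExpr *PostInc = AR->getPostIncExpr(SE);
  (void)PostInc;
  return AR->getStepRecurrence(SE);
}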
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information about analyze scalars to rewrite expressions in canonical form.
bool isSafeToExpand(const SCEV *S) const
Return true if the given expression is safe to expand in the sense that all materialized values are s...
bool isHighCostExpansion(ArrayRef< const SCEV * > Exprs, Loop *L, unsigned Budget, const TargetTransformInfo *TTI, const Instruction *At)
Return true for expressions that can't be evaluated at runtime within given Budget.
void clear()
Erase the contents of the InsertedExpressions map so that users trying to expand the same expression ...
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
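A minimal sketch of expanding a SCEV back into IR with SCEVExpander, assuming the caller has already chosen an insertion point; the expander name "lsr" and the wrapper are illustrative:

#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;

// Materialize S as IR ahead of IP, if it is safe to expand.
static Value *expandAt(const SCEV *S, Type *Ty, BasicBlock::iterator IP,
                       ScalarEvolution &SE, const DataLayout &DL) {
  SCEVExpander Rewriter(SE, DL, "lsr");
  if (!Rewriter.isSafeToExpand(S))
    return nullptr;
  return Rewriter.expandCodeFor(S, Ty, IP);
}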
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getVScale(Type *Ty)
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUnknown(Value *V)
std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and return the result as an APInt if it is a constant, and std::nullopt if it isn'...
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if the elements that make up the given SCEV properly dominate the specified basic block.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
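A hedged sketch combining several of the ScalarEvolution queries above; the helper name is hypothetical:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Check whether the SCEV distance between two same-typed values is
// invariant in the given loop.
static bool haveLoopInvariantDistance(Value *A, Value *B, const Loop *L,
                                      ScalarEvolution &SE) {
  if (A->getType() != B->getType() || !SE.isSCEVable(A->getType()))
    return false;
  const SCEV *SA = SE.getSCEV(A);
  const SCEV *SB = SE.getSCEV(B);
  const SCEV *Diff = SE.getMinusSCEV(SA, SB);
  return SE.isLoopInvariant(Diff, L);
}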
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
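A small sketch of the set-with-insertion-order semantics above, using the SmallSetVector convenience form; element type and helper name are illustrative:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// Deduplicate values while preserving the order of first appearance.
static SmallVector<Value *, 8> uniqueInOrder(ArrayRef<Value *> Vals) {
  SmallSetVector<Value *, 8> Seen;
  for (Value *V : Vals)
    Seen.insert(V); // returns false for duplicates, keeps insertion order
  return SmallVector<Value *, 8>(Seen.begin(), Seen.end());
}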
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
void clear()
Definition: SmallSet.h:218
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition: TypeSize.h:44
An instruction for storing to memory.
Definition: Instructions.h:290
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
bool shouldFoldTerminatingConditionAfterLSR() const
Return true if LSR should attempt to replace a use of an otherwise dead primary IV in the latch cond...
bool isProfitableLSRChainElement(Instruction *I) const
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is a legal icmp immediate, that is, the target has icmp instructi...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAddScalableImmediate(int64_t Imm) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isNumRegsMajorCostOfLSR() const
Return true if LSR major cost is number of registers.
@ MIM_PostInc
Post-incrementing.
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
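A hedged sketch of the legality query LSR performs for a candidate addressing mode, using the isLegalAddressingMode signature shown above; the offset and scale values are illustrative:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Ask the target whether "base + 4*index + 8" folds into an access of AccessTy.
static bool addrModeFolds(const TargetTransformInfo &TTI, Type *AccessTy,
                          unsigned AddrSpace) {
  return TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr,
                                   /*BaseOffset=*/8, /*HasBaseReg=*/true,
                                   /*Scale=*/4, AddrSpace);
}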
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static IntegerType * getInt8Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
op_iterator op_end()
Definition: User.h:236
static ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:495
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
A nullable Value handle.
Definition: ValueHandle.h:144
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Key
PAL metadata keys.
@ Entry
Definition: COFF.h:811
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Reg
All possible values of the reg field in the ModR/M byte.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
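The cl::opt, cl::init, and cl::desc helpers above are how the LSR tuning flags at the top of this page (e.g. lsr-complexity-limit) are declared. A sketch with a hypothetical flag name:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical hidden tuning knob, defaulting to 16.
static cl::opt<unsigned>
    ExampleLimit("lsr-example-limit", cl::Hidden, cl::init(16),
                 cl::desc("Illustrative limit; not a real LSR option"));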
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:147
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:143
constexpr double e
Definition: MathExtras.h:47
Sequence
A sequence of states that a pointer may go through in which an objc_retain and objc_release are actua...
Definition: PtrState.h:41
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:236
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root, Instruction *OnPathTo, DominatorTree *DT)
Return true if undefined behavior would provably be executed on the path to OnPathTo if Root produced...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:480
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2062
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< unsigned > SCEVCheapExpansionBudget
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
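A minimal sketch of the normalize/denormalize round trip for post-increment uses; the wrapper name is hypothetical:

#include "llvm/Analysis/ScalarEvolutionNormalization.h"
using namespace llvm;

// Rewrite S into post-increment form for Loops, then undo the rewrite.
static const SCEV *roundTrip(const SCEV *S, const PostIncLoopSet &Loops,
                             ScalarEvolution &SE) {
  const SCEV *Norm = normalizeForPostIncUse(S, Loops, SE);
  if (!Norm) // normalization may fail when the transform is not invertible
    return S;
  return denormalizeForPostIncUse(Norm, Loops, SE);
}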
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
Pass * createLoopStrengthReducePass()
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:555
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of the innermost containing loop.
Definition: LCSSA.cpp:77
void initializeLoopStrengthReducePass(PassRegistry &)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
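A small sketch of the range-based STLExtras wrappers listed on this page (all_of, find_if); the container contents and helper name are illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Range-based wrappers avoid spelling out begin()/end() by hand.
static bool hasNegative(const SmallVectorImpl<int> &Offsets) {
  if (all_of(Offsets, [](int O) { return O >= 0; }))
    return false;
  auto It = find_if(Offsets, [](int O) { return O < 0; });
  return It != Offsets.end();
}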
bool isAlmostDeadIV(PHINode *IV, BasicBlock *LatchBlock, Value *Cond)
Return true if the induction variable IV in a Loop whose latch is LatchBlock would become dead if the...
Definition: LoopUtils.cpp:469
int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1449
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
@ UnusedIndVarInLoop
Definition: LoopUtils.h:468
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Option class for critical edge splitting.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.