LLVM 20.0.0git
LoopStrengthReduce.cpp
Go to the documentation of this file.
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs a strength reduction on array references inside loops that
14// have as one or more of their components the loop induction variable, it
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/PassManager.h"
100#include "llvm/IR/Type.h"
101#include "llvm/IR/Use.h"
102#include "llvm/IR/User.h"
103#include "llvm/IR/Value.h"
104#include "llvm/IR/ValueHandle.h"
106#include "llvm/Pass.h"
107#include "llvm/Support/Casting.h"
110#include "llvm/Support/Debug.h"
120#include <algorithm>
121#include <cassert>
122#include <cstddef>
123#include <cstdint>
124#include <iterator>
125#include <limits>
126#include <map>
127#include <numeric>
128#include <optional>
129#include <utility>
130
131using namespace llvm;
132
133#define DEBUG_TYPE "loop-reduce"
134
135/// MaxIVUsers is an arbitrary threshold that provides an early opportunity for
136/// bail out. This threshold is far beyond the number of users that LSR can
137/// conceivably solve, so it should not affect generated code, but catches the
138/// worst cases before LSR burns too much compile time and stack space.
139static const unsigned MaxIVUsers = 200;
140
141/// Limit the size of expression that SCEV-based salvaging will attempt to
142/// translate into a DIExpression.
143/// Choose a maximum size such that debuginfo is not excessively increased and
144/// the salvaging is not too expensive for the compiler.
145static const unsigned MaxSCEVSalvageExpressionSize = 64;
146
147// Cleanup congruent phis after LSR phi expansion.
149 "enable-lsr-phielim", cl::Hidden, cl::init(true),
150 cl::desc("Enable LSR phi elimination"));
151
152// The flag adds instruction count to solutions cost comparison.
154 "lsr-insns-cost", cl::Hidden, cl::init(true),
155 cl::desc("Add instruction count to a LSR cost model"));
156
157// Flag to choose how to narrow complex lsr solution
159 "lsr-exp-narrow", cl::Hidden, cl::init(false),
160 cl::desc("Narrow LSR complex solution using"
161 " expectation of registers number"));
162
163// Flag to narrow search space by filtering non-optimal formulae with
164// the same ScaledReg and Scale.
166 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168 " with the same ScaledReg and Scale"));
169
171 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172 cl::desc("A flag that overrides the target's preferred addressing mode."),
174 "none",
175 "Don't prefer any addressing mode"),
177 "preindexed",
178 "Prefer pre-indexed addressing mode"),
180 "postindexed",
181 "Prefer post-indexed addressing mode")));
182
184 "lsr-complexity-limit", cl::Hidden,
185 cl::init(std::numeric_limits<uint16_t>::max()),
186 cl::desc("LSR search space complexity limit"));
187
189 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190 cl::desc("The limit on recursion depth for LSRs setup cost"));
191
193 "lsr-term-fold", cl::Hidden,
194 cl::desc("Attempt to replace primary IV with other IV."));
195
197 "lsr-drop-solution", cl::Hidden,
198 cl::desc("Attempt to drop solution if it is less profitable"));
199
201 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
202 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
203
205 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
206 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
207
208STATISTIC(NumTermFold,
209 "Number of terminating condition fold recognized and performed");
210
211#ifndef NDEBUG
212// Stress test IV chain generation.
214 "stress-ivchain", cl::Hidden, cl::init(false),
215 cl::desc("Stress test LSR IV chains"));
216#else
217static bool StressIVChain = false;
218#endif
219
220namespace {
221
222struct MemAccessTy {
223 /// Used in situations where the accessed memory type is unknown.
224 static const unsigned UnknownAddressSpace =
225 std::numeric_limits<unsigned>::max();
226
227 Type *MemTy = nullptr;
228 unsigned AddrSpace = UnknownAddressSpace;
229
230 MemAccessTy() = default;
231 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
232
233 bool operator==(MemAccessTy Other) const {
234 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
235 }
236
237 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
238
239 static MemAccessTy getUnknown(LLVMContext &Ctx,
240 unsigned AS = UnknownAddressSpace) {
241 return MemAccessTy(Type::getVoidTy(Ctx), AS);
242 }
243
244 Type *getType() { return MemTy; }
245};
246
247/// This class holds data which is used to order reuse candidates.
248class RegSortData {
249public:
250 /// This represents the set of LSRUse indices which reference
251 /// a particular register.
252 SmallBitVector UsedByIndices;
253
254 void print(raw_ostream &OS) const;
255 void dump() const;
256};
257
258// An offset from an address that is either scalable or fixed. Used for
259// per-target optimizations of addressing modes.
260class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
261 constexpr Immediate(ScalarTy MinVal, bool Scalable)
262 : FixedOrScalableQuantity(MinVal, Scalable) {}
263
264 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
265 : FixedOrScalableQuantity(V) {}
266
267public:
268 constexpr Immediate() = delete;
269
270 static constexpr Immediate getFixed(ScalarTy MinVal) {
271 return {MinVal, false};
272 }
273 static constexpr Immediate getScalable(ScalarTy MinVal) {
274 return {MinVal, true};
275 }
276 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
277 return {MinVal, Scalable};
278 }
279 static constexpr Immediate getZero() { return {0, false}; }
280 static constexpr Immediate getFixedMin() {
281 return {std::numeric_limits<int64_t>::min(), false};
282 }
283 static constexpr Immediate getFixedMax() {
284 return {std::numeric_limits<int64_t>::max(), false};
285 }
286 static constexpr Immediate getScalableMin() {
287 return {std::numeric_limits<int64_t>::min(), true};
288 }
289 static constexpr Immediate getScalableMax() {
290 return {std::numeric_limits<int64_t>::max(), true};
291 }
292
293 constexpr bool isLessThanZero() const { return Quantity < 0; }
294
295 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
296
297 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
298 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
299 }
300
301 constexpr bool isMin() const {
302 return Quantity == std::numeric_limits<ScalarTy>::min();
303 }
304
305 constexpr bool isMax() const {
306 return Quantity == std::numeric_limits<ScalarTy>::max();
307 }
308
309 // Arithmetic 'operators' that cast to unsigned types first.
310 constexpr Immediate addUnsigned(const Immediate &RHS) const {
311 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
312 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
313 return {Value, Scalable || RHS.isScalable()};
314 }
315
316 constexpr Immediate subUnsigned(const Immediate &RHS) const {
317 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
318 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
319 return {Value, Scalable || RHS.isScalable()};
320 }
321
322 // Scale the quantity by a constant without caring about runtime scalability.
323 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
324 ScalarTy Value = (uint64_t)Quantity * RHS;
325 return {Value, Scalable};
326 }
327
328 // Helpers for generating SCEVs with vscale terms where needed.
329 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
330 const SCEV *S = SE.getConstant(Ty, Quantity);
331 if (Scalable)
332 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
333 return S;
334 }
335
336 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
337 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
338 if (Scalable)
339 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
340 return NegS;
341 }
342
343 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
344 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
345 if (Scalable)
346 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
347 return SU;
348 }
349};
350
351// This is needed for the Compare type of std::map when Immediate is used
352// as a key. We don't need it to be fully correct against any value of vscale,
353// just to make sure that vscale-related terms in the map are considered against
354// each other rather than being mixed up and potentially missing opportunities.
355struct KeyOrderTargetImmediate {
356 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
357 if (LHS.isScalable() && !RHS.isScalable())
358 return false;
359 if (!LHS.isScalable() && RHS.isScalable())
360 return true;
361 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
362 }
363};
364
365// This would be nicer if we could be generic instead of directly using size_t,
366// but there doesn't seem to be a type trait for is_orderable or
367// is_lessthan_comparable or similar.
368struct KeyOrderSizeTAndImmediate {
369 bool operator()(const std::pair<size_t, Immediate> &LHS,
370 const std::pair<size_t, Immediate> &RHS) const {
371 size_t LSize = LHS.first;
372 size_t RSize = RHS.first;
373 if (LSize != RSize)
374 return LSize < RSize;
375 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
376 }
377};
378} // end anonymous namespace
379
380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
381void RegSortData::print(raw_ostream &OS) const {
382 OS << "[NumUses=" << UsedByIndices.count() << ']';
383}
384
385LLVM_DUMP_METHOD void RegSortData::dump() const {
386 print(errs()); errs() << '\n';
387}
388#endif
389
390namespace {
391
392/// Map register candidates to information about how they are used.
393class RegUseTracker {
394 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
395
396 RegUsesTy RegUsesMap;
398
399public:
400 void countRegister(const SCEV *Reg, size_t LUIdx);
401 void dropRegister(const SCEV *Reg, size_t LUIdx);
402 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
403
404 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
405
406 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
407
408 void clear();
409
412
413 iterator begin() { return RegSequence.begin(); }
414 iterator end() { return RegSequence.end(); }
415 const_iterator begin() const { return RegSequence.begin(); }
416 const_iterator end() const { return RegSequence.end(); }
417};
418
419} // end anonymous namespace
420
421void
422RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
423 std::pair<RegUsesTy::iterator, bool> Pair =
424 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
425 RegSortData &RSD = Pair.first->second;
426 if (Pair.second)
427 RegSequence.push_back(Reg);
428 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
429 RSD.UsedByIndices.set(LUIdx);
430}
431
432void
433RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
434 RegUsesTy::iterator It = RegUsesMap.find(Reg);
435 assert(It != RegUsesMap.end());
436 RegSortData &RSD = It->second;
437 assert(RSD.UsedByIndices.size() > LUIdx);
438 RSD.UsedByIndices.reset(LUIdx);
439}
440
441void
442RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
443 assert(LUIdx <= LastLUIdx);
444
445 // Update RegUses. The data structure is not optimized for this purpose;
446 // we must iterate through it and update each of the bit vectors.
447 for (auto &Pair : RegUsesMap) {
448 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
449 if (LUIdx < UsedByIndices.size())
450 UsedByIndices[LUIdx] =
451 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
452 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
453 }
454}
455
456bool
457RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
458 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
459 if (I == RegUsesMap.end())
460 return false;
461 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
462 int i = UsedByIndices.find_first();
463 if (i == -1) return false;
464 if ((size_t)i != LUIdx) return true;
465 return UsedByIndices.find_next(i) != -1;
466}
467
468const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
469 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
470 assert(I != RegUsesMap.end() && "Unknown register!");
471 return I->second.UsedByIndices;
472}
473
474void RegUseTracker::clear() {
475 RegUsesMap.clear();
476 RegSequence.clear();
477}
478
479namespace {
480
481/// This class holds information that describes a formula for computing
482/// satisfying a use. It may include broken-out immediates and scaled registers.
483struct Formula {
484 /// Global base address used for complex addressing.
485 GlobalValue *BaseGV = nullptr;
486
487 /// Base offset for complex addressing.
488 Immediate BaseOffset = Immediate::getZero();
489
490 /// Whether any complex addressing has a base register.
491 bool HasBaseReg = false;
492
493 /// The scale of any complex addressing.
494 int64_t Scale = 0;
495
496 /// The list of "base" registers for this use. When this is non-empty. The
497 /// canonical representation of a formula is
498 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
499 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
500 /// 3. The reg containing recurrent expr related with currect loop in the
501 /// formula should be put in the ScaledReg.
502 /// #1 enforces that the scaled register is always used when at least two
503 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
504 /// #2 enforces that 1 * reg is reg.
505 /// #3 ensures invariant regs with respect to current loop can be combined
506 /// together in LSR codegen.
507 /// This invariant can be temporarily broken while building a formula.
508 /// However, every formula inserted into the LSRInstance must be in canonical
509 /// form.
511
512 /// The 'scaled' register for this use. This should be non-null when Scale is
513 /// not zero.
514 const SCEV *ScaledReg = nullptr;
515
516 /// An additional constant offset which added near the use. This requires a
517 /// temporary register, but the offset itself can live in an add immediate
518 /// field rather than a register.
519 Immediate UnfoldedOffset = Immediate::getZero();
520
521 Formula() = default;
522
523 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
524
525 bool isCanonical(const Loop &L) const;
526
527 void canonicalize(const Loop &L);
528
529 bool unscale();
530
531 bool hasZeroEnd() const;
532
533 size_t getNumRegs() const;
534 Type *getType() const;
535
536 void deleteBaseReg(const SCEV *&S);
537
538 bool referencesReg(const SCEV *S) const;
539 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
540 const RegUseTracker &RegUses) const;
541
542 void print(raw_ostream &OS) const;
543 void dump() const;
544};
545
546} // end anonymous namespace
547
548/// Recursion helper for initialMatch.
549static void DoInitialMatch(const SCEV *S, Loop *L,
552 ScalarEvolution &SE) {
553 // Collect expressions which properly dominate the loop header.
554 if (SE.properlyDominates(S, L->getHeader())) {
555 Good.push_back(S);
556 return;
557 }
558
559 // Look at add operands.
560 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
561 for (const SCEV *S : Add->operands())
562 DoInitialMatch(S, L, Good, Bad, SE);
563 return;
564 }
565
566 // Look at addrec operands.
567 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
568 if (!AR->getStart()->isZero() && AR->isAffine()) {
569 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
570 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
571 AR->getStepRecurrence(SE),
572 // FIXME: AR->getNoWrapFlags()
573 AR->getLoop(), SCEV::FlagAnyWrap),
574 L, Good, Bad, SE);
575 return;
576 }
577
578 // Handle a multiplication by -1 (negation) if it didn't fold.
579 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
580 if (Mul->getOperand(0)->isAllOnesValue()) {
582 const SCEV *NewMul = SE.getMulExpr(Ops);
583
586 DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
587 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
588 SE.getEffectiveSCEVType(NewMul->getType())));
589 for (const SCEV *S : MyGood)
590 Good.push_back(SE.getMulExpr(NegOne, S));
591 for (const SCEV *S : MyBad)
592 Bad.push_back(SE.getMulExpr(NegOne, S));
593 return;
594 }
595
596 // Ok, we can't do anything interesting. Just stuff the whole thing into a
597 // register and hope for the best.
598 Bad.push_back(S);
599}
600
601/// Incorporate loop-variant parts of S into this Formula, attempting to keep
602/// all loop-invariant and loop-computable values in a single base register.
603void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
606 DoInitialMatch(S, L, Good, Bad, SE);
607 if (!Good.empty()) {
608 const SCEV *Sum = SE.getAddExpr(Good);
609 if (!Sum->isZero())
610 BaseRegs.push_back(Sum);
611 HasBaseReg = true;
612 }
613 if (!Bad.empty()) {
614 const SCEV *Sum = SE.getAddExpr(Bad);
615 if (!Sum->isZero())
616 BaseRegs.push_back(Sum);
617 HasBaseReg = true;
618 }
619 canonicalize(*L);
620}
621
622static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
623 return SCEVExprContains(S, [&L](const SCEV *S) {
624 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
625 });
626}
627
628/// Check whether or not this formula satisfies the canonical
629/// representation.
630/// \see Formula::BaseRegs.
631bool Formula::isCanonical(const Loop &L) const {
632 if (!ScaledReg)
633 return BaseRegs.size() <= 1;
634
635 if (Scale != 1)
636 return true;
637
638 if (Scale == 1 && BaseRegs.empty())
639 return false;
640
641 if (containsAddRecDependentOnLoop(ScaledReg, L))
642 return true;
643
644 // If ScaledReg is not a recurrent expr, or it is but its loop is not current
645 // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
646 // loop, we want to swap the reg in BaseRegs with ScaledReg.
647 return none_of(BaseRegs, [&L](const SCEV *S) {
649 });
650}
651
652/// Helper method to morph a formula into its canonical representation.
653/// \see Formula::BaseRegs.
654/// Every formula having more than one base register, must use the ScaledReg
655/// field. Otherwise, we would have to do special cases everywhere in LSR
656/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
657/// On the other hand, 1*reg should be canonicalized into reg.
658void Formula::canonicalize(const Loop &L) {
659 if (isCanonical(L))
660 return;
661
662 if (BaseRegs.empty()) {
663 // No base reg? Use scale reg with scale = 1 as such.
664 assert(ScaledReg && "Expected 1*reg => reg");
665 assert(Scale == 1 && "Expected 1*reg => reg");
666 BaseRegs.push_back(ScaledReg);
667 Scale = 0;
668 ScaledReg = nullptr;
669 return;
670 }
671
672 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
673 if (!ScaledReg) {
674 ScaledReg = BaseRegs.pop_back_val();
675 Scale = 1;
676 }
677
678 // If ScaledReg is an invariant with respect to L, find the reg from
679 // BaseRegs containing the recurrent expr related with Loop L. Swap the
680 // reg with ScaledReg.
681 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
682 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
684 });
685 if (I != BaseRegs.end())
686 std::swap(ScaledReg, *I);
687 }
688 assert(isCanonical(L) && "Failed to canonicalize?");
689}
690
691/// Get rid of the scale in the formula.
692/// In other words, this method morphes reg1 + 1*reg2 into reg1 + reg2.
693/// \return true if it was possible to get rid of the scale, false otherwise.
694/// \note After this operation the formula may not be in the canonical form.
695bool Formula::unscale() {
696 if (Scale != 1)
697 return false;
698 Scale = 0;
699 BaseRegs.push_back(ScaledReg);
700 ScaledReg = nullptr;
701 return true;
702}
703
704bool Formula::hasZeroEnd() const {
705 if (UnfoldedOffset || BaseOffset)
706 return false;
707 if (BaseRegs.size() != 1 || ScaledReg)
708 return false;
709 return true;
710}
711
712/// Return the total number of register operands used by this formula. This does
713/// not include register uses implied by non-constant addrec strides.
714size_t Formula::getNumRegs() const {
715 return !!ScaledReg + BaseRegs.size();
716}
717
718/// Return the type of this formula, if it has one, or null otherwise. This type
719/// is meaningless except for the bit size.
720Type *Formula::getType() const {
721 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
722 ScaledReg ? ScaledReg->getType() :
723 BaseGV ? BaseGV->getType() :
724 nullptr;
725}
726
727/// Delete the given base reg from the BaseRegs list.
728void Formula::deleteBaseReg(const SCEV *&S) {
729 if (&S != &BaseRegs.back())
730 std::swap(S, BaseRegs.back());
731 BaseRegs.pop_back();
732}
733
734/// Test if this formula references the given register.
735bool Formula::referencesReg(const SCEV *S) const {
736 return S == ScaledReg || is_contained(BaseRegs, S);
737}
738
739/// Test whether this formula uses registers which are used by uses other than
740/// the use with the given index.
741bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
742 const RegUseTracker &RegUses) const {
743 if (ScaledReg)
744 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
745 return true;
746 for (const SCEV *BaseReg : BaseRegs)
747 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
748 return true;
749 return false;
750}
751
752#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
753void Formula::print(raw_ostream &OS) const {
754 bool First = true;
755 if (BaseGV) {
756 if (!First) OS << " + "; else First = false;
757 BaseGV->printAsOperand(OS, /*PrintType=*/false);
758 }
759 if (BaseOffset.isNonZero()) {
760 if (!First) OS << " + "; else First = false;
761 OS << BaseOffset;
762 }
763 for (const SCEV *BaseReg : BaseRegs) {
764 if (!First) OS << " + "; else First = false;
765 OS << "reg(" << *BaseReg << ')';
766 }
767 if (HasBaseReg && BaseRegs.empty()) {
768 if (!First) OS << " + "; else First = false;
769 OS << "**error: HasBaseReg**";
770 } else if (!HasBaseReg && !BaseRegs.empty()) {
771 if (!First) OS << " + "; else First = false;
772 OS << "**error: !HasBaseReg**";
773 }
774 if (Scale != 0) {
775 if (!First) OS << " + "; else First = false;
776 OS << Scale << "*reg(";
777 if (ScaledReg)
778 OS << *ScaledReg;
779 else
780 OS << "<unknown>";
781 OS << ')';
782 }
783 if (UnfoldedOffset.isNonZero()) {
784 if (!First) OS << " + ";
785 OS << "imm(" << UnfoldedOffset << ')';
786 }
787}
788
789LLVM_DUMP_METHOD void Formula::dump() const {
790 print(errs()); errs() << '\n';
791}
792#endif
793
794/// Return true if the given addrec can be sign-extended without changing its
795/// value.
797 Type *WideTy =
799 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
800}
801
802/// Return true if the given add can be sign-extended without changing its
803/// value.
804static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
805 Type *WideTy =
806 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
807 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
808}
809
810/// Return true if the given mul can be sign-extended without changing its
811/// value.
812static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
813 Type *WideTy =
815 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
816 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
817}
818
819/// Return an expression for LHS /s RHS, if it can be determined and if the
820/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
821/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
822/// the multiplication may overflow, which is useful when the result will be
823/// used in a context where the most significant bits are ignored.
824static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
825 ScalarEvolution &SE,
826 bool IgnoreSignificantBits = false) {
827 // Handle the trivial case, which works for any SCEV type.
828 if (LHS == RHS)
829 return SE.getConstant(LHS->getType(), 1);
830
831 // Handle a few RHS special cases.
832 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
833 if (RC) {
834 const APInt &RA = RC->getAPInt();
835 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
836 // some folding.
837 if (RA.isAllOnes()) {
838 if (LHS->getType()->isPointerTy())
839 return nullptr;
840 return SE.getMulExpr(LHS, RC);
841 }
842 // Handle x /s 1 as x.
843 if (RA == 1)
844 return LHS;
845 }
846
847 // Check for a division of a constant by a constant.
848 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
849 if (!RC)
850 return nullptr;
851 const APInt &LA = C->getAPInt();
852 const APInt &RA = RC->getAPInt();
853 if (LA.srem(RA) != 0)
854 return nullptr;
855 return SE.getConstant(LA.sdiv(RA));
856 }
857
858 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
859 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
860 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
861 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
862 IgnoreSignificantBits);
863 if (!Step) return nullptr;
864 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
865 IgnoreSignificantBits);
866 if (!Start) return nullptr;
867 // FlagNW is independent of the start value, step direction, and is
868 // preserved with smaller magnitude steps.
869 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
870 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
871 }
872 return nullptr;
873 }
874
875 // Distribute the sdiv over add operands, if the add doesn't overflow.
876 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
877 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
879 for (const SCEV *S : Add->operands()) {
880 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
881 if (!Op) return nullptr;
882 Ops.push_back(Op);
883 }
884 return SE.getAddExpr(Ops);
885 }
886 return nullptr;
887 }
888
889 // Check for a multiply operand that we can pull RHS out of.
890 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
891 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
892 // Handle special case C1*X*Y /s C2*X*Y.
893 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
894 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
895 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
896 const SCEVConstant *RC =
897 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
898 if (LC && RC) {
900 SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
901 if (LOps == ROps)
902 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
903 }
904 }
905 }
906
908 bool Found = false;
909 for (const SCEV *S : Mul->operands()) {
910 if (!Found)
911 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
912 IgnoreSignificantBits)) {
913 S = Q;
914 Found = true;
915 }
916 Ops.push_back(S);
917 }
918 return Found ? SE.getMulExpr(Ops) : nullptr;
919 }
920 return nullptr;
921 }
922
923 // Otherwise we don't know.
924 return nullptr;
925}
926
927/// If S involves the addition of a constant integer value, return that integer
928/// value, and mutate S to point to a new SCEV with that value excluded.
929static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
930 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
931 if (C->getAPInt().getSignificantBits() <= 64) {
932 S = SE.getConstant(C->getType(), 0);
933 return Immediate::getFixed(C->getValue()->getSExtValue());
934 }
935 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
936 SmallVector<const SCEV *, 8> NewOps(Add->operands());
937 Immediate Result = ExtractImmediate(NewOps.front(), SE);
938 if (Result.isNonZero())
939 S = SE.getAddExpr(NewOps);
940 return Result;
941 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
942 SmallVector<const SCEV *, 8> NewOps(AR->operands());
943 Immediate Result = ExtractImmediate(NewOps.front(), SE);
944 if (Result.isNonZero())
945 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
946 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
948 return Result;
949 } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
950 if (EnableVScaleImmediates && M->getNumOperands() == 2) {
951 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
952 if (isa<SCEVVScale>(M->getOperand(1))) {
953 S = SE.getConstant(M->getType(), 0);
954 return Immediate::getScalable(C->getValue()->getSExtValue());
955 }
956 }
957 }
958 return Immediate::getZero();
959}
960
961/// If S involves the addition of a GlobalValue address, return that symbol, and
962/// mutate S to point to a new SCEV with that value excluded.
964 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
965 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
966 S = SE.getConstant(GV->getType(), 0);
967 return GV;
968 }
969 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
970 SmallVector<const SCEV *, 8> NewOps(Add->operands());
971 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
972 if (Result)
973 S = SE.getAddExpr(NewOps);
974 return Result;
975 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
976 SmallVector<const SCEV *, 8> NewOps(AR->operands());
977 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
978 if (Result)
979 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
980 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
982 return Result;
983 }
984 return nullptr;
985}
986
987/// Returns true if the specified instruction is using the specified value as an
988/// address.
990 Instruction *Inst, Value *OperandVal) {
991 bool isAddress = isa<LoadInst>(Inst);
992 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
993 if (SI->getPointerOperand() == OperandVal)
994 isAddress = true;
995 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
996 // Addressing modes can also be folded into prefetches and a variety
997 // of intrinsics.
998 switch (II->getIntrinsicID()) {
999 case Intrinsic::memset:
1000 case Intrinsic::prefetch:
1001 case Intrinsic::masked_load:
1002 if (II->getArgOperand(0) == OperandVal)
1003 isAddress = true;
1004 break;
1005 case Intrinsic::masked_store:
1006 if (II->getArgOperand(1) == OperandVal)
1007 isAddress = true;
1008 break;
1009 case Intrinsic::memmove:
1010 case Intrinsic::memcpy:
1011 if (II->getArgOperand(0) == OperandVal ||
1012 II->getArgOperand(1) == OperandVal)
1013 isAddress = true;
1014 break;
1015 default: {
1016 MemIntrinsicInfo IntrInfo;
1017 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1018 if (IntrInfo.PtrVal == OperandVal)
1019 isAddress = true;
1020 }
1021 }
1022 }
1023 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1024 if (RMW->getPointerOperand() == OperandVal)
1025 isAddress = true;
1026 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1027 if (CmpX->getPointerOperand() == OperandVal)
1028 isAddress = true;
1029 }
1030 return isAddress;
1031}
1032
1033/// Return the type of the memory being accessed.
1034static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1035 Instruction *Inst, Value *OperandVal) {
1036 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1037
1038 // First get the type of memory being accessed.
1039 if (Type *Ty = Inst->getAccessType())
1040 AccessTy.MemTy = Ty;
1041
1042 // Then get the pointer address space.
1043 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1044 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1045 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1046 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1047 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1048 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1049 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1050 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1051 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1052 switch (II->getIntrinsicID()) {
1053 case Intrinsic::prefetch:
1054 case Intrinsic::memset:
1055 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1056 AccessTy.MemTy = OperandVal->getType();
1057 break;
1058 case Intrinsic::memmove:
1059 case Intrinsic::memcpy:
1060 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1061 AccessTy.MemTy = OperandVal->getType();
1062 break;
1063 case Intrinsic::masked_load:
1064 AccessTy.AddrSpace =
1065 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1066 break;
1067 case Intrinsic::masked_store:
1068 AccessTy.AddrSpace =
1069 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1070 break;
1071 default: {
1072 MemIntrinsicInfo IntrInfo;
1073 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1074 AccessTy.AddrSpace
1075 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1076 }
1077
1078 break;
1079 }
1080 }
1081 }
1082
1083 return AccessTy;
1084}
1085
1086/// Return true if this AddRec is already a phi in its loop.
1087static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1088 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1089 if (SE.isSCEVable(PN.getType()) &&
1090 (SE.getEffectiveSCEVType(PN.getType()) ==
1091 SE.getEffectiveSCEVType(AR->getType())) &&
1092 SE.getSCEV(&PN) == AR)
1093 return true;
1094 }
1095 return false;
1096}
1097
1098/// Check if expanding this expression is likely to incur significant cost. This
1099/// is tricky because SCEV doesn't track which expressions are actually computed
1100/// by the current IR.
1101///
1102/// We currently allow expansion of IV increments that involve adds,
1103/// multiplication by constants, and AddRecs from existing phis.
1104///
1105/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1106/// obvious multiple of the UDivExpr.
1107static bool isHighCostExpansion(const SCEV *S,
1109 ScalarEvolution &SE) {
1110 // Zero/One operand expressions
1111 switch (S->getSCEVType()) {
1112 case scUnknown:
1113 case scConstant:
1114 case scVScale:
1115 return false;
1116 case scTruncate:
1117 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1118 Processed, SE);
1119 case scZeroExtend:
1120 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1121 Processed, SE);
1122 case scSignExtend:
1123 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1124 Processed, SE);
1125 default:
1126 break;
1127 }
1128
1129 if (!Processed.insert(S).second)
1130 return false;
1131
1132 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1133 for (const SCEV *S : Add->operands()) {
1134 if (isHighCostExpansion(S, Processed, SE))
1135 return true;
1136 }
1137 return false;
1138 }
1139
1140 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1141 if (Mul->getNumOperands() == 2) {
1142 // Multiplication by a constant is ok
1143 if (isa<SCEVConstant>(Mul->getOperand(0)))
1144 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1145
1146 // If we have the value of one operand, check if an existing
1147 // multiplication already generates this expression.
1148 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1149 Value *UVal = U->getValue();
1150 for (User *UR : UVal->users()) {
1151 // If U is a constant, it may be used by a ConstantExpr.
1152 Instruction *UI = dyn_cast<Instruction>(UR);
1153 if (UI && UI->getOpcode() == Instruction::Mul &&
1154 SE.isSCEVable(UI->getType())) {
1155 return SE.getSCEV(UI) == Mul;
1156 }
1157 }
1158 }
1159 }
1160 }
1161
1162 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1163 if (isExistingPhi(AR, SE))
1164 return false;
1165 }
1166
1167 // Fow now, consider any other type of expression (div/mul/min/max) high cost.
1168 return true;
1169}
1170
1171namespace {
1172
1173class LSRUse;
1174
1175} // end anonymous namespace
1176
1177/// Check if the addressing mode defined by \p F is completely
1178/// folded in \p LU at isel time.
1179/// This includes address-mode folding and special icmp tricks.
1180/// This function returns true if \p LU can accommodate what \p F
1181/// defines and up to 1 base + 1 scaled + offset.
1182/// In other words, if \p F has several base registers, this function may
1183/// still return true. Therefore, users still need to account for
1184/// additional base registers and/or unfolded offsets to derive an
1185/// accurate cost model.
1187 const LSRUse &LU, const Formula &F);
1188
1189// Get the cost of the scaling factor used in F for LU.
1191 const LSRUse &LU, const Formula &F,
1192 const Loop &L);
1193
1194namespace {
1195
1196/// This class is used to measure and compare candidate formulae.
1197class Cost {
1198 const Loop *L = nullptr;
1199 ScalarEvolution *SE = nullptr;
1200 const TargetTransformInfo *TTI = nullptr;
1203
1204public:
1205 Cost() = delete;
1206 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1208 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1209 C.Insns = 0;
1210 C.NumRegs = 0;
1211 C.AddRecCost = 0;
1212 C.NumIVMuls = 0;
1213 C.NumBaseAdds = 0;
1214 C.ImmCost = 0;
1215 C.SetupCost = 0;
1216 C.ScaleCost = 0;
1217 }
1218
1219 bool isLess(const Cost &Other) const;
1220
1221 void Lose();
1222
1223#ifndef NDEBUG
1224 // Once any of the metrics loses, they must all remain losers.
1225 bool isValid() {
1226 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1227 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1228 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1229 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1230 }
1231#endif
1232
1233 bool isLoser() {
1234 assert(isValid() && "invalid cost");
1235 return C.NumRegs == ~0u;
1236 }
1237
1238 void RateFormula(const Formula &F,
1240 const DenseSet<const SCEV *> &VisitedRegs,
1241 const LSRUse &LU,
1242 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1243
1244 void print(raw_ostream &OS) const;
1245 void dump() const;
1246
1247private:
1248 void RateRegister(const Formula &F, const SCEV *Reg,
1250 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1253};
1254
1255/// An operand value in an instruction which is to be replaced with some
1256/// equivalent, possibly strength-reduced, replacement.
1257struct LSRFixup {
1258 /// The instruction which will be updated.
1259 Instruction *UserInst = nullptr;
1260
1261 /// The operand of the instruction which will be replaced. The operand may be
1262 /// used more than once; every instance will be replaced.
1263 Value *OperandValToReplace = nullptr;
1264
1265 /// If this user is to use the post-incremented value of an induction
1266 /// variable, this set is non-empty and holds the loops associated with the
1267 /// induction variable.
1268 PostIncLoopSet PostIncLoops;
1269
1270 /// A constant offset to be added to the LSRUse expression. This allows
1271 /// multiple fixups to share the same LSRUse with different offsets, for
1272 /// example in an unrolled loop.
1273 Immediate Offset = Immediate::getZero();
1274
1275 LSRFixup() = default;
1276
1277 bool isUseFullyOutsideLoop(const Loop *L) const;
1278
1279 void print(raw_ostream &OS) const;
1280 void dump() const;
1281};
1282
1283/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1284/// SmallVectors of const SCEV*.
1285struct UniquifierDenseMapInfo {
1286 static SmallVector<const SCEV *, 4> getEmptyKey() {
1288 V.push_back(reinterpret_cast<const SCEV *>(-1));
1289 return V;
1290 }
1291
1292 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1294 V.push_back(reinterpret_cast<const SCEV *>(-2));
1295 return V;
1296 }
1297
1298 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1299 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1300 }
1301
1302 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1304 return LHS == RHS;
1305 }
1306};
1307
1308/// This class holds the state that LSR keeps for each use in IVUsers, as well
1309/// as uses invented by LSR itself. It includes information about what kinds of
1310/// things can be folded into the user, information about the user itself, and
1311/// information about how the use may be satisfied. TODO: Represent multiple
1312/// users of the same expression in common?
1313class LSRUse {
1314 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1315
1316public:
1317 /// An enum for a kind of use, indicating what types of scaled and immediate
1318 /// operands it might support.
1319 enum KindType {
1320 Basic, ///< A normal use, with no folding.
1321 Special, ///< A special case of basic, allowing -1 scales.
1322 Address, ///< An address use; folding according to TargetLowering
1323 ICmpZero ///< An equality icmp with both operands folded into one.
1324 // TODO: Add a generic icmp too?
1325 };
1326
1327 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1328
1329 KindType Kind;
1330 MemAccessTy AccessTy;
1331
1332 /// The list of operands which are to be replaced.
1334
1335 /// Keep track of the min and max offsets of the fixups.
1336 Immediate MinOffset = Immediate::getFixedMax();
1337 Immediate MaxOffset = Immediate::getFixedMin();
1338
1339 /// This records whether all of the fixups using this LSRUse are outside of
1340 /// the loop, in which case some special-case heuristics may be used.
1341 bool AllFixupsOutsideLoop = true;
1342
1343 /// RigidFormula is set to true to guarantee that this use will be associated
1344 /// with a single formula--the one that initially matched. Some SCEV
1345 /// expressions cannot be expanded. This allows LSR to consider the registers
1346 /// used by those expressions without the need to expand them later after
1347 /// changing the formula.
1348 bool RigidFormula = false;
1349
1350 /// This records the widest use type for any fixup using this
1351 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1352 /// fixup widths to be equivalent, because the narrower one may be relying on
1353 /// the implicit truncation to truncate away bogus bits.
1354 Type *WidestFixupType = nullptr;
1355
1356 /// A list of ways to build a value that can satisfy this user. After the
1357 /// list is populated, one of these is selected heuristically and used to
1358 /// formulate a replacement for OperandValToReplace in UserInst.
1359 SmallVector<Formula, 12> Formulae;
1360
1361 /// The set of register candidates used by all formulae in this LSRUse.
1363
1364 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1365
1366 LSRFixup &getNewFixup() {
1367 Fixups.push_back(LSRFixup());
1368 return Fixups.back();
1369 }
1370
1371 void pushFixup(LSRFixup &f) {
1372 Fixups.push_back(f);
1373 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1374 MaxOffset = f.Offset;
1375 if (Immediate::isKnownLT(f.Offset, MinOffset))
1376 MinOffset = f.Offset;
1377 }
1378
1379 bool HasFormulaWithSameRegs(const Formula &F) const;
1380 float getNotSelectedProbability(const SCEV *Reg) const;
1381 bool InsertFormula(const Formula &F, const Loop &L);
1382 void DeleteFormula(Formula &F);
1383 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1384
1385 void print(raw_ostream &OS) const;
1386 void dump() const;
1387};
1388
1389} // end anonymous namespace
1390
1392 LSRUse::KindType Kind, MemAccessTy AccessTy,
1393 GlobalValue *BaseGV, Immediate BaseOffset,
1394 bool HasBaseReg, int64_t Scale,
1395 Instruction *Fixup = nullptr);
1396
1397static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1398 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1399 return 1;
1400 if (Depth == 0)
1401 return 0;
1402 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1403 return getSetupCost(S->getStart(), Depth - 1);
1404 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1405 return getSetupCost(S->getOperand(), Depth - 1);
1406 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1407 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1408 [&](unsigned i, const SCEV *Reg) {
1409 return i + getSetupCost(Reg, Depth - 1);
1410 });
1411 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1412 return getSetupCost(S->getLHS(), Depth - 1) +
1413 getSetupCost(S->getRHS(), Depth - 1);
1414 return 0;
1415}
1416
1417/// Tally up interesting quantities from the given register.
1418void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1420 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1421 // If this is an addrec for another loop, it should be an invariant
1422 // with respect to L since L is the innermost loop (at least
1423 // for now LSR only handles innermost loops).
1424 if (AR->getLoop() != L) {
1425 // If the AddRec exists, consider it's register free and leave it alone.
1426 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1427 return;
1428
1429 // It is bad to allow LSR for current loop to add induction variables
1430 // for its sibling loops.
1431 if (!AR->getLoop()->contains(L)) {
1432 Lose();
1433 return;
1434 }
1435
1436 // Otherwise, it will be an invariant with respect to Loop L.
1437 ++C.NumRegs;
1438 return;
1439 }
1440
1441 unsigned LoopCost = 1;
1442 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1443 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1444
1445 // If the step size matches the base offset, we could use pre-indexed
1446 // addressing.
1447 if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
1448 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1449 if (Step->getAPInt() == F.BaseOffset.getFixedValue())
1450 LoopCost = 0;
1451 } else if (AMK == TTI::AMK_PostIndexed) {
1452 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1453 if (isa<SCEVConstant>(LoopStep)) {
1454 const SCEV *LoopStart = AR->getStart();
1455 if (!isa<SCEVConstant>(LoopStart) &&
1456 SE->isLoopInvariant(LoopStart, L))
1457 LoopCost = 0;
1458 }
1459 }
1460 }
1461 C.AddRecCost += LoopCost;
1462
1463 // Add the step value register, if it needs one.
1464 // TODO: The non-affine case isn't precisely modeled here.
1465 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1466 if (!Regs.count(AR->getOperand(1))) {
1467 RateRegister(F, AR->getOperand(1), Regs);
1468 if (isLoser())
1469 return;
1470 }
1471 }
1472 }
1473 ++C.NumRegs;
1474
1475 // Rough heuristic; favor registers which don't require extra setup
1476 // instructions in the preheader.
1477 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1478 // Ensure we don't, even with the recusion limit, produce invalid costs.
1479 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1480
1481 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1482 SE->hasComputableLoopEvolution(Reg, L);
1483}
1484
1485/// Record this register in the set. If we haven't seen it before, rate
1486/// it. Optional LoserRegs provides a way to declare any formula that refers to
1487/// one of those regs an instant loser.
1488void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1490 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1491 if (LoserRegs && LoserRegs->count(Reg)) {
1492 Lose();
1493 return;
1494 }
1495 if (Regs.insert(Reg).second) {
1496 RateRegister(F, Reg, Regs);
1497 if (LoserRegs && isLoser())
1498 LoserRegs->insert(Reg);
1499 }
1500}
1501
1502void Cost::RateFormula(const Formula &F,
1504 const DenseSet<const SCEV *> &VisitedRegs,
1505 const LSRUse &LU,
1506 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1507 if (isLoser())
1508 return;
1509 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1510 // Tally up the registers.
1511 unsigned PrevAddRecCost = C.AddRecCost;
1512 unsigned PrevNumRegs = C.NumRegs;
1513 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1514 if (const SCEV *ScaledReg = F.ScaledReg) {
1515 if (VisitedRegs.count(ScaledReg)) {
1516 Lose();
1517 return;
1518 }
1519 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1520 if (isLoser())
1521 return;
1522 }
1523 for (const SCEV *BaseReg : F.BaseRegs) {
1524 if (VisitedRegs.count(BaseReg)) {
1525 Lose();
1526 return;
1527 }
1528 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1529 if (isLoser())
1530 return;
1531 }
1532
1533 // Determine how many (unfolded) adds we'll need inside the loop.
1534 size_t NumBaseParts = F.getNumRegs();
1535 if (NumBaseParts > 1)
1536 // Do not count the base and a possible second register if the target
1537 // allows to fold 2 registers.
1538 C.NumBaseAdds +=
1539 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1540 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1541
1542 // Accumulate non-free scaling amounts.
1543 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1544
1545 // Tally up the non-zero immediates.
1546 for (const LSRFixup &Fixup : LU.Fixups) {
1547 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1548 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1549 if (F.BaseGV)
1550 C.ImmCost += 64; // Handle symbolic values conservatively.
1551 // TODO: This should probably be the pointer size.
1552 else if (Offset.isNonZero())
1553 C.ImmCost +=
1554 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1555
1556 // Check with target if this offset with this instruction is
1557 // specifically not supported.
1558 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1559 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1560 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1561 C.NumBaseAdds++;
1562 } else {
1563 // Incompatible immediate type, increase cost to avoid using
1564 C.ImmCost += 2048;
1565 }
1566 }
1567
1568 // If we don't count instruction cost exit here.
1569 if (!InsnsCost) {
1570 assert(isValid() && "invalid cost");
1571 return;
1572 }
1573
1574 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1575 // additional instruction (at least fill).
1576 // TODO: Need distinguish register class?
1577 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1578 TTI->getRegisterClassForType(false, F.getType())) - 1;
1579 if (C.NumRegs > TTIRegNum) {
1580 // Cost already exceeded TTIRegNum, then only newly added register can add
1581 // new instructions.
1582 if (PrevNumRegs > TTIRegNum)
1583 C.Insns += (C.NumRegs - PrevNumRegs);
1584 else
1585 C.Insns += (C.NumRegs - TTIRegNum);
1586 }
1587
1588 // If ICmpZero formula ends with not 0, it could not be replaced by
1589 // just add or sub. We'll need to compare final result of AddRec.
1590 // That means we'll need an additional instruction. But if the target can
1591 // macro-fuse a compare with a branch, don't count this extra instruction.
1592 // For -10 + {0, +, 1}:
1593 // i = i + 1;
1594 // cmp i, 10
1595 //
1596 // For {-10, +, 1}:
1597 // i = i + 1;
1598 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1599 !TTI->canMacroFuseCmp())
1600 C.Insns++;
1601 // Each new AddRec adds 1 instruction to calculation.
1602 C.Insns += (C.AddRecCost - PrevAddRecCost);
1603
1604 // BaseAdds adds instructions for unfolded registers.
1605 if (LU.Kind != LSRUse::ICmpZero)
1606 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1607 assert(isValid() && "invalid cost");
1608}
1609
1610/// Set this cost to a losing value.
1611void Cost::Lose() {
1612 C.Insns = std::numeric_limits<unsigned>::max();
1613 C.NumRegs = std::numeric_limits<unsigned>::max();
1614 C.AddRecCost = std::numeric_limits<unsigned>::max();
1615 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1616 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1617 C.ImmCost = std::numeric_limits<unsigned>::max();
1618 C.SetupCost = std::numeric_limits<unsigned>::max();
1619 C.ScaleCost = std::numeric_limits<unsigned>::max();
1620}
1621
1622/// Choose the lower cost.
1623bool Cost::isLess(const Cost &Other) const {
1624 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1625 C.Insns != Other.C.Insns)
1626 return C.Insns < Other.C.Insns;
1627 return TTI->isLSRCostLess(C, Other.C);
1628}
1629
1630#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1631void Cost::print(raw_ostream &OS) const {
1632 if (InsnsCost)
1633 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1634 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1635 if (C.AddRecCost != 0)
1636 OS << ", with addrec cost " << C.AddRecCost;
1637 if (C.NumIVMuls != 0)
1638 OS << ", plus " << C.NumIVMuls << " IV mul"
1639 << (C.NumIVMuls == 1 ? "" : "s");
1640 if (C.NumBaseAdds != 0)
1641 OS << ", plus " << C.NumBaseAdds << " base add"
1642 << (C.NumBaseAdds == 1 ? "" : "s");
1643 if (C.ScaleCost != 0)
1644 OS << ", plus " << C.ScaleCost << " scale cost";
1645 if (C.ImmCost != 0)
1646 OS << ", plus " << C.ImmCost << " imm cost";
1647 if (C.SetupCost != 0)
1648 OS << ", plus " << C.SetupCost << " setup cost";
1649}
1650
1651LLVM_DUMP_METHOD void Cost::dump() const {
1652 print(errs()); errs() << '\n';
1653}
1654#endif
1655
1656/// Test whether this fixup always uses its value outside of the given loop.
1657bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1658 // PHI nodes use their value in their incoming blocks.
1659 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1660 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1661 if (PN->getIncomingValue(i) == OperandValToReplace &&
1662 L->contains(PN->getIncomingBlock(i)))
1663 return false;
1664 return true;
1665 }
1666
1667 return !L->contains(UserInst);
1668}
1669
1670#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1671void LSRFixup::print(raw_ostream &OS) const {
1672 OS << "UserInst=";
1673 // Store is common and interesting enough to be worth special-casing.
1674 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1675 OS << "store ";
1676 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1677 } else if (UserInst->getType()->isVoidTy())
1678 OS << UserInst->getOpcodeName();
1679 else
1680 UserInst->printAsOperand(OS, /*PrintType=*/false);
1681
1682 OS << ", OperandValToReplace=";
1683 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1684
1685 for (const Loop *PIL : PostIncLoops) {
1686 OS << ", PostIncLoop=";
1687 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1688 }
1689
1690 if (Offset.isNonZero())
1691 OS << ", Offset=" << Offset;
1692}
1693
1694LLVM_DUMP_METHOD void LSRFixup::dump() const {
1695 print(errs()); errs() << '\n';
1696}
1697#endif
1698
1699/// Test whether this use as a formula which has the same registers as the given
1700/// formula.
1701bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1703 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1704 // Unstable sort by host order ok, because this is only used for uniquifying.
1705 llvm::sort(Key);
1706 return Uniquifier.count(Key);
1707}
1708
1709/// The function returns a probability of selecting formula without Reg.
1710float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1711 unsigned FNum = 0;
1712 for (const Formula &F : Formulae)
1713 if (F.referencesReg(Reg))
1714 FNum++;
1715 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1716}
1717
1718/// If the given formula has not yet been inserted, add it to the list, and
1719/// return true. Return false otherwise. The formula must be in canonical form.
1720bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1721 assert(F.isCanonical(L) && "Invalid canonical representation");
1722
1723 if (!Formulae.empty() && RigidFormula)
1724 return false;
1725
1727 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1728 // Unstable sort by host order ok, because this is only used for uniquifying.
1729 llvm::sort(Key);
1730
1731 if (!Uniquifier.insert(Key).second)
1732 return false;
1733
1734 // Using a register to hold the value of 0 is not profitable.
1735 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1736 "Zero allocated in a scaled register!");
1737#ifndef NDEBUG
1738 for (const SCEV *BaseReg : F.BaseRegs)
1739 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1740#endif
1741
1742 // Add the formula to the list.
1743 Formulae.push_back(F);
1744
1745 // Record registers now being used by this use.
1746 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1747 if (F.ScaledReg)
1748 Regs.insert(F.ScaledReg);
1749
1750 return true;
1751}
1752
1753/// Remove the given formula from this use's list.
1754void LSRUse::DeleteFormula(Formula &F) {
1755 if (&F != &Formulae.back())
1756 std::swap(F, Formulae.back());
1757 Formulae.pop_back();
1758}
1759
1760/// Recompute the Regs field, and update RegUses.
1761void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1762 // Now that we've filtered out some formulae, recompute the Regs set.
1763 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1764 Regs.clear();
1765 for (const Formula &F : Formulae) {
1766 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1767 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1768 }
1769
1770 // Update the RegTracker.
1771 for (const SCEV *S : OldRegs)
1772 if (!Regs.count(S))
1773 RegUses.dropRegister(S, LUIdx);
1774}
1775
1776#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1777void LSRUse::print(raw_ostream &OS) const {
1778 OS << "LSR Use: Kind=";
1779 switch (Kind) {
1780 case Basic: OS << "Basic"; break;
1781 case Special: OS << "Special"; break;
1782 case ICmpZero: OS << "ICmpZero"; break;
1783 case Address:
1784 OS << "Address of ";
1785 if (AccessTy.MemTy->isPointerTy())
1786 OS << "pointer"; // the full pointer type could be really verbose
1787 else {
1788 OS << *AccessTy.MemTy;
1789 }
1790
1791 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1792 }
1793
1794 OS << ", Offsets={";
1795 bool NeedComma = false;
1796 for (const LSRFixup &Fixup : Fixups) {
1797 if (NeedComma) OS << ',';
1798 OS << Fixup.Offset;
1799 NeedComma = true;
1800 }
1801 OS << '}';
1802
1803 if (AllFixupsOutsideLoop)
1804 OS << ", all-fixups-outside-loop";
1805
1806 if (WidestFixupType)
1807 OS << ", widest fixup type: " << *WidestFixupType;
1808}
1809
1810LLVM_DUMP_METHOD void LSRUse::dump() const {
1811 print(errs()); errs() << '\n';
1812}
1813#endif
1814
1816 LSRUse::KindType Kind, MemAccessTy AccessTy,
1817 GlobalValue *BaseGV, Immediate BaseOffset,
1818 bool HasBaseReg, int64_t Scale,
1819 Instruction *Fixup /* = nullptr */) {
1820 switch (Kind) {
1821 case LSRUse::Address: {
1822 int64_t FixedOffset =
1823 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1824 int64_t ScalableOffset =
1825 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1826 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1827 HasBaseReg, Scale, AccessTy.AddrSpace,
1828 Fixup, ScalableOffset);
1829 }
1830 case LSRUse::ICmpZero:
1831 // There's not even a target hook for querying whether it would be legal to
1832 // fold a GV into an ICmp.
1833 if (BaseGV)
1834 return false;
1835
1836 // ICmp only has two operands; don't allow more than two non-trivial parts.
1837 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1838 return false;
1839
1840 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1841 // putting the scaled register in the other operand of the icmp.
1842 if (Scale != 0 && Scale != -1)
1843 return false;
1844
1845 // If we have low-level target information, ask the target if it can fold an
1846 // integer immediate on an icmp.
1847 if (BaseOffset.isNonZero()) {
1848 // We don't have an interface to query whether the target supports
1849 // icmpzero against scalable quantities yet.
1850 if (BaseOffset.isScalable())
1851 return false;
1852
1853 // We have one of:
1854 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1855 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1856 // Offs is the ICmp immediate.
1857 if (Scale == 0)
1858 // The cast does the right thing with
1859 // std::numeric_limits<int64_t>::min().
1860 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1861 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1862 }
1863
1864 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1865 return true;
1866
1867 case LSRUse::Basic:
1868 // Only handle single-register values.
1869 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1870
1871 case LSRUse::Special:
1872 // Special case Basic to handle -1 scales.
1873 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1874 }
1875
1876 llvm_unreachable("Invalid LSRUse Kind!");
1877}
1878
1880 Immediate MinOffset, Immediate MaxOffset,
1881 LSRUse::KindType Kind, MemAccessTy AccessTy,
1882 GlobalValue *BaseGV, Immediate BaseOffset,
1883 bool HasBaseReg, int64_t Scale) {
1884 if (BaseOffset.isNonZero() &&
1885 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1886 BaseOffset.isScalable() != MaxOffset.isScalable()))
1887 return false;
1888 // Check for overflow.
1889 int64_t Base = BaseOffset.getKnownMinValue();
1890 int64_t Min = MinOffset.getKnownMinValue();
1891 int64_t Max = MaxOffset.getKnownMinValue();
1892 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1893 return false;
1894 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1895 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1896 return false;
1897 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1898
1899 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1900 HasBaseReg, Scale) &&
1901 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1902 HasBaseReg, Scale);
1903}
1904
1906 Immediate MinOffset, Immediate MaxOffset,
1907 LSRUse::KindType Kind, MemAccessTy AccessTy,
1908 const Formula &F, const Loop &L) {
1909 // For the purpose of isAMCompletelyFolded either having a canonical formula
1910 // or a scale not equal to zero is correct.
1911 // Problems may arise from non canonical formulae having a scale == 0.
1912 // Strictly speaking it would best to just rely on canonical formulae.
1913 // However, when we generate the scaled formulae, we first check that the
1914 // scaling factor is profitable before computing the actual ScaledReg for
1915 // compile time sake.
1916 assert((F.isCanonical(L) || F.Scale != 0));
1917 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1918 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1919}
1920
1921/// Test whether we know how to expand the current formula.
1922static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1923 Immediate MaxOffset, LSRUse::KindType Kind,
1924 MemAccessTy AccessTy, GlobalValue *BaseGV,
1925 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1926 // We know how to expand completely foldable formulae.
1927 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1928 BaseOffset, HasBaseReg, Scale) ||
1929 // Or formulae that use a base register produced by a sum of base
1930 // registers.
1931 (Scale == 1 &&
1932 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1933 BaseGV, BaseOffset, true, 0));
1934}
1935
1936static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1937 Immediate MaxOffset, LSRUse::KindType Kind,
1938 MemAccessTy AccessTy, const Formula &F) {
1939 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1940 F.BaseOffset, F.HasBaseReg, F.Scale);
1941}
1942
1944 Immediate Offset) {
1945 if (Offset.isScalable())
1946 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1947
1948 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1949}
1950
1952 const LSRUse &LU, const Formula &F) {
1953 // Target may want to look at the user instructions.
1954 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1955 for (const LSRFixup &Fixup : LU.Fixups)
1956 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1957 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1958 F.Scale, Fixup.UserInst))
1959 return false;
1960 return true;
1961 }
1962
1963 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1964 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1965 F.Scale);
1966}
1967
1969 const LSRUse &LU, const Formula &F,
1970 const Loop &L) {
1971 if (!F.Scale)
1972 return 0;
1973
1974 // If the use is not completely folded in that instruction, we will have to
1975 // pay an extra cost only for scale != 1.
1976 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1977 LU.AccessTy, F, L))
1978 return F.Scale != 1;
1979
1980 switch (LU.Kind) {
1981 case LSRUse::Address: {
1982 // Check the scaling factor cost with both the min and max offsets.
1983 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1984 if (F.BaseOffset.isScalable()) {
1985 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1986 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1987 } else {
1988 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1989 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1990 }
1991 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1992 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1993 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1994 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1995 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1996 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1997
1998 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1999 "Legal addressing mode has an illegal cost!");
2000 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
2001 }
2002 case LSRUse::ICmpZero:
2003 case LSRUse::Basic:
2004 case LSRUse::Special:
2005 // The use is completely folded, i.e., everything is folded into the
2006 // instruction.
2007 return 0;
2008 }
2009
2010 llvm_unreachable("Invalid LSRUse Kind!");
2011}
2012
2014 LSRUse::KindType Kind, MemAccessTy AccessTy,
2015 GlobalValue *BaseGV, Immediate BaseOffset,
2016 bool HasBaseReg) {
2017 // Fast-path: zero is always foldable.
2018 if (BaseOffset.isZero() && !BaseGV)
2019 return true;
2020
2021 // Conservatively, create an address with an immediate and a
2022 // base and a scale.
2023 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2024
2025 // Canonicalize a scale of 1 to a base register if the formula doesn't
2026 // already have a base register.
2027 if (!HasBaseReg && Scale == 1) {
2028 Scale = 0;
2029 HasBaseReg = true;
2030 }
2031
2032 // FIXME: Try with + without a scale? Maybe based on TTI?
2033 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2034 // default for many architectures, not just AArch64 SVE. More investigation
2035 // needed later to determine if this should be used more widely than just
2036 // on scalable types.
2037 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2038 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2039 Scale = 0;
2040
2041 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2042 HasBaseReg, Scale);
2043}
2044
2046 ScalarEvolution &SE, Immediate MinOffset,
2047 Immediate MaxOffset, LSRUse::KindType Kind,
2048 MemAccessTy AccessTy, const SCEV *S,
2049 bool HasBaseReg) {
2050 // Fast-path: zero is always foldable.
2051 if (S->isZero()) return true;
2052
2053 // Conservatively, create an address with an immediate and a
2054 // base and a scale.
2055 Immediate BaseOffset = ExtractImmediate(S, SE);
2056 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2057
2058 // If there's anything else involved, it's not foldable.
2059 if (!S->isZero()) return false;
2060
2061 // Fast-path: zero is always foldable.
2062 if (BaseOffset.isZero() && !BaseGV)
2063 return true;
2064
2065 if (BaseOffset.isScalable())
2066 return false;
2067
2068 // Conservatively, create an address with an immediate and a
2069 // base and a scale.
2070 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2071
2072 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2073 BaseOffset, HasBaseReg, Scale);
2074}
2075
2076namespace {
2077
2078/// An individual increment in a Chain of IV increments. Relate an IV user to
2079/// an expression that computes the IV it uses from the IV used by the previous
2080/// link in the Chain.
2081///
2082/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2083/// original IVOperand. The head of the chain's IVOperand is only valid during
2084/// chain collection, before LSR replaces IV users. During chain generation,
2085/// IncExpr can be used to find the new IVOperand that computes the same
2086/// expression.
2087struct IVInc {
2088 Instruction *UserInst;
2089 Value* IVOperand;
2090 const SCEV *IncExpr;
2091
2092 IVInc(Instruction *U, Value *O, const SCEV *E)
2093 : UserInst(U), IVOperand(O), IncExpr(E) {}
2094};
2095
2096// The list of IV increments in program order. We typically add the head of a
2097// chain without finding subsequent links.
2098struct IVChain {
2100 const SCEV *ExprBase = nullptr;
2101
2102 IVChain() = default;
2103 IVChain(const IVInc &Head, const SCEV *Base)
2104 : Incs(1, Head), ExprBase(Base) {}
2105
2107
2108 // Return the first increment in the chain.
2109 const_iterator begin() const {
2110 assert(!Incs.empty());
2111 return std::next(Incs.begin());
2112 }
2113 const_iterator end() const {
2114 return Incs.end();
2115 }
2116
2117 // Returns true if this chain contains any increments.
2118 bool hasIncs() const { return Incs.size() >= 2; }
2119
2120 // Add an IVInc to the end of this chain.
2121 void add(const IVInc &X) { Incs.push_back(X); }
2122
2123 // Returns the last UserInst in the chain.
2124 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2125
2126 // Returns true if IncExpr can be profitably added to this chain.
2127 bool isProfitableIncrement(const SCEV *OperExpr,
2128 const SCEV *IncExpr,
2130};
2131
2132/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2133/// between FarUsers that definitely cross IV increments and NearUsers that may
2134/// be used between IV increments.
2135struct ChainUsers {
2138};
2139
2140/// This class holds state for the main loop strength reduction logic.
2141class LSRInstance {
2142 IVUsers &IU;
2143 ScalarEvolution &SE;
2144 DominatorTree &DT;
2145 LoopInfo &LI;
2146 AssumptionCache &AC;
2147 TargetLibraryInfo &TLI;
2148 const TargetTransformInfo &TTI;
2149 Loop *const L;
2150 MemorySSAUpdater *MSSAU;
2152 mutable SCEVExpander Rewriter;
2153 bool Changed = false;
2154
2155 /// This is the insert position that the current loop's induction variable
2156 /// increment should be placed. In simple loops, this is the latch block's
2157 /// terminator. But in more complicated cases, this is a position which will
2158 /// dominate all the in-loop post-increment users.
2159 Instruction *IVIncInsertPos = nullptr;
2160
2161 /// Interesting factors between use strides.
2162 ///
2163 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2164 /// default, a SmallDenseSet, because we need to use the full range of
2165 /// int64_ts, and there's currently no good way of doing that with
2166 /// SmallDenseSet.
2168
2169 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2170 /// the solution is not profitable.
2171 Cost BaselineCost;
2172
2173 /// Interesting use types, to facilitate truncation reuse.
2175
2176 /// The list of interesting uses.
2178
2179 /// Track which uses use which register candidates.
2180 RegUseTracker RegUses;
2181
2182 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2183 // have more than a few IV increment chains in a loop. Missing a Chain falls
2184 // back to normal LSR behavior for those uses.
2185 static const unsigned MaxChains = 8;
2186
2187 /// IV users can form a chain of IV increments.
2189
2190 /// IV users that belong to profitable IVChains.
2192
2193 /// Induction variables that were generated and inserted by the SCEV Expander.
2194 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2195
2196 void OptimizeShadowIV();
2197 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2198 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2199 void OptimizeLoopTermCond();
2200
2201 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2202 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2203 void FinalizeChain(IVChain &Chain);
2204 void CollectChains();
2205 void GenerateIVChain(const IVChain &Chain,
2207
2208 void CollectInterestingTypesAndFactors();
2209 void CollectFixupsAndInitialFormulae();
2210
2211 // Support for sharing of LSRUses between LSRFixups.
2213 UseMapTy UseMap;
2214
2215 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2216 LSRUse::KindType Kind, MemAccessTy AccessTy);
2217
2218 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2219 MemAccessTy AccessTy);
2220
2221 void DeleteUse(LSRUse &LU, size_t LUIdx);
2222
2223 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2224
2225 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2226 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2227 void CountRegisters(const Formula &F, size_t LUIdx);
2228 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2229
2230 void CollectLoopInvariantFixupsAndFormulae();
2231
2232 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2233 unsigned Depth = 0);
2234
2235 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2236 const Formula &Base, unsigned Depth,
2237 size_t Idx, bool IsScaledReg = false);
2238 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2239 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2240 const Formula &Base, size_t Idx,
2241 bool IsScaledReg = false);
2242 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2243 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2244 const Formula &Base,
2245 const SmallVectorImpl<Immediate> &Worklist,
2246 size_t Idx, bool IsScaledReg = false);
2247 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2248 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2249 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2250 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2251 void GenerateCrossUseConstantOffsets();
2252 void GenerateAllReuseFormulae();
2253
2254 void FilterOutUndesirableDedicatedRegisters();
2255
2256 size_t EstimateSearchSpaceComplexity() const;
2257 void NarrowSearchSpaceByDetectingSupersets();
2258 void NarrowSearchSpaceByCollapsingUnrolledCode();
2259 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2260 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2261 void NarrowSearchSpaceByFilterPostInc();
2262 void NarrowSearchSpaceByDeletingCostlyFormulas();
2263 void NarrowSearchSpaceByPickingWinnerRegs();
2264 void NarrowSearchSpaceUsingHeuristics();
2265
2266 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2267 Cost &SolutionCost,
2269 const Cost &CurCost,
2270 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2271 DenseSet<const SCEV *> &VisitedRegs) const;
2272 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2273
2275 HoistInsertPosition(BasicBlock::iterator IP,
2276 const SmallVectorImpl<Instruction *> &Inputs) const;
2277 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2278 const LSRFixup &LF,
2279 const LSRUse &LU) const;
2280
2281 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2283 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2284 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2285 const Formula &F,
2286 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2287 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2288 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2289 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2290
2291public:
2292 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2294 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2295
2296 bool getChanged() const { return Changed; }
2297 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2298 return ScalarEvolutionIVs;
2299 }
2300
2301 void print_factors_and_types(raw_ostream &OS) const;
2302 void print_fixups(raw_ostream &OS) const;
2303 void print_uses(raw_ostream &OS) const;
2304 void print(raw_ostream &OS) const;
2305 void dump() const;
2306};
2307
2308} // end anonymous namespace
2309
2310/// If IV is used in a int-to-float cast inside the loop then try to eliminate
2311/// the cast operation.
2312void LSRInstance::OptimizeShadowIV() {
2313 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2314 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2315 return;
2316
2317 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2318 UI != E; /* empty */) {
2319 IVUsers::const_iterator CandidateUI = UI;
2320 ++UI;
2321 Instruction *ShadowUse = CandidateUI->getUser();
2322 Type *DestTy = nullptr;
2323 bool IsSigned = false;
2324
2325 /* If shadow use is a int->float cast then insert a second IV
2326 to eliminate this cast.
2327
2328 for (unsigned i = 0; i < n; ++i)
2329 foo((double)i);
2330
2331 is transformed into
2332
2333 double d = 0.0;
2334 for (unsigned i = 0; i < n; ++i, ++d)
2335 foo(d);
2336 */
2337 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2338 IsSigned = false;
2339 DestTy = UCast->getDestTy();
2340 }
2341 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2342 IsSigned = true;
2343 DestTy = SCast->getDestTy();
2344 }
2345 if (!DestTy) continue;
2346
2347 // If target does not support DestTy natively then do not apply
2348 // this transformation.
2349 if (!TTI.isTypeLegal(DestTy)) continue;
2350
2351 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2352 if (!PH) continue;
2353 if (PH->getNumIncomingValues() != 2) continue;
2354
2355 // If the calculation in integers overflows, the result in FP type will
2356 // differ. So we only can do this transformation if we are guaranteed to not
2357 // deal with overflowing values
2358 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2359 if (!AR) continue;
2360 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2361 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2362
2363 Type *SrcTy = PH->getType();
2364 int Mantissa = DestTy->getFPMantissaWidth();
2365 if (Mantissa == -1) continue;
2366 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2367 continue;
2368
2369 unsigned Entry, Latch;
2370 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2371 Entry = 0;
2372 Latch = 1;
2373 } else {
2374 Entry = 1;
2375 Latch = 0;
2376 }
2377
2378 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2379 if (!Init) continue;
2380 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2381 (double)Init->getSExtValue() :
2382 (double)Init->getZExtValue());
2383
2384 BinaryOperator *Incr =
2385 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2386 if (!Incr) continue;
2387 if (Incr->getOpcode() != Instruction::Add
2388 && Incr->getOpcode() != Instruction::Sub)
2389 continue;
2390
2391 /* Initialize new IV, double d = 0.0 in above example. */
2392 ConstantInt *C = nullptr;
2393 if (Incr->getOperand(0) == PH)
2394 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2395 else if (Incr->getOperand(1) == PH)
2396 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2397 else
2398 continue;
2399
2400 if (!C) continue;
2401
2402 // Ignore negative constants, as the code below doesn't handle them
2403 // correctly. TODO: Remove this restriction.
2404 if (!C->getValue().isStrictlyPositive())
2405 continue;
2406
2407 /* Add new PHINode. */
2408 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2409 NewPH->setDebugLoc(PH->getDebugLoc());
2410
2411 /* create new increment. '++d' in above example. */
2412 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2414 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2415 : Instruction::FSub,
2416 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2417 NewIncr->setDebugLoc(Incr->getDebugLoc());
2418
2419 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2420 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2421
2422 /* Remove cast operation */
2423 ShadowUse->replaceAllUsesWith(NewPH);
2424 ShadowUse->eraseFromParent();
2425 Changed = true;
2426 break;
2427 }
2428}
2429
2430/// If Cond has an operand that is an expression of an IV, set the IV user and
2431/// stride information and return true, otherwise return false.
2432bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2433 for (IVStrideUse &U : IU)
2434 if (U.getUser() == Cond) {
2435 // NOTE: we could handle setcc instructions with multiple uses here, but
2436 // InstCombine does it as well for simple uses, it's not clear that it
2437 // occurs enough in real life to handle.
2438 CondUse = &U;
2439 return true;
2440 }
2441 return false;
2442}
2443
2444/// Rewrite the loop's terminating condition if it uses a max computation.
2445///
2446/// This is a narrow solution to a specific, but acute, problem. For loops
2447/// like this:
2448///
2449/// i = 0;
2450/// do {
2451/// p[i] = 0.0;
2452/// } while (++i < n);
2453///
2454/// the trip count isn't just 'n', because 'n' might not be positive. And
2455/// unfortunately this can come up even for loops where the user didn't use
2456/// a C do-while loop. For example, seemingly well-behaved top-test loops
2457/// will commonly be lowered like this:
2458///
2459/// if (n > 0) {
2460/// i = 0;
2461/// do {
2462/// p[i] = 0.0;
2463/// } while (++i < n);
2464/// }
2465///
2466/// and then it's possible for subsequent optimization to obscure the if
2467/// test in such a way that indvars can't find it.
2468///
2469/// When indvars can't find the if test in loops like this, it creates a
2470/// max expression, which allows it to give the loop a canonical
2471/// induction variable:
2472///
2473/// i = 0;
2474/// max = n < 1 ? 1 : n;
2475/// do {
2476/// p[i] = 0.0;
2477/// } while (++i != max);
2478///
2479/// Canonical induction variables are necessary because the loop passes
2480/// are designed around them. The most obvious example of this is the
2481/// LoopInfo analysis, which doesn't remember trip count values. It
2482/// expects to be able to rediscover the trip count each time it is
2483/// needed, and it does this using a simple analysis that only succeeds if
2484/// the loop has a canonical induction variable.
2485///
2486/// However, when it comes time to generate code, the maximum operation
2487/// can be quite costly, especially if it's inside of an outer loop.
2488///
2489/// This function solves this problem by detecting this type of loop and
2490/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2491/// the instructions for the maximum computation.
2492ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2493 // Check that the loop matches the pattern we're looking for.
2494 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2495 Cond->getPredicate() != CmpInst::ICMP_NE)
2496 return Cond;
2497
2498 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2499 if (!Sel || !Sel->hasOneUse()) return Cond;
2500
2501 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2502 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2503 return Cond;
2504 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2505
2506 // Add one to the backedge-taken count to get the trip count.
2507 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2508 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2509
2510 // Check for a max calculation that matches the pattern. There's no check
2511 // for ICMP_ULE here because the comparison would be with zero, which
2512 // isn't interesting.
2513 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2514 const SCEVNAryExpr *Max = nullptr;
2515 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2516 Pred = ICmpInst::ICMP_SLE;
2517 Max = S;
2518 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2519 Pred = ICmpInst::ICMP_SLT;
2520 Max = S;
2521 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2522 Pred = ICmpInst::ICMP_ULT;
2523 Max = U;
2524 } else {
2525 // No match; bail.
2526 return Cond;
2527 }
2528
2529 // To handle a max with more than two operands, this optimization would
2530 // require additional checking and setup.
2531 if (Max->getNumOperands() != 2)
2532 return Cond;
2533
2534 const SCEV *MaxLHS = Max->getOperand(0);
2535 const SCEV *MaxRHS = Max->getOperand(1);
2536
2537 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2538 // for a comparison with 1. For <= and >=, a comparison with zero.
2539 if (!MaxLHS ||
2540 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2541 return Cond;
2542
2543 // Check the relevant induction variable for conformance to
2544 // the pattern.
2545 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2546 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2547 if (!AR || !AR->isAffine() ||
2548 AR->getStart() != One ||
2549 AR->getStepRecurrence(SE) != One)
2550 return Cond;
2551
2552 assert(AR->getLoop() == L &&
2553 "Loop condition operand is an addrec in a different loop!");
2554
2555 // Check the right operand of the select, and remember it, as it will
2556 // be used in the new comparison instruction.
2557 Value *NewRHS = nullptr;
2558 if (ICmpInst::isTrueWhenEqual(Pred)) {
2559 // Look for n+1, and grab n.
2560 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2561 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2562 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2563 NewRHS = BO->getOperand(0);
2564 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2565 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2566 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2567 NewRHS = BO->getOperand(0);
2568 if (!NewRHS)
2569 return Cond;
2570 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2571 NewRHS = Sel->getOperand(1);
2572 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2573 NewRHS = Sel->getOperand(2);
2574 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2575 NewRHS = SU->getValue();
2576 else
2577 // Max doesn't match expected pattern.
2578 return Cond;
2579
2580 // Determine the new comparison opcode. It may be signed or unsigned,
2581 // and the original comparison may be either equality or inequality.
2582 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2583 Pred = CmpInst::getInversePredicate(Pred);
2584
2585 // Ok, everything looks ok to change the condition into an SLT or SGE and
2586 // delete the max calculation.
2587 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2588 Cond->getOperand(0), NewRHS, "scmp");
2589
2590 // Delete the max calculation instructions.
2591 NewCond->setDebugLoc(Cond->getDebugLoc());
2592 Cond->replaceAllUsesWith(NewCond);
2593 CondUse->setUser(NewCond);
2594 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2595 Cond->eraseFromParent();
2596 Sel->eraseFromParent();
2597 if (Cmp->use_empty())
2598 Cmp->eraseFromParent();
2599 return NewCond;
2600}
2601
2602/// Change loop terminating condition to use the postinc iv when possible.
2603void
2604LSRInstance::OptimizeLoopTermCond() {
2606
2607 // We need a different set of heuristics for rotated and non-rotated loops.
2608 // If a loop is rotated then the latch is also the backedge, so inserting
2609 // post-inc expressions just before the latch is ideal. To reduce live ranges
2610 // it also makes sense to rewrite terminating conditions to use post-inc
2611 // expressions.
2612 //
2613 // If the loop is not rotated then the latch is not a backedge; the latch
2614 // check is done in the loop head. Adding post-inc expressions before the
2615 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2616 // in the loop body. In this case we do *not* want to use post-inc expressions
2617 // in the latch check, and we want to insert post-inc expressions before
2618 // the backedge.
2619 BasicBlock *LatchBlock = L->getLoopLatch();
2620 SmallVector<BasicBlock*, 8> ExitingBlocks;
2621 L->getExitingBlocks(ExitingBlocks);
2622 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2623 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2624 IVIncInsertPos = LatchBlock->getTerminator();
2625 return;
2626 }
2627
2628 // Otherwise treat this as a rotated loop.
2629 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2630 // Get the terminating condition for the loop if possible. If we
2631 // can, we want to change it to use a post-incremented version of its
2632 // induction variable, to allow coalescing the live ranges for the IV into
2633 // one register value.
2634
2635 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2636 if (!TermBr)
2637 continue;
2638 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2639 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2640 continue;
2641
2642 // Search IVUsesByStride to find Cond's IVUse if there is one.
2643 IVStrideUse *CondUse = nullptr;
2644 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2645 if (!FindIVUserForCond(Cond, CondUse))
2646 continue;
2647
2648 // If the trip count is computed in terms of a max (due to ScalarEvolution
2649 // being unable to find a sufficient guard, for example), change the loop
2650 // comparison to use SLT or ULT instead of NE.
2651 // One consequence of doing this now is that it disrupts the count-down
2652 // optimization. That's not always a bad thing though, because in such
2653 // cases it may still be worthwhile to avoid a max.
2654 Cond = OptimizeMax(Cond, CondUse);
2655
2656 // If this exiting block dominates the latch block, it may also use
2657 // the post-inc value if it won't be shared with other uses.
2658 // Check for dominance.
2659 if (!DT.dominates(ExitingBlock, LatchBlock))
2660 continue;
2661
2662 // Conservatively avoid trying to use the post-inc value in non-latch
2663 // exits if there may be pre-inc users in intervening blocks.
2664 if (LatchBlock != ExitingBlock)
2665 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2666 // Test if the use is reachable from the exiting block. This dominator
2667 // query is a conservative approximation of reachability.
2668 if (&*UI != CondUse &&
2669 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2670 // Conservatively assume there may be reuse if the quotient of their
2671 // strides could be a legal scale.
2672 const SCEV *A = IU.getStride(*CondUse, L);
2673 const SCEV *B = IU.getStride(*UI, L);
2674 if (!A || !B) continue;
2675 if (SE.getTypeSizeInBits(A->getType()) !=
2676 SE.getTypeSizeInBits(B->getType())) {
2677 if (SE.getTypeSizeInBits(A->getType()) >
2678 SE.getTypeSizeInBits(B->getType()))
2679 B = SE.getSignExtendExpr(B, A->getType());
2680 else
2681 A = SE.getSignExtendExpr(A, B->getType());
2682 }
2683 if (const SCEVConstant *D =
2684 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2685 const ConstantInt *C = D->getValue();
2686 // Stride of one or negative one can have reuse with non-addresses.
2687 if (C->isOne() || C->isMinusOne())
2688 goto decline_post_inc;
2689 // Avoid weird situations.
2690 if (C->getValue().getSignificantBits() >= 64 ||
2691 C->getValue().isMinSignedValue())
2692 goto decline_post_inc;
2693 // Check for possible scaled-address reuse.
2694 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2695 MemAccessTy AccessTy = getAccessType(
2696 TTI, UI->getUser(), UI->getOperandValToReplace());
2697 int64_t Scale = C->getSExtValue();
2698 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2699 /*BaseOffset=*/0,
2700 /*HasBaseReg=*/true, Scale,
2701 AccessTy.AddrSpace))
2702 goto decline_post_inc;
2703 Scale = -Scale;
2704 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2705 /*BaseOffset=*/0,
2706 /*HasBaseReg=*/true, Scale,
2707 AccessTy.AddrSpace))
2708 goto decline_post_inc;
2709 }
2710 }
2711 }
2712
2713 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2714 << *Cond << '\n');
2715
2716 // It's possible for the setcc instruction to be anywhere in the loop, and
2717 // possible for it to have multiple users. If it is not immediately before
2718 // the exiting block branch, move it.
2719 if (Cond->getNextNonDebugInstruction() != TermBr) {
2720 if (Cond->hasOneUse()) {
2721 Cond->moveBefore(TermBr);
2722 } else {
2723 // Clone the terminating condition and insert into the loopend.
2724 ICmpInst *OldCond = Cond;
2725 Cond = cast<ICmpInst>(Cond->clone());
2726 Cond->setName(L->getHeader()->getName() + ".termcond");
2727 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2728
2729 // Clone the IVUse, as the old use still exists!
2730 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2731 TermBr->replaceUsesOfWith(OldCond, Cond);
2732 }
2733 }
2734
2735 // If we get to here, we know that we can transform the setcc instruction to
2736 // use the post-incremented version of the IV, allowing us to coalesce the
2737 // live ranges for the IV correctly.
2738 CondUse->transformToPostInc(L);
2739 Changed = true;
2740
2741 PostIncs.insert(Cond);
2742 decline_post_inc:;
2743 }
2744
2745 // Determine an insertion point for the loop induction variable increment. It
2746 // must dominate all the post-inc comparisons we just set up, and it must
2747 // dominate the loop latch edge.
2748 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2749 for (Instruction *Inst : PostIncs)
2750 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2751}
2752
2753/// Determine if the given use can accommodate a fixup at the given offset and
2754/// other details. If so, update the use and return true.
2755bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2756 bool HasBaseReg, LSRUse::KindType Kind,
2757 MemAccessTy AccessTy) {
2758 Immediate NewMinOffset = LU.MinOffset;
2759 Immediate NewMaxOffset = LU.MaxOffset;
2760 MemAccessTy NewAccessTy = AccessTy;
2761
2762 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2763 // something conservative, however this can pessimize in the case that one of
2764 // the uses will have all its uses outside the loop, for example.
2765 if (LU.Kind != Kind)
2766 return false;
2767
2768 // Check for a mismatched access type, and fall back conservatively as needed.
2769 // TODO: Be less conservative when the type is similar and can use the same
2770 // addressing modes.
2771 if (Kind == LSRUse::Address) {
2772 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2773 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2774 AccessTy.AddrSpace);
2775 }
2776 }
2777
2778 // Conservatively assume HasBaseReg is true for now.
2779 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2780 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2781 LU.MaxOffset - NewOffset, HasBaseReg))
2782 return false;
2783 NewMinOffset = NewOffset;
2784 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2785 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2786 NewOffset - LU.MinOffset, HasBaseReg))
2787 return false;
2788 NewMaxOffset = NewOffset;
2789 }
2790
2791 // FIXME: We should be able to handle some level of scalable offset support
2792 // for 'void', but in order to get basic support up and running this is
2793 // being left out.
2794 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2795 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2796 return false;
2797
2798 // Update the use.
2799 LU.MinOffset = NewMinOffset;
2800 LU.MaxOffset = NewMaxOffset;
2801 LU.AccessTy = NewAccessTy;
2802 return true;
2803}
2804
2805/// Return an LSRUse index and an offset value for a fixup which needs the given
2806/// expression, with the given kind and optional access type. Either reuse an
2807/// existing use or create a new one, as needed.
2808std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2809 LSRUse::KindType Kind,
2810 MemAccessTy AccessTy) {
2811 const SCEV *Copy = Expr;
2812 Immediate Offset = ExtractImmediate(Expr, SE);
2813
2814 // Basic uses can't accept any offset, for example.
2815 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2816 Offset, /*HasBaseReg=*/ true)) {
2817 Expr = Copy;
2818 Offset = Immediate::getFixed(0);
2819 }
2820
2821 std::pair<UseMapTy::iterator, bool> P =
2822 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2823 if (!P.second) {
2824 // A use already existed with this base.
2825 size_t LUIdx = P.first->second;
2826 LSRUse &LU = Uses[LUIdx];
2827 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2828 // Reuse this use.
2829 return std::make_pair(LUIdx, Offset);
2830 }
2831
2832 // Create a new use.
2833 size_t LUIdx = Uses.size();
2834 P.first->second = LUIdx;
2835 Uses.push_back(LSRUse(Kind, AccessTy));
2836 LSRUse &LU = Uses[LUIdx];
2837
2838 LU.MinOffset = Offset;
2839 LU.MaxOffset = Offset;
2840 return std::make_pair(LUIdx, Offset);
2841}
2842
2843/// Delete the given use from the Uses list.
2844void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2845 if (&LU != &Uses.back())
2846 std::swap(LU, Uses.back());
2847 Uses.pop_back();
2848
2849 // Update RegUses.
2850 RegUses.swapAndDropUse(LUIdx, Uses.size());
2851}
2852
2853/// Look for a use distinct from OrigLU which is has a formula that has the same
2854/// registers as the given formula.
2855LSRUse *
2856LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2857 const LSRUse &OrigLU) {
2858 // Search all uses for the formula. This could be more clever.
2859 for (LSRUse &LU : Uses) {
2860 // Check whether this use is close enough to OrigLU, to see whether it's
2861 // worthwhile looking through its formulae.
2862 // Ignore ICmpZero uses because they may contain formulae generated by
2863 // GenerateICmpZeroScales, in which case adding fixup offsets may
2864 // be invalid.
2865 if (&LU != &OrigLU &&
2866 LU.Kind != LSRUse::ICmpZero &&
2867 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2868 LU.WidestFixupType == OrigLU.WidestFixupType &&
2869 LU.HasFormulaWithSameRegs(OrigF)) {
2870 // Scan through this use's formulae.
2871 for (const Formula &F : LU.Formulae) {
2872 // Check to see if this formula has the same registers and symbols
2873 // as OrigF.
2874 if (F.BaseRegs == OrigF.BaseRegs &&
2875 F.ScaledReg == OrigF.ScaledReg &&
2876 F.BaseGV == OrigF.BaseGV &&
2877 F.Scale == OrigF.Scale &&
2878 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2879 if (F.BaseOffset.isZero())
2880 return &LU;
2881 // This is the formula where all the registers and symbols matched;
2882 // there aren't going to be any others. Since we declined it, we
2883 // can skip the rest of the formulae and proceed to the next LSRUse.
2884 break;
2885 }
2886 }
2887 }
2888 }
2889
2890 // Nothing looked good.
2891 return nullptr;
2892}
2893
2894void LSRInstance::CollectInterestingTypesAndFactors() {
2896
2897 // Collect interesting types and strides.
2899 for (const IVStrideUse &U : IU) {
2900 const SCEV *Expr = IU.getExpr(U);
2901 if (!Expr)
2902 continue;
2903
2904 // Collect interesting types.
2905 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2906
2907 // Add strides for mentioned loops.
2908 Worklist.push_back(Expr);
2909 do {
2910 const SCEV *S = Worklist.pop_back_val();
2911 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2912 if (AR->getLoop() == L)
2913 Strides.insert(AR->getStepRecurrence(SE));
2914 Worklist.push_back(AR->getStart());
2915 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2916 append_range(Worklist, Add->operands());
2917 }
2918 } while (!Worklist.empty());
2919 }
2920
2921 // Compute interesting factors from the set of interesting strides.
2923 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2925 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2926 const SCEV *OldStride = *I;
2927 const SCEV *NewStride = *NewStrideIter;
2928
2929 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2930 SE.getTypeSizeInBits(NewStride->getType())) {
2931 if (SE.getTypeSizeInBits(OldStride->getType()) >
2932 SE.getTypeSizeInBits(NewStride->getType()))
2933 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2934 else
2935 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2936 }
2937 if (const SCEVConstant *Factor =
2938 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2939 SE, true))) {
2940 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2941 Factors.insert(Factor->getAPInt().getSExtValue());
2942 } else if (const SCEVConstant *Factor =
2943 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2944 NewStride,
2945 SE, true))) {
2946 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2947 Factors.insert(Factor->getAPInt().getSExtValue());
2948 }
2949 }
2950
2951 // If all uses use the same type, don't bother looking for truncation-based
2952 // reuse.
2953 if (Types.size() == 1)
2954 Types.clear();
2955
2956 LLVM_DEBUG(print_factors_and_types(dbgs()));
2957}
2958
2959/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2960/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2961/// IVStrideUses, we could partially skip this.
2962static User::op_iterator
2964 Loop *L, ScalarEvolution &SE) {
2965 for(; OI != OE; ++OI) {
2966 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2967 if (!SE.isSCEVable(Oper->getType()))
2968 continue;
2969
2970 if (const SCEVAddRecExpr *AR =
2971 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2972 if (AR->getLoop() == L)
2973 break;
2974 }
2975 }
2976 }
2977 return OI;
2978}
2979
2980/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2981/// a convenient helper.
2983 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2984 return Trunc->getOperand(0);
2985 return Oper;
2986}
2987
2988/// Return an approximation of this SCEV expression's "base", or NULL for any
2989/// constant. Returning the expression itself is conservative. Returning a
2990/// deeper subexpression is more precise and valid as long as it isn't less
2991/// complex than another subexpression. For expressions involving multiple
2992/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2993/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2994/// IVInc==b-a.
2995///
2996/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2997/// SCEVUnknown, we simply return the rightmost SCEV operand.
2998static const SCEV *getExprBase(const SCEV *S) {
2999 switch (S->getSCEVType()) {
3000 default: // including scUnknown.
3001 return S;
3002 case scConstant:
3003 case scVScale:
3004 return nullptr;
3005 case scTruncate:
3006 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3007 case scZeroExtend:
3008 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3009 case scSignExtend:
3010 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3011 case scAddExpr: {
3012 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3013 // there's nothing more complex.
3014 // FIXME: not sure if we want to recognize negation.
3015 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3016 for (const SCEV *SubExpr : reverse(Add->operands())) {
3017 if (SubExpr->getSCEVType() == scAddExpr)
3018 return getExprBase(SubExpr);
3019
3020 if (SubExpr->getSCEVType() != scMulExpr)
3021 return SubExpr;
3022 }
3023 return S; // all operands are scaled, be conservative.
3024 }
3025 case scAddRecExpr:
3026 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3027 }
3028 llvm_unreachable("Unknown SCEV kind!");
3029}
3030
3031/// Return true if the chain increment is profitable to expand into a loop
3032/// invariant value, which may require its own register. A profitable chain
3033/// increment will be an offset relative to the same base. We allow such offsets
3034/// to potentially be used as chain increment as long as it's not obviously
3035/// expensive to expand using real instructions.
3036bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3037 const SCEV *IncExpr,
3038 ScalarEvolution &SE) {
3039 // Aggressively form chains when -stress-ivchain.
3040 if (StressIVChain)
3041 return true;
3042
3043 // Do not replace a constant offset from IV head with a nonconstant IV
3044 // increment.
3045 if (!isa<SCEVConstant>(IncExpr)) {
3046 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3047 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3048 return false;
3049 }
3050
3052 return !isHighCostExpansion(IncExpr, Processed, SE);
3053}
3054
3055/// Return true if the number of registers needed for the chain is estimated to
3056/// be less than the number required for the individual IV users. First prohibit
3057/// any IV users that keep the IV live across increments (the Users set should
3058/// be empty). Next count the number and type of increments in the chain.
3059///
3060/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3061/// effectively use postinc addressing modes. Only consider it profitable it the
3062/// increments can be computed in fewer registers when chained.
3063///
3064/// TODO: Consider IVInc free if it's already used in another chains.
3065static bool isProfitableChain(IVChain &Chain,
3067 ScalarEvolution &SE,
3068 const TargetTransformInfo &TTI) {
3069 if (StressIVChain)
3070 return true;
3071
3072 if (!Chain.hasIncs())
3073 return false;
3074
3075 if (!Users.empty()) {
3076 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3077 for (Instruction *Inst
3078 : Users) { dbgs() << " " << *Inst << "\n"; });
3079 return false;
3080 }
3081 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3082
3083 // The chain itself may require a register, so intialize cost to 1.
3084 int cost = 1;
3085
3086 // A complete chain likely eliminates the need for keeping the original IV in
3087 // a register. LSR does not currently know how to form a complete chain unless
3088 // the header phi already exists.
3089 if (isa<PHINode>(Chain.tailUserInst())
3090 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3091 --cost;
3092 }
3093 const SCEV *LastIncExpr = nullptr;
3094 unsigned NumConstIncrements = 0;
3095 unsigned NumVarIncrements = 0;
3096 unsigned NumReusedIncrements = 0;
3097
3098 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3099 return true;
3100
3101 for (const IVInc &Inc : Chain) {
3102 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3103 return true;
3104 if (Inc.IncExpr->isZero())
3105 continue;
3106
3107 // Incrementing by zero or some constant is neutral. We assume constants can
3108 // be folded into an addressing mode or an add's immediate operand.
3109 if (isa<SCEVConstant>(Inc.IncExpr)) {
3110 ++NumConstIncrements;
3111 continue;
3112 }
3113
3114 if (Inc.IncExpr == LastIncExpr)
3115 ++NumReusedIncrements;
3116 else
3117 ++NumVarIncrements;
3118
3119 LastIncExpr = Inc.IncExpr;
3120 }
3121 // An IV chain with a single increment is handled by LSR's postinc
3122 // uses. However, a chain with multiple increments requires keeping the IV's
3123 // value live longer than it needs to be if chained.
3124 if (NumConstIncrements > 1)
3125 --cost;
3126
3127 // Materializing increment expressions in the preheader that didn't exist in
3128 // the original code may cost a register. For example, sign-extended array
3129 // indices can produce ridiculous increments like this:
3130 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3131 cost += NumVarIncrements;
3132
3133 // Reusing variable increments likely saves a register to hold the multiple of
3134 // the stride.
3135 cost -= NumReusedIncrements;
3136
3137 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3138 << "\n");
3139
3140 return cost < 0;
3141}
3142
3143/// Add this IV user to an existing chain or make it the head of a new chain.
3144void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3145 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3146 // When IVs are used as types of varying widths, they are generally converted
3147 // to a wider type with some uses remaining narrow under a (free) trunc.
3148 Value *const NextIV = getWideOperand(IVOper);
3149 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3150 const SCEV *const OperExprBase = getExprBase(OperExpr);
3151
3152 // Visit all existing chains. Check if its IVOper can be computed as a
3153 // profitable loop invariant increment from the last link in the Chain.
3154 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3155 const SCEV *LastIncExpr = nullptr;
3156 for (; ChainIdx < NChains; ++ChainIdx) {
3157 IVChain &Chain = IVChainVec[ChainIdx];
3158
3159 // Prune the solution space aggressively by checking that both IV operands
3160 // are expressions that operate on the same unscaled SCEVUnknown. This
3161 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3162 // first avoids creating extra SCEV expressions.
3163 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3164 continue;
3165
3166 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3167 if (PrevIV->getType() != NextIV->getType())
3168 continue;
3169
3170 // A phi node terminates a chain.
3171 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3172 continue;
3173
3174 // The increment must be loop-invariant so it can be kept in a register.
3175 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3176 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3177 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3178 continue;
3179
3180 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3181 LastIncExpr = IncExpr;
3182 break;
3183 }
3184 }
3185 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3186 // bother for phi nodes, because they must be last in the chain.
3187 if (ChainIdx == NChains) {
3188 if (isa<PHINode>(UserInst))
3189 return;
3190 if (NChains >= MaxChains && !StressIVChain) {
3191 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3192 return;
3193 }
3194 LastIncExpr = OperExpr;
3195 // IVUsers may have skipped over sign/zero extensions. We don't currently
3196 // attempt to form chains involving extensions unless they can be hoisted
3197 // into this loop's AddRec.
3198 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3199 return;
3200 ++NChains;
3201 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3202 OperExprBase));
3203 ChainUsersVec.resize(NChains);
3204 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3205 << ") IV=" << *LastIncExpr << "\n");
3206 } else {
3207 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3208 << ") IV+" << *LastIncExpr << "\n");
3209 // Add this IV user to the end of the chain.
3210 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3211 }
3212 IVChain &Chain = IVChainVec[ChainIdx];
3213
3214 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3215 // This chain's NearUsers become FarUsers.
3216 if (!LastIncExpr->isZero()) {
3217 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3218 NearUsers.end());
3219 NearUsers.clear();
3220 }
3221
3222 // All other uses of IVOperand become near uses of the chain.
3223 // We currently ignore intermediate values within SCEV expressions, assuming
3224 // they will eventually be used be the current chain, or can be computed
3225 // from one of the chain increments. To be more precise we could
3226 // transitively follow its user and only add leaf IV users to the set.
3227 for (User *U : IVOper->users()) {
3228 Instruction *OtherUse = dyn_cast<Instruction>(U);
3229 if (!OtherUse)
3230 continue;
3231 // Uses in the chain will no longer be uses if the chain is formed.
3232 // Include the head of the chain in this iteration (not Chain.begin()).
3233 IVChain::const_iterator IncIter = Chain.Incs.begin();
3234 IVChain::const_iterator IncEnd = Chain.Incs.end();
3235 for( ; IncIter != IncEnd; ++IncIter) {
3236 if (IncIter->UserInst == OtherUse)
3237 break;
3238 }
3239 if (IncIter != IncEnd)
3240 continue;
3241
3242 if (SE.isSCEVable(OtherUse->getType())
3243 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3244 && IU.isIVUserOrOperand(OtherUse)) {
3245 continue;
3246 }
3247 NearUsers.insert(OtherUse);
3248 }
3249
3250 // Since this user is part of the chain, it's no longer considered a use
3251 // of the chain.
3252 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3253}
3254
3255/// Populate the vector of Chains.
3256///
3257/// This decreases ILP at the architecture level. Targets with ample registers,
3258/// multiple memory ports, and no register renaming probably don't want
3259/// this. However, such targets should probably disable LSR altogether.
3260///
3261/// The job of LSR is to make a reasonable choice of induction variables across
3262/// the loop. Subsequent passes can easily "unchain" computation exposing more
3263/// ILP *within the loop* if the target wants it.
3264///
3265/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3266/// will not reorder memory operations, it will recognize this as a chain, but
3267/// will generate redundant IV increments. Ideally this would be corrected later
3268/// by a smart scheduler:
3269/// = A[i]
3270/// = A[i+x]
3271/// A[i] =
3272/// A[i+x] =
3273///
3274/// TODO: Walk the entire domtree within this loop, not just the path to the
3275/// loop latch. This will discover chains on side paths, but requires
3276/// maintaining multiple copies of the Chains state.
3277void LSRInstance::CollectChains() {
3278 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3279 SmallVector<ChainUsers, 8> ChainUsersVec;
3280
3282 BasicBlock *LoopHeader = L->getHeader();
3283 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3284 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3285 LatchPath.push_back(Rung->getBlock());
3286 }
3287 LatchPath.push_back(LoopHeader);
3288
3289 // Walk the instruction stream from the loop header to the loop latch.
3290 for (BasicBlock *BB : reverse(LatchPath)) {
3291 for (Instruction &I : *BB) {
3292 // Skip instructions that weren't seen by IVUsers analysis.
3293 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3294 continue;
3295
3296 // Ignore users that are part of a SCEV expression. This way we only
3297 // consider leaf IV Users. This effectively rediscovers a portion of
3298 // IVUsers analysis but in program order this time.
3299 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3300 continue;
3301
3302 // Remove this instruction from any NearUsers set it may be in.
3303 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3304 ChainIdx < NChains; ++ChainIdx) {
3305 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3306 }
3307 // Search for operands that can be chained.
3308 SmallPtrSet<Instruction*, 4> UniqueOperands;
3309 User::op_iterator IVOpEnd = I.op_end();
3310 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3311 while (IVOpIter != IVOpEnd) {
3312 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3313 if (UniqueOperands.insert(IVOpInst).second)
3314 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3315 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3316 }
3317 } // Continue walking down the instructions.
3318 } // Continue walking down the domtree.
3319 // Visit phi backedges to determine if the chain can generate the IV postinc.
3320 for (PHINode &PN : L->getHeader()->phis()) {
3321 if (!SE.isSCEVable(PN.getType()))
3322 continue;
3323
3324 Instruction *IncV =
3325 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3326 if (IncV)
3327 ChainInstruction(&PN, IncV, ChainUsersVec);
3328 }
3329 // Remove any unprofitable chains.
3330 unsigned ChainIdx = 0;
3331 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3332 UsersIdx < NChains; ++UsersIdx) {
3333 if (!isProfitableChain(IVChainVec[UsersIdx],
3334 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3335 continue;
3336 // Preserve the chain at UsesIdx.
3337 if (ChainIdx != UsersIdx)
3338 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3339 FinalizeChain(IVChainVec[ChainIdx]);
3340 ++ChainIdx;
3341 }
3342 IVChainVec.resize(ChainIdx);
3343}
3344
3345void LSRInstance::FinalizeChain(IVChain &Chain) {
3346 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3347 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3348
3349 for (const IVInc &Inc : Chain) {
3350 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3351 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3352 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3353 IVIncSet.insert(UseI);
3354 }
3355}
3356
3357/// Return true if the IVInc can be folded into an addressing mode.
3358static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3359 Value *Operand, const TargetTransformInfo &TTI) {
3360 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3361 Immediate IncOffset = Immediate::getZero();
3362 if (IncConst) {
3363 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3364 return false;
3365 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3366 } else {
3367 // Look for mul(vscale, constant), to detect a scalable offset.
3368 auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3369 if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3370 !isa<SCEVVScale>(IncVScale->getOperand(1)))
3371 return false;
3372 auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3373 if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3374 return false;
3375 IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
3376 }
3377
3378 if (!isAddressUse(TTI, UserInst, Operand))
3379 return false;
3380
3381 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3382 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3383 IncOffset, /*HasBaseReg=*/false))
3384 return false;
3385
3386 return true;
3387}
3388
3389/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3390/// user's operand from the previous IV user's operand.
3391void LSRInstance::GenerateIVChain(const IVChain &Chain,
3393 // Find the new IVOperand for the head of the chain. It may have been replaced
3394 // by LSR.
3395 const IVInc &Head = Chain.Incs[0];
3396 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3397 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3398 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3399 IVOpEnd, L, SE);
3400 Value *IVSrc = nullptr;
3401 while (IVOpIter != IVOpEnd) {
3402 IVSrc = getWideOperand(*IVOpIter);
3403
3404 // If this operand computes the expression that the chain needs, we may use
3405 // it. (Check this after setting IVSrc which is used below.)
3406 //
3407 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3408 // narrow for the chain, so we can no longer use it. We do allow using a
3409 // wider phi, assuming the LSR checked for free truncation. In that case we
3410 // should already have a truncate on this operand such that
3411 // getSCEV(IVSrc) == IncExpr.
3412 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3413 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3414 break;
3415 }
3416 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3417 }
3418 if (IVOpIter == IVOpEnd) {
3419 // Gracefully give up on this chain.
3420 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3421 return;
3422 }
3423 assert(IVSrc && "Failed to find IV chain source");
3424
3425 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3426 Type *IVTy = IVSrc->getType();
3427 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3428 const SCEV *LeftOverExpr = nullptr;
3429 const SCEV *Accum = SE.getZero(IntTy);
3431 Bases.emplace_back(Accum, IVSrc);
3432
3433 for (const IVInc &Inc : Chain) {
3434 Instruction *InsertPt = Inc.UserInst;
3435 if (isa<PHINode>(InsertPt))
3436 InsertPt = L->getLoopLatch()->getTerminator();
3437
3438 // IVOper will replace the current IV User's operand. IVSrc is the IV
3439 // value currently held in a register.
3440 Value *IVOper = IVSrc;
3441 if (!Inc.IncExpr->isZero()) {
3442 // IncExpr was the result of subtraction of two narrow values, so must
3443 // be signed.
3444 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3445 Accum = SE.getAddExpr(Accum, IncExpr);
3446 LeftOverExpr = LeftOverExpr ?
3447 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3448 }
3449
3450 // Look through each base to see if any can produce a nice addressing mode.
3451 bool FoundBase = false;
3452 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3453 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3454 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3455 if (!Remainder->isZero()) {
3456 Rewriter.clearPostInc();
3457 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3458 const SCEV *IVOperExpr =
3459 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3460 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3461 } else {
3462 IVOper = MapIVOper;
3463 }
3464
3465 FoundBase = true;
3466 break;
3467 }
3468 }
3469 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3470 // Expand the IV increment.
3471 Rewriter.clearPostInc();
3472 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3473 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3474 SE.getUnknown(IncV));
3475 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3476
3477 // If an IV increment can't be folded, use it as the next IV value.
3478 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3479 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3480 Bases.emplace_back(Accum, IVOper);
3481 IVSrc = IVOper;
3482 LeftOverExpr = nullptr;
3483 }
3484 }
3485 Type *OperTy = Inc.IVOperand->getType();
3486 if (IVTy != OperTy) {
3487 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3488 "cannot extend a chained IV");
3489 IRBuilder<> Builder(InsertPt);
3490 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3491 }
3492 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3493 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3494 DeadInsts.emplace_back(OperandIsInstr);
3495 }
3496 // If LSR created a new, wider phi, we may also replace its postinc. We only
3497 // do this if we also found a wide value for the head of the chain.
3498 if (isa<PHINode>(Chain.tailUserInst())) {
3499 for (PHINode &Phi : L->getHeader()->phis()) {
3500 if (Phi.getType() != IVSrc->getType())
3501 continue;
3502 Instruction *PostIncV = dyn_cast<Instruction>(
3503 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3504 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3505 continue;
3506 Value *IVOper = IVSrc;
3507 Type *PostIncTy = PostIncV->getType();
3508 if (IVTy != PostIncTy) {
3509 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3510 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3511 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3512 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3513 }
3514 Phi.replaceUsesOfWith(PostIncV, IVOper);
3515 DeadInsts.emplace_back(PostIncV);
3516 }
3517 }
3518}
3519
3520void LSRInstance::CollectFixupsAndInitialFormulae() {
3521 BranchInst *ExitBranch = nullptr;
3522 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3523
3524 // For calculating baseline cost
3526 DenseSet<const SCEV *> VisitedRegs;
3527 DenseSet<size_t> VisitedLSRUse;
3528
3529 for (const IVStrideUse &U : IU) {
3530 Instruction *UserInst = U.getUser();
3531 // Skip IV users that are part of profitable IV Chains.
3532 User::op_iterator UseI =
3533 find(UserInst->operands(), U.getOperandValToReplace());
3534 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3535 if (IVIncSet.count(UseI)) {
3536 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3537 continue;
3538 }
3539
3540 LSRUse::KindType Kind = LSRUse::Basic;
3541 MemAccessTy AccessTy;
3542 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3543 Kind = LSRUse::Address;
3544 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3545 }
3546
3547 const SCEV *S = IU.getExpr(U);
3548 if (!S)
3549 continue;
3550 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3551
3552 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3553 // (N - i == 0), and this allows (N - i) to be the expression that we work
3554 // with rather than just N or i, so we can consider the register
3555 // requirements for both N and i at the same time. Limiting this code to
3556 // equality icmps is not a problem because all interesting loops use
3557 // equality icmps, thanks to IndVarSimplify.
3558 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3559 // If CI can be saved in some target, like replaced inside hardware loop
3560 // in PowerPC, no need to generate initial formulae for it.
3561 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3562 continue;
3563 if (CI->isEquality()) {
3564 // Swap the operands if needed to put the OperandValToReplace on the
3565 // left, for consistency.
3566 Value *NV = CI->getOperand(1);
3567 if (NV == U.getOperandValToReplace()) {
3568 CI->setOperand(1, CI->getOperand(0));
3569 CI->setOperand(0, NV);
3570 NV = CI->getOperand(1);
3571 Changed = true;
3572 }
3573
3574 // x == y --> x - y == 0
3575 const SCEV *N = SE.getSCEV(NV);
3576 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3577 (!NV->getType()->isPointerTy() ||
3578 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3579 // S is normalized, so normalize N before folding it into S
3580 // to keep the result normalized.
3581 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3582 if (!N)
3583 continue;
3584 Kind = LSRUse::ICmpZero;
3585 S = SE.getMinusSCEV(N, S);
3586 } else if (L->isLoopInvariant(NV) &&
3587 (!isa<Instruction>(NV) ||
3588 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3589 !NV->getType()->isPointerTy()) {
3590 // If we can't generally expand the expression (e.g. it contains
3591 // a divide), but it is already at a loop invariant point before the
3592 // loop, wrap it in an unknown (to prevent the expander from trying
3593 // to re-expand in a potentially unsafe way.) The restriction to
3594 // integer types is required because the unknown hides the base, and
3595 // SCEV can't compute the difference of two unknown pointers.
3596 N = SE.getUnknown(NV);
3597 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3598 if (!N)
3599 continue;
3600 Kind = LSRUse::ICmpZero;
3601 S = SE.getMinusSCEV(N, S);
3602 assert(!isa<SCEVCouldNotCompute>(S));
3603 }
3604
3605 // -1 and the negations of all interesting strides (except the negation
3606 // of -1) are now also interesting.
3607 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3608 if (Factors[i] != -1)
3609 Factors.insert(-(uint64_t)Factors[i]);
3610 Factors.insert(-1);
3611 }
3612 }
3613
3614 // Get or create an LSRUse.
3615 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3616 size_t LUIdx = P.first;
3617 Immediate Offset = P.second;
3618 LSRUse &LU = Uses[LUIdx];
3619
3620 // Record the fixup.
3621 LSRFixup &LF = LU.getNewFixup();
3622 LF.UserInst = UserInst;
3623 LF.OperandValToReplace = U.getOperandValToReplace();
3624 LF.PostIncLoops = TmpPostIncLoops;
3625 LF.Offset = Offset;
3626 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3627
3628 // Create SCEV as Formula for calculating baseline cost
3629 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3630 Formula F;
3631 F.initialMatch(S, L, SE);
3632 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3633 VisitedLSRUse.insert(LUIdx);
3634 }
3635
3636 if (!LU.WidestFixupType ||
3637 SE.getTypeSizeInBits(LU.WidestFixupType) <
3638 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3639 LU.WidestFixupType = LF.OperandValToReplace->getType();
3640
3641 // If this is the first use of this LSRUse, give it a formula.
3642 if (LU.Formulae.empty()) {
3643 InsertInitialFormula(S, LU, LUIdx);
3644 CountRegisters(LU.Formulae.back(), LUIdx);
3645 }
3646 }
3647
3648 LLVM_DEBUG(print_fixups(dbgs()));
3649}
3650
3651/// Insert a formula for the given expression into the given use, separating out
3652/// loop-variant portions from loop-invariant and loop-computable portions.
3653void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3654 size_t LUIdx) {
3655 // Mark uses whose expressions cannot be expanded.
3656 if (!Rewriter.isSafeToExpand(S))
3657 LU.RigidFormula = true;
3658
3659 Formula F;
3660 F.initialMatch(S, L, SE);
3661 bool Inserted = InsertFormula(LU, LUIdx, F);
3662 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3663}
3664
3665/// Insert a simple single-register formula for the given expression into the
3666/// given use.
3667void
3668LSRInstance::InsertSupplementalFormula(const SCEV *S,
3669 LSRUse &LU, size_t LUIdx) {
3670 Formula F;
3671 F.BaseRegs.push_back(S);
3672 F.HasBaseReg = true;
3673 bool Inserted = InsertFormula(LU, LUIdx, F);
3674 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3675}
3676
3677/// Note which registers are used by the given formula, updating RegUses.
3678void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3679 if (F.ScaledReg)
3680 RegUses.countRegister(F.ScaledReg, LUIdx);
3681 for (const SCEV *BaseReg : F.BaseRegs)
3682 RegUses.countRegister(BaseReg, LUIdx);
3683}
3684
3685/// If the given formula has not yet been inserted, add it to the list, and
3686/// return true. Return false otherwise.
3687bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3688 // Do not insert formula that we will not be able to expand.
3689 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3690 "Formula is illegal");
3691
3692 if (!LU.InsertFormula(F, *L))
3693 return false;
3694
3695 CountRegisters(F, LUIdx);
3696 return true;
3697}
3698
3699/// Check for other uses of loop-invariant values which we're tracking. These
3700/// other uses will pin these values in registers, making them less profitable
3701/// for elimination.
3702/// TODO: This currently misses non-constant addrec step registers.
3703/// TODO: Should this give more weight to users inside the loop?
3704void
3705LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3706 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3708
3709 // Don't collect outside uses if we are favoring postinc - the instructions in
3710 // the loop are more important than the ones outside of it.
3711 if (AMK == TTI::AMK_PostIndexed)
3712 return;
3713
3714 while (!Worklist.empty()) {
3715 const SCEV *S = Worklist.pop_back_val();
3716
3717 // Don't process the same SCEV twice
3718 if (!Visited.insert(S).second)
3719 continue;
3720
3721 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3722 append_range(Worklist, N->operands());
3723 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3724 Worklist.push_back(C->getOperand());
3725 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3726 Worklist.push_back(D->getLHS());
3727 Worklist.push_back(D->getRHS());
3728 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3729 const Value *V = US->getValue();
3730 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3731 // Look for instructions defined outside the loop.
3732 if (L->contains(Inst)) continue;
3733 } else if (isa<Constant>(V))
3734 // Constants can be re-materialized.
3735 continue;
3736 for (const Use &U : V->uses()) {
3737 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3738 // Ignore non-instructions.
3739 if (!UserInst)
3740 continue;
3741 // Don't bother if the instruction is an EHPad.
3742 if (UserInst->isEHPad())
3743 continue;
3744 // Ignore instructions in other functions (as can happen with
3745 // Constants).
3746 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3747 continue;
3748 // Ignore instructions not dominated by the loop.
3749 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3750 UserInst->getParent() :
3751 cast<PHINode>(UserInst)->getIncomingBlock(
3753 if (!DT.dominates(L->getHeader(), UseBB))
3754 continue;
3755 // Don't bother if the instruction is in a BB which ends in an EHPad.
3756 if (UseBB->getTerminator()->isEHPad())
3757 continue;
3758
3759 // Ignore cases in which the currently-examined value could come from
3760 // a basic block terminated with an EHPad. This checks all incoming
3761 // blocks of the phi node since it is possible that the same incoming
3762 // value comes from multiple basic blocks, only some of which may end
3763 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3764 // pass would try to insert instructions into an EHPad, hitting an
3765 // assertion.
3766 if (isa<PHINode>(UserInst)) {
3767 const auto *PhiNode = cast<PHINode>(UserInst);
3768 bool HasIncompatibleEHPTerminatedBlock = false;
3769 llvm::Value *ExpectedValue = U;
3770 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3771 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3772 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3773 HasIncompatibleEHPTerminatedBlock = true;
3774 break;
3775 }
3776 }
3777 }
3778 if (HasIncompatibleEHPTerminatedBlock) {
3779 continue;
3780 }
3781 }
3782
3783 // Don't bother rewriting PHIs in catchswitch blocks.
3784 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3785 continue;
3786 // Ignore uses which are part of other SCEV expressions, to avoid
3787 // analyzing them multiple times.
3788 if (SE.isSCEVable(UserInst->getType())) {
3789 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3790 // If the user is a no-op, look through to its uses.
3791 if (!isa<SCEVUnknown>(UserS))
3792 continue;
3793 if (UserS == US) {
3794 Worklist.push_back(
3795 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3796 continue;
3797 }
3798 }
3799 // Ignore icmp instructions which are already being analyzed.
3800 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3801 unsigned OtherIdx = !U.getOperandNo();
3802 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3803 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3804 continue;
3805 }
3806
3807 std::pair<size_t, Immediate> P =
3808 getUse(S, LSRUse::Basic, MemAccessTy());
3809 size_t LUIdx = P.first;
3810 Immediate Offset = P.second;
3811 LSRUse &LU = Uses[LUIdx];
3812 LSRFixup &LF = LU.getNewFixup();
3813 LF.UserInst = const_cast<Instruction *>(UserInst);
3814 LF.OperandValToReplace = U;
3815 LF.Offset = Offset;
3816 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3817 if (!LU.WidestFixupType ||
3818 SE.getTypeSizeInBits(LU.WidestFixupType) <
3819 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3820 LU.WidestFixupType = LF.OperandValToReplace->getType();
3821 InsertSupplementalFormula(US, LU, LUIdx);
3822 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3823 break;
3824 }
3825 }
3826 }
3827}
3828
3829/// Split S into subexpressions which can be pulled out into separate
3830/// registers. If C is non-null, multiply each subexpression by C.
3831///
3832/// Return remainder expression after factoring the subexpressions captured by
3833/// Ops. If Ops is complete, return NULL.
3834static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3836 const Loop *L,
3837 ScalarEvolution &SE,
3838 unsigned Depth = 0) {
3839 // Arbitrarily cap recursion to protect compile time.
3840 if (Depth >= 3)
3841 return S;
3842
3843 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3844 // Break out add operands.
3845 for (const SCEV *S : Add->operands()) {
3846 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3847 if (Remainder)
3848 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3849 }
3850 return nullptr;
3851 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3852 // Split a non-zero base out of an addrec.
3853 if (AR->getStart()->isZero() || !AR->isAffine())
3854 return S;
3855
3856 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3857 C, Ops, L, SE, Depth+1);
3858 // Split the non-zero AddRec unless it is part of a nested recurrence that
3859 // does not pertain to this loop.
3860 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3861 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3862 Remainder = nullptr;
3863 }
3864 if (Remainder != AR->getStart()) {
3865 if (!Remainder)
3866 Remainder = SE.getConstant(AR->getType(), 0);
3867 return SE.getAddRecExpr(Remainder,
3868 AR->getStepRecurrence(SE),
3869 AR->getLoop(),
3870 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3872 }
3873 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3874 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3875 if (Mul->getNumOperands() != 2)
3876 return S;
3877 if (const SCEVConstant *Op0 =
3878 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3879 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3880 const SCEV *Remainder =
3881 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3882 if (Remainder)
3883 Ops.push_back(SE.getMulExpr(C, Remainder));
3884 return nullptr;
3885 }
3886 }
3887 return S;
3888}
3889
3890/// Return true if the SCEV represents a value that may end up as a
3891/// post-increment operation.
3893 LSRUse &LU, const SCEV *S, const Loop *L,
3894 ScalarEvolution &SE) {
3895 if (LU.Kind != LSRUse::Address ||
3896 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3897 return false;
3898 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3899 if (!AR)
3900 return false;
3901 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3902 if (!isa<SCEVConstant>(LoopStep))
3903 return false;
3904 // Check if a post-indexed load/store can be used.
3907 const SCEV *LoopStart = AR->getStart();
3908 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3909 return true;
3910 }
3911 return false;
3912}
3913
3914/// Helper function for LSRInstance::GenerateReassociations.
3915void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3916 const Formula &Base,
3917 unsigned Depth, size_t Idx,
3918 bool IsScaledReg) {
3919 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3920 // Don't generate reassociations for the base register of a value that
3921 // may generate a post-increment operator. The reason is that the
3922 // reassociations cause extra base+register formula to be created,
3923 // and possibly chosen, but the post-increment is more efficient.
3924 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3925 return;
3927 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3928 if (Remainder)
3929 AddOps.push_back(Remainder);
3930
3931 if (AddOps.size() == 1)
3932 return;
3933
3935 JE = AddOps.end();
3936 J != JE; ++J) {
3937 // Loop-variant "unknown" values are uninteresting; we won't be able to
3938 // do anything meaningful with them.
3939 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3940 continue;
3941
3942 // Don't pull a constant into a register if the constant could be folded
3943 // into an immediate field.
3944 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3945 LU.AccessTy, *J, Base.getNumRegs() > 1))
3946 continue;
3947
3948 // Collect all operands except *J.
3949 SmallVector<const SCEV *, 8> InnerAddOps(
3950 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3951 InnerAddOps.append(std::next(J),
3952 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3953
3954 // Don't leave just a constant behind in a register if the constant could
3955 // be folded into an immediate field.
3956 if (InnerAddOps.size() == 1 &&
3957 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3958 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3959 continue;
3960
3961 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3962 if (InnerSum->isZero())
3963 continue;
3964 Formula F = Base;
3965
3966 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3967 continue;
3968
3969 // Add the remaining pieces of the add back into the new formula.
3970 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3971 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3972 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3973 InnerSumSC->getValue()->getZExtValue())) {
3974 F.UnfoldedOffset =
3975 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3976 InnerSumSC->getValue()->getZExtValue());
3977 if (IsScaledReg)
3978 F.ScaledReg = nullptr;
3979 else
3980 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3981 } else if (IsScaledReg)
3982 F.ScaledReg = InnerSum;
3983 else
3984 F.BaseRegs[Idx] = InnerSum;
3985
3986 // Add J as its own register, or an unfolded immediate.
3987 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3988 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3989 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3990 SC->getValue()->getZExtValue()))
3991 F.UnfoldedOffset =
3992 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3993 SC->getValue()->getZExtValue());
3994 else
3995 F.BaseRegs.push_back(*J);
3996 // We may have changed the number of register in base regs, adjust the
3997 // formula accordingly.
3998 F.canonicalize(*L);
3999
4000 if (InsertFormula(LU, LUIdx, F))
4001 // If that formula hadn't been seen before, recurse to find more like
4002 // it.
4003 // Add check on Log16(AddOps.size()) - same as Log2_32(AddOps.size()) >> 2)
4004 // Because just Depth is not enough to bound compile time.
4005 // This means that every time AddOps.size() is greater 16^x we will add
4006 // x to Depth.
4007 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4008 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4009 }
4010}
4011
4012/// Split out subexpressions from adds and the bases of addrecs.
4013void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4014 Formula Base, unsigned Depth) {
4015 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4016 // Arbitrarily cap recursion to protect compile time.
4017 if (Depth >= 3)
4018 return;
4019
4020 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4021 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4022
4023 if (Base.Scale == 1)
4024 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4025 /* Idx */ -1, /* IsScaledReg */ true);
4026}
4027
4028/// Generate a formula consisting of all of the loop-dominating registers added
4029/// into a single register.
4030void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4031 Formula Base) {
4032 // This method is only interesting on a plurality of registers.
4033 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4034 (Base.UnfoldedOffset.isNonZero()) <=
4035 1)
4036 return;
4037
4038 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4039 // processing the formula.
4040 Base.unscale();
4042 Formula NewBase = Base;
4043 NewBase.BaseRegs.clear();
4044 Type *CombinedIntegerType = nullptr;
4045 for (const SCEV *BaseReg : Base.BaseRegs) {
4046 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4047 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4048 if (!CombinedIntegerType)
4049 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4050 Ops.push_back(BaseReg);
4051 }
4052 else
4053 NewBase.BaseRegs.push_back(BaseReg);
4054 }
4055
4056 // If no register is relevant, we're done.
4057 if (Ops.size() == 0)
4058 return;
4059
4060 // Utility function for generating the required variants of the combined
4061 // registers.
4062 auto GenerateFormula = [&](const SCEV *Sum) {
4063 Formula F = NewBase;
4064
4065 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4066 // opportunity to fold something. For now, just ignore such cases
4067 // rather than proceed with zero in a register.
4068 if (Sum->isZero())
4069 return;
4070
4071 F.BaseRegs.push_back(Sum);
4072 F.canonicalize(*L);
4073 (void)InsertFormula(LU, LUIdx, F);
4074 };
4075
4076 // If we collected at least two registers, generate a formula combining them.
4077 if (Ops.size() > 1) {
4078 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4079 GenerateFormula(SE.getAddExpr(OpsCopy));
4080 }
4081
4082 // If we have an unfolded offset, generate a formula combining it with the
4083 // registers collected.
4084 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4085 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4086 Ops.push_back(SE.getConstant(CombinedIntegerType,
4087 NewBase.UnfoldedOffset.getFixedValue(), true));
4088 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4089 GenerateFormula(SE.getAddExpr(Ops));
4090 }
4091}
4092
4093/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4094void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4095 const Formula &Base, size_t Idx,
4096 bool IsScaledReg) {
4097 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4098 GlobalValue *GV = ExtractSymbol(G, SE);
4099 if (G->isZero() || !GV)
4100 return;
4101 Formula F = Base;
4102 F.BaseGV = GV;
4103 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4104 return;
4105 if (IsScaledReg)
4106 F.ScaledReg = G;
4107 else
4108 F.BaseRegs[Idx] = G;
4109 (void)InsertFormula(LU, LUIdx, F);
4110}
4111
4112/// Generate reuse formulae using symbolic offsets.
4113void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4114 Formula Base) {
4115 // We can't add a symbolic offset if the address already contains one.
4116 if (Base.BaseGV) return;
4117
4118 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4119 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4120 if (Base.Scale == 1)
4121 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4122 /* IsScaledReg */ true);
4123}
4124
4125/// Helper function for LSRInstance::GenerateConstantOffsets.
4126void LSRInstance::GenerateConstantOffsetsImpl(
4127 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4128 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4129
4130 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4131 Formula F = Base;
4132 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4133 return;
4134 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4135
4136 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4137 // Add the offset to the base register.
4138 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4139 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4140 // If it cancelled out, drop the base register, otherwise update it.
4141 if (NewG->isZero()) {
4142 if (IsScaledReg) {
4143 F.Scale = 0;
4144 F.ScaledReg = nullptr;
4145 } else
4146 F.deleteBaseReg(F.BaseRegs[Idx]);
4147 F.canonicalize(*L);
4148 } else if (IsScaledReg)
4149 F.ScaledReg = NewG;
4150 else
4151 F.BaseRegs[Idx] = NewG;
4152
4153 (void)InsertFormula(LU, LUIdx, F);
4154 }
4155 };
4156
4157 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4158
4159 // With constant offsets and constant steps, we can generate pre-inc
4160 // accesses by having the offset equal the step. So, for access #0 with a
4161 // step of 8, we generate a G - 8 base which would require the first access
4162 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4163 // for itself and hopefully becomes the base for other accesses. This means
4164 // means that a single pre-indexed access can be generated to become the new
4165 // base pointer for each iteration of the loop, resulting in no extra add/sub
4166 // instructions for pointer updating.
4167 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4168 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
4169 if (auto *StepRec =
4170 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
4171 const APInt &StepInt = StepRec->getAPInt();
4172 int64_t Step = StepInt.isNegative() ?
4173 StepInt.getSExtValue() : StepInt.getZExtValue();
4174
4175 for (Immediate Offset : Worklist) {
4176 if (Offset.isFixed()) {
4177 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4178 GenerateOffset(G, Offset);
4179 }
4180 }
4181 }
4182 }
4183 }
4184 for (Immediate Offset : Worklist)
4185 GenerateOffset(G, Offset);
4186
4187 Immediate Imm = ExtractImmediate(G, SE);
4188 if (G->isZero() || Imm.isZero() ||
4189 !Base.BaseOffset.isCompatibleImmediate(Imm))
4190 return;
4191 Formula F = Base;
4192 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4193 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4194 return;
4195 if (IsScaledReg) {
4196 F.ScaledReg = G;
4197 } else {
4198 F.BaseRegs[Idx] = G;
4199 // We may generate non canonical Formula if G is a recurrent expr reg
4200 // related with current loop while F.ScaledReg is not.
4201 F.canonicalize(*L);
4202 }
4203 (void)InsertFormula(LU, LUIdx, F);
4204}
4205
4206/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
4207void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4208 Formula Base) {
4209 // TODO: For now, just add the min and max offset, because it usually isn't
4210 // worthwhile looking at everything inbetween.
4212 Worklist.push_back(LU.MinOffset);
4213 if (LU.MaxOffset != LU.MinOffset)
4214 Worklist.push_back(LU.MaxOffset);
4215
4216 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4217 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4218 if (Base.Scale == 1)
4219 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4220 /* IsScaledReg */ true);
4221}
4222
4223/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4224/// == y -> x*c == y*c.
4225void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4226 Formula Base) {
4227 if (LU.Kind != LSRUse::ICmpZero) return;
4228
4229 // Determine the integer type for the base formula.
4230 Type *IntTy = Base.getType();
4231 if (!IntTy) return;
4232 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4233
4234 // Don't do this if there is more than one offset.
4235 if (LU.MinOffset != LU.MaxOffset) return;
4236
4237 // Check if transformation is valid. It is illegal to multiply pointer.
4238 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4239 return;
4240 for (const SCEV *BaseReg : Base.BaseRegs)
4241 if (BaseReg->getType()->isPointerTy())
4242 return;
4243 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4244
4245 // Check each interesting stride.
4246 for (int64_t Factor : Factors) {
4247 // Check that Factor can be represented by IntTy
4248 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4249 continue;
4250 // Check that the multiplication doesn't overflow.
4251 if (Base.BaseOffset.isMin() && Factor == -1)
4252 continue;
4253 // Not supporting scalable immediates.
4254 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4255 continue;
4256 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4257 assert(Factor != 0 && "Zero factor not expected!");
4258 if (NewBaseOffset.getFixedValue() / Factor !=
4259 Base.BaseOffset.getFixedValue())
4260 continue;
4261 // If the offset will be truncated at this use, check that it is in bounds.
4262 if (!IntTy->isPointerTy() &&
4263 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4264 continue;
4265
4266 // Check that multiplying with the use offset doesn't overflow.
4267 Immediate Offset = LU.MinOffset;
4268 if (Offset.isMin() && Factor == -1)
4269 continue;
4270 Offset = Offset.mulUnsigned(Factor);
4271 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4272 continue;
4273 // If the offset will be truncated at this use, check that it is in bounds.
4274 if (!IntTy->isPointerTy() &&
4275 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4276 continue;
4277
4278 Formula F = Base;
4279 F.BaseOffset = NewBaseOffset;
4280
4281 // Check that this scale is legal.
4282 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4283 continue;
4284
4285 // Compensate for the use having MinOffset built into it.
4286 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4287
4288 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4289
4290 // Check that multiplying with each base register doesn't overflow.
4291 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4292 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4293 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4294 goto next;
4295 }
4296
4297 // Check that multiplying with the scaled register doesn't overflow.
4298 if (F.ScaledReg) {
4299 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4300 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4301 continue;
4302 }
4303
4304 // Check that multiplying with the unfolded offset doesn't overflow.
4305 if (F.UnfoldedOffset.isNonZero()) {
4306 if (F.UnfoldedOffset.isMin() && Factor == -1)
4307 continue;
4308 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4309 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4310 Base.UnfoldedOffset.getFixedValue())
4311 continue;
4312 // If the offset will be truncated, check that it is in bounds.
4314 IntTy, F.UnfoldedOffset.getFixedValue()))
4315 continue;
4316 }
4317
4318 // If we make it here and it's legal, add it.
4319 (void)InsertFormula(LU, LUIdx, F);
4320 next:;
4321 }
4322}
4323
4324/// Generate stride factor reuse formulae by making use of scaled-offset address
4325/// modes, for example.
4326void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4327 // Determine the integer type for the base formula.
4328 Type *IntTy = Base.getType();
4329 if (!IntTy) return;
4330
4331 // If this Formula already has a scaled register, we can't add another one.
4332 // Try to unscale the formula to generate a better scale.
4333 if (Base.Scale != 0 && !Base.unscale())
4334 return;
4335
4336 assert(Base.Scale == 0 && "unscale did not did its job!");
4337
4338 // Check each interesting stride.
4339 for (int64_t Factor : Factors) {
4340 Base.Scale = Factor;
4341 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4342 // Check whether this scale is going to be legal.
4343 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4344 Base)) {
4345 // As a special-case, handle special out-of-loop Basic users specially.
4346 // TODO: Reconsider this special case.
4347 if (LU.Kind == LSRUse::Basic &&
4348 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4349 LU.AccessTy, Base) &&
4350 LU.AllFixupsOutsideLoop)
4351 LU.Kind = LSRUse::Special;
4352 else
4353 continue;
4354 }
4355 // For an ICmpZero, negating a solitary base register won't lead to
4356 // new solutions.
4357 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4358 Base.BaseOffset.isZero() && !Base.BaseGV)
4359 continue;
4360 // For each addrec base reg, if its loop is current loop, apply the scale.
4361 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4362 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4363 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4364 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4365 if (FactorS->isZero())
4366 continue;
4367 // Divide out the factor, ignoring high bits, since we'll be
4368 // scaling the value back up in the end.
4369 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4370 if (!Quotient->isZero()) {
4371 // TODO: This could be optimized to avoid all the copying.
4372 Formula F = Base;
4373 F.ScaledReg = Quotient;
4374 F.deleteBaseReg(F.BaseRegs[i]);
4375 // The canonical representation of 1*reg is reg, which is already in
4376 // Base. In that case, do not try to insert the formula, it will be
4377 // rejected anyway.
4378 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4379 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4380 continue;
4381 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
4382 // non canonical Formula with ScaledReg's loop not being L.
4383 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4384 F.canonicalize(*L);
4385 (void)InsertFormula(LU, LUIdx, F);
4386 }
4387 }
4388 }
4389 }
4390}
4391
4392/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4393/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4394/// perform the extension/truncate and normalize again, as the normalized form
4395/// can result in folds that are not valid in the post-inc use contexts. The
4396/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4397static const SCEV *
4399 const SCEV *Expr, Type *ToTy,
4400 ScalarEvolution &SE) {
4401 const SCEV *Result = nullptr;
4402 for (auto &L : Loops) {
4403 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4404 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4405 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4406 if (!New || (Result && New != Result))
4407 return nullptr;
4408 Result = New;
4409 }
4410
4411 assert(Result && "failed to create expression");
4412 return Result;
4413}
4414
4415/// Generate reuse formulae from different IV types.
4416void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4417 // Don't bother truncating symbolic values.
4418 if (Base.BaseGV) return;
4419
4420 // Determine the integer type for the base formula.
4421 Type *DstTy = Base.getType();
4422 if (!DstTy) return;
4423 if (DstTy->isPointerTy())
4424 return;
4425
4426 // It is invalid to extend a pointer type so exit early if ScaledReg or
4427 // any of the BaseRegs are pointers.
4428 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4429 return;
4430 if (any_of(Base.BaseRegs,
4431 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4432 return;
4433
4435 for (auto &LF : LU.Fixups)
4436 Loops.push_back(LF.PostIncLoops);
4437
4438 for (Type *SrcTy : Types) {
4439 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4440 Formula F = Base;
4441
4442 // Sometimes SCEV is able to prove zero during ext transform. It may
4443 // happen if SCEV did not do all possible transforms while creating the
4444 // initial node (maybe due to depth limitations), but it can do them while
4445 // taking ext.
4446 if (F.ScaledReg) {
4447 const SCEV *NewScaledReg =
4448 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4449 if (!NewScaledReg || NewScaledReg->isZero())
4450 continue;
4451 F.ScaledReg = NewScaledReg;
4452 }
4453 bool HasZeroBaseReg = false;
4454 for (const SCEV *&BaseReg : F.BaseRegs) {
4455 const SCEV *NewBaseReg =
4456 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4457 if (!NewBaseReg || NewBaseReg->isZero()) {
4458 HasZeroBaseReg = true;
4459 break;
4460 }
4461 BaseReg = NewBaseReg;
4462 }
4463 if (HasZeroBaseReg)
4464 continue;
4465
4466 // TODO: This assumes we've done basic processing on all uses and
4467 // have an idea what the register usage is.
4468 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4469 continue;
4470
4471 F.canonicalize(*L);
4472 (void)InsertFormula(LU, LUIdx, F);
4473 }
4474 }
4475}
4476
4477namespace {
4478
4479/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4480/// modifications so that the search phase doesn't have to worry about the data
4481/// structures moving underneath it.
4482struct WorkItem {
4483 size_t LUIdx;
4484 Immediate Imm;
4485 const SCEV *OrigReg;
4486
4487 WorkItem(size_t LI, Immediate I, const SCEV *R)
4488 : LUIdx(LI), Imm(I), OrigReg(R) {}
4489
4490 void print(raw_ostream &OS) const;
4491 void dump() const;
4492};
4493
4494} // end anonymous namespace
4495
4496#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4497void WorkItem::print(raw_ostream &OS) const {
4498 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4499 << " , add offset " << Imm;
4500}
4501
4502LLVM_DUMP_METHOD void WorkItem::dump() const {
4503 print(errs()); errs() << '\n';
4504}
4505#endif
4506
4507/// Look for registers which are a constant distance apart and try to form reuse
4508/// opportunities between them.
4509void LSRInstance::GenerateCrossUseConstantOffsets() {
4510 // Group the registers by their value without any added constant offset.
4511 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4512
4516 for (const SCEV *Use : RegUses) {
4517 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4518 Immediate Imm = ExtractImmediate(Reg, SE);
4519 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4520 if (Pair.second)
4521 Sequence.push_back(Reg);
4522 Pair.first->second.insert(std::make_pair(Imm, Use));
4523 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4524 }
4525
4526 // Now examine each set of registers with the same base value. Build up
4527 // a list of work to do and do the work in a separate step so that we're
4528 // not adding formulae and register counts while we're searching.
4529 SmallVector<WorkItem, 32> WorkItems;
4530 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4531 UniqueItems;
4532 for (const SCEV *Reg : Sequence) {
4533 const ImmMapTy &Imms = Map.find(Reg)->second;
4534
4535 // It's not worthwhile looking for reuse if there's only one offset.
4536 if (Imms.size() == 1)
4537 continue;
4538
4539 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4540 for (const auto &Entry
4541 : Imms) dbgs()
4542 << ' ' << Entry.first;
4543 dbgs() << '\n');
4544
4545 // Examine each offset.
4546 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4547 J != JE; ++J) {
4548 const SCEV *OrigReg = J->second;
4549
4550 Immediate JImm = J->first;
4551 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4552
4553 if (!isa<SCEVConstant>(OrigReg) &&
4554 UsedByIndicesMap[Reg].count() == 1) {
4555 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4556 << '\n');
4557 continue;
4558 }
4559
4560 // Conservatively examine offsets between this orig reg a few selected
4561 // other orig regs.
4562 Immediate First = Imms.begin()->first;
4563 Immediate Last = std::prev(Imms.end())->first;
4564 if (!First.isCompatibleImmediate(Last)) {
4565 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4566 << "\n");
4567 continue;
4568 }
4569 // Only scalable if both terms are scalable, or if one is scalable and
4570 // the other is 0.
4571 bool Scalable = First.isScalable() || Last.isScalable();
4572 int64_t FI = First.getKnownMinValue();
4573 int64_t LI = Last.getKnownMinValue();
4574 // Compute (First + Last) / 2 without overflow using the fact that
4575 // First + Last = 2 * (First + Last) + (First ^ Last).
4576 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4577 // If the result is negative and FI is odd and LI even (or vice versa),
4578 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4579 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
4580 ImmMapTy::const_iterator OtherImms[] = {
4581 Imms.begin(), std::prev(Imms.end()),
4582 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4583 for (const auto &M : OtherImms) {
4584 if (M == J || M == JE) continue;
4585 if (!JImm.isCompatibleImmediate(M->first))
4586 continue;
4587
4588 // Compute the difference between the two.
4589 Immediate Imm = JImm.subUnsigned(M->first);
4590 for (unsigned LUIdx : UsedByIndices.set_bits())
4591 // Make a memo of this use, offset, and register tuple.
4592 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4593 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4594 }
4595 }
4596 }
4597
4598 Map.clear();
4599 Sequence.clear();
4600 UsedByIndicesMap.clear();
4601 UniqueItems.clear();
4602
4603 // Now iterate through the worklist and add new formulae.
4604 for (const WorkItem &WI : WorkItems) {
4605 size_t LUIdx = WI.LUIdx;
4606 LSRUse &LU = Uses[LUIdx];
4607 Immediate Imm = WI.Imm;
4608 const SCEV *OrigReg = WI.OrigReg;
4609
4610 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4611 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4612 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4613
4614 // TODO: Use a more targeted data structure.
4615 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4616 Formula F = LU.Formulae[L];
4617 // FIXME: The code for the scaled and unscaled registers looks
4618 // very similar but slightly different. Investigate if they
4619 // could be merged. That way, we would not have to unscale the
4620 // Formula.
4621 F.unscale();
4622 // Use the immediate in the scaled register.
4623 if (F.ScaledReg == OrigReg) {
4624 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4625 continue;
4626 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4627 // Don't create 50 + reg(-50).
4628 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4629 if (F.referencesReg(S))
4630 continue;
4631 Formula NewF = F;
4632 NewF.BaseOffset = Offset;
4633 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4634 NewF))
4635 continue;
4636 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4637
4638 // If the new scale is a constant in a register, and adding the constant
4639 // value to the immediate would produce a value closer to zero than the
4640 // immediate itself, then the formula isn't worthwhile.
4641 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4642 // FIXME: Do we need to do something for scalable immediates here?
4643 // A scalable SCEV won't be constant, but we might still have
4644 // something in the offset? Bail out for now to be safe.
4645 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4646 continue;
4647 if (C->getValue()->isNegative() !=
4648 (NewF.BaseOffset.isLessThanZero()) &&
4649 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4650 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4651 continue;
4652 }
4653
4654 // OK, looks good.
4655 NewF.canonicalize(*this->L);
4656 (void)InsertFormula(LU, LUIdx, NewF);
4657 } else {
4658 // Use the immediate in a base register.
4659 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4660 const SCEV *BaseReg = F.BaseRegs[N];
4661 if (BaseReg != OrigReg)
4662 continue;
4663 Formula NewF = F;
4664 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4665 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4666 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4667 continue;
4668 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4669 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4670 LU.Kind, LU.AccessTy, NewF)) {
4671 if (AMK == TTI::AMK_PostIndexed &&
4672 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4673 continue;
4674 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4675 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4676 continue;
4677 NewF = F;
4678 NewF.UnfoldedOffset = NewUnfoldedOffset;
4679 }
4680 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4681
4682 // If the new formula has a constant in a register, and adding the
4683 // constant value to the immediate would produce a value closer to
4684 // zero than the immediate itself, then the formula isn't worthwhile.
4685 for (const SCEV *NewReg : NewF.BaseRegs)
4686 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4687 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4688 goto skip_formula;
4689 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4690 .abs()
4691 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4692 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4693 .countr_zero() >=
4694 (unsigned)llvm::countr_zero<uint64_t>(
4695 NewF.BaseOffset.getFixedValue()))
4696 goto skip_formula;
4697 }
4698
4699 // Ok, looks good.
4700 NewF.canonicalize(*this->L);
4701 (void)InsertFormula(LU, LUIdx, NewF);
4702 break;
4703 skip_formula:;
4704 }
4705 }
4706 }
4707 }
4708}
4709
4710/// Generate formulae for each use.
4711void
4712LSRInstance::GenerateAllReuseFormulae() {
4713 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4714 // queries are more precise.
4715 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4716 LSRUse &LU = Uses[LUIdx];
4717 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4718 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4719 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4720 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4721 }
4722 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4723 LSRUse &LU = Uses[LUIdx];
4724 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4725 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4726 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4727 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4728 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4730 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4731 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4732 }
4733 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4734 LSRUse &LU = Uses[LUIdx];
4735 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4736 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4737 }
4738
4739 GenerateCrossUseConstantOffsets();
4740
4741 LLVM_DEBUG(dbgs() << "\n"
4742 "After generating reuse formulae:\n";
4743 print_uses(dbgs()));
4744}
4745
4746/// If there are multiple formulae with the same set of registers used
4747/// by other uses, pick the best one and delete the others.
4748void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4749 DenseSet<const SCEV *> VisitedRegs;
4752#ifndef NDEBUG
4753 bool ChangedFormulae = false;
4754#endif
4755
4756 // Collect the best formula for each unique set of shared registers. This
4757 // is reset for each use.
4758 using BestFormulaeTy =
4759 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4760
4761 BestFormulaeTy BestFormulae;
4762
4763 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4764 LSRUse &LU = Uses[LUIdx];
4765 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4766 dbgs() << '\n');
4767
4768 bool Any = false;
4769 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4770 FIdx != NumForms; ++FIdx) {
4771 Formula &F = LU.Formulae[FIdx];
4772
4773 // Some formulas are instant losers. For example, they may depend on
4774 // nonexistent AddRecs from other loops. These need to be filtered
4775 // immediately, otherwise heuristics could choose them over others leading
4776 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4777 // avoids the need to recompute this information across formulae using the
4778 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4779 // the corresponding bad register from the Regs set.
4780 Cost CostF(L, SE, TTI, AMK);
4781 Regs.clear();
4782 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4783 if (CostF.isLoser()) {
4784 // During initial formula generation, undesirable formulae are generated
4785 // by uses within other loops that have some non-trivial address mode or
4786 // use the postinc form of the IV. LSR needs to provide these formulae
4787 // as the basis of rediscovering the desired formula that uses an AddRec
4788 // corresponding to the existing phi. Once all formulae have been
4789 // generated, these initial losers may be pruned.
4790 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4791 dbgs() << "\n");
4792 }
4793 else {
4795 for (const SCEV *Reg : F.BaseRegs) {
4796 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4797 Key.push_back(Reg);
4798 }
4799 if (F.ScaledReg &&
4800 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4801 Key.push_back(F.ScaledReg);
4802 // Unstable sort by host order ok, because this is only used for
4803 // uniquifying.
4804 llvm::sort(Key);
4805
4806 std::pair<BestFormulaeTy::const_iterator, bool> P =
4807 BestFormulae.insert(std::make_pair(Key, FIdx));
4808 if (P.second)
4809 continue;
4810
4811 Formula &Best = LU.Formulae[P.first->second];
4812
4813 Cost CostBest(L, SE, TTI, AMK);
4814 Regs.clear();
4815 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4816 if (CostF.isLess(CostBest))
4817 std::swap(F, Best);
4818 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4819 dbgs() << "\n"
4820 " in favor of formula ";
4821 Best.print(dbgs()); dbgs() << '\n');
4822 }
4823#ifndef NDEBUG
4824 ChangedFormulae = true;
4825#endif
4826 LU.DeleteFormula(F);
4827 --FIdx;
4828 --NumForms;
4829 Any = true;
4830 }
4831
4832 // Now that we've filtered out some formulae, recompute the Regs set.
4833 if (Any)
4834 LU.RecomputeRegs(LUIdx, RegUses);
4835
4836 // Reset this to prepare for the next use.
4837 BestFormulae.clear();
4838 }
4839
4840 LLVM_DEBUG(if (ChangedFormulae) {
4841 dbgs() << "\n"
4842 "After filtering out undesirable candidates:\n";
4843 print_uses(dbgs());
4844 });
4845}
4846
4847/// Estimate the worst-case number of solutions the solver might have to
4848/// consider. It almost never considers this many solutions because it prune the
4849/// search space, but the pruning isn't always sufficient.
4850size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4851 size_t Power = 1;
4852 for (const LSRUse &LU : Uses) {
4853 size_t FSize = LU.Formulae.size();
4854 if (FSize >= ComplexityLimit) {
4855 Power = ComplexityLimit;
4856 break;
4857 }
4858 Power *= FSize;
4859 if (Power >= ComplexityLimit)
4860 break;
4861 }
4862 return Power;
4863}
4864
4865/// When one formula uses a superset of the registers of another formula, it
4866/// won't help reduce register pressure (though it may not necessarily hurt
4867/// register pressure); remove it to simplify the system.
4868void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4869 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4870 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4871
4872 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4873 "which use a superset of registers used by other "
4874 "formulae.\n");
4875
4876 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4877 LSRUse &LU = Uses[LUIdx];
4878 bool Any = false;
4879 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4880 Formula &F = LU.Formulae[i];
4881 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4882 continue;
4883 // Look for a formula with a constant or GV in a register. If the use
4884 // also has a formula with that same value in an immediate field,
4885 // delete the one that uses a register.
4887 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4888 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4889 Formula NewF = F;
4890 //FIXME: Formulas should store bitwidth to do wrapping properly.
4891 // See PR41034.
4892 NewF.BaseOffset =
4893 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4894 (uint64_t)C->getValue()->getSExtValue());
4895 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4896 (I - F.BaseRegs.begin()));
4897 if (LU.HasFormulaWithSameRegs(NewF)) {
4898 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4899 dbgs() << '\n');
4900 LU.DeleteFormula(F);
4901 --i;
4902 --e;
4903 Any = true;
4904 break;
4905 }
4906 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4907 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4908 if (!F.BaseGV) {
4909 Formula NewF = F;
4910 NewF.BaseGV = GV;
4911 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4912 (I - F.BaseRegs.begin()));
4913 if (LU.HasFormulaWithSameRegs(NewF)) {
4914 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4915 dbgs() << '\n');
4916 LU.DeleteFormula(F);
4917 --i;
4918 --e;
4919 Any = true;
4920 break;
4921 }
4922 }
4923 }
4924 }
4925 }
4926 if (Any)
4927 LU.RecomputeRegs(LUIdx, RegUses);
4928 }
4929
4930 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4931 }
4932}
4933
4934/// When there are many registers for expressions like A, A+1, A+2, etc.,
4935/// allocate a single register for them.