LoopStrengthReduce.cpp
00001 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This transformation analyzes and transforms the induction variables (and
00011 // computations derived from them) into forms suitable for efficient execution
00012 // on the target.
00013 //
00014 // This pass performs strength reduction on array references inside loops
00015 // that have the loop induction variable as one or more of their components;
00016 // it rewrites expressions to take advantage of scaled-index addressing modes
00017 // available on the target, and it performs a variety of other optimizations
00018 // related to loop induction variables.
00019 //
00020 // Terminology note: this code has a lot of handling for "post-increment" or
00021 // "post-inc" users. This is not talking about post-increment addressing modes;
00022 // it is instead talking about code like this:
00023 //
00024 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
00025 //   ...
00026 //   %i.next = add %i, 1
00027 //   %c = icmp eq %i.next, %n
00028 //
00029 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
00030 // it's useful to think about these as the same register, with some uses using
00031 // the value of the register before the add and some using it after. In this
00032 // example, the icmp is a post-increment user, since it uses %i.next, which is
00033 // the value of the induction variable after the increment. The other common
00034 // case of post-increment users is users outside the loop.
00035 //
00036 // TODO: More sophistication in the way Formulae are generated and filtered.
00037 //
00038 // TODO: Handle multiple loops at a time.
00039 //
00040 // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
00041 //       of a GlobalValue?
00042 //
00043 // TODO: When truncation is free, truncate ICmp users' operands to make it a
00044 //       smaller encoding (on x86 at least).
00045 //
00046 // TODO: When a negated register is used by an add (such as in a list of
00047 //       multiple base registers, or as the increment expression in an addrec),
00048 //       we may not actually need both reg and (-1 * reg) in registers; the
00049 //       negation can be implemented by using a sub instead of an add. The
00050 //       lack of support for taking this into consideration when making
00051 //       register pressure decisions is partly worked around by the "Special"
00052 //       use kind.
00053 //
00054 //===----------------------------------------------------------------------===//
00055 
00056 #include "llvm/Transforms/Scalar.h"
00057 #include "llvm/ADT/DenseSet.h"
00058 #include "llvm/ADT/Hashing.h"
00059 #include "llvm/ADT/STLExtras.h"
00060 #include "llvm/ADT/SetVector.h"
00061 #include "llvm/ADT/SmallBitVector.h"
00062 #include "llvm/Analysis/IVUsers.h"
00063 #include "llvm/Analysis/LoopPass.h"
00064 #include "llvm/Analysis/ScalarEvolutionExpander.h"
00065 #include "llvm/Analysis/TargetTransformInfo.h"
00066 #include "llvm/IR/Constants.h"
00067 #include "llvm/IR/DerivedTypes.h"
00068 #include "llvm/IR/Dominators.h"
00069 #include "llvm/IR/Instructions.h"
00070 #include "llvm/IR/IntrinsicInst.h"
00071 #include "llvm/IR/Module.h"
00072 #include "llvm/IR/ValueHandle.h"
00073 #include "llvm/Support/CommandLine.h"
00074 #include "llvm/Support/Debug.h"
00075 #include "llvm/Support/raw_ostream.h"
00076 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
00077 #include "llvm/Transforms/Utils/Local.h"
00078 #include <algorithm>
00079 using namespace llvm;
00080 
00081 #define DEBUG_TYPE "loop-reduce"
00082 
00083 /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
00084 /// bail out. This threshold is far beyond the number of users that LSR can
00085 /// conceivably solve, so it should not affect generated code, but catches the
00086 /// worst cases before LSR burns too much compile time and stack space.
00087 static const unsigned MaxIVUsers = 200;
00088 
00089 // Temporary flag to clean up congruent phis after LSR phi expansion.
00090 // It was originally disabled until its usefulness could be determined, with
00091 // the intent of removing the flag after the v3.0 release; it now defaults to
00092 // enabled because IV chains rely on it.
00093 static cl::opt<bool> EnablePhiElim(
00094   "enable-lsr-phielim", cl::Hidden, cl::init(true),
00095   cl::desc("Enable LSR phi elimination"));
00096 
00097 #ifndef NDEBUG
00098 // Stress test IV chain generation.
00099 static cl::opt<bool> StressIVChain(
00100   "stress-ivchain", cl::Hidden, cl::init(false),
00101   cl::desc("Stress test LSR IV chains"));
00102 #else
00103 static bool StressIVChain = false;
00104 #endif
00105 
00106 namespace {
00107 
00108 /// RegSortData - This class holds data which is used to order reuse candidates.
00109 class RegSortData {
00110 public:
00111   /// UsedByIndices - This represents the set of LSRUse indices which reference
00112   /// a particular register.
00113   SmallBitVector UsedByIndices;
00114 
00115   RegSortData() {}
00116 
00117   void print(raw_ostream &OS) const;
00118   void dump() const;
00119 };
00120 
00121 }
00122 
00123 void RegSortData::print(raw_ostream &OS) const {
00124   OS << "[NumUses=" << UsedByIndices.count() << ']';
00125 }
00126 
00127 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
00128 void RegSortData::dump() const {
00129   print(errs()); errs() << '\n';
00130 }
00131 #endif
00132 
00133 namespace {
00134 
00135 /// RegUseTracker - Map register candidates to information about how they are
00136 /// used.
00137 class RegUseTracker {
00138   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
00139 
00140   RegUsesTy RegUsesMap;
00141   SmallVector<const SCEV *, 16> RegSequence;
00142 
00143 public:
00144   void CountRegister(const SCEV *Reg, size_t LUIdx);
00145   void DropRegister(const SCEV *Reg, size_t LUIdx);
00146   void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);
00147 
00148   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
00149 
00150   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
00151 
00152   void clear();
00153 
00154   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
00155   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
00156   iterator begin() { return RegSequence.begin(); }
00157   iterator end()   { return RegSequence.end(); }
00158   const_iterator begin() const { return RegSequence.begin(); }
00159   const_iterator end() const   { return RegSequence.end(); }
00160 };
00161 
00162 }
00163 
00164 void
00165 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
00166   std::pair<RegUsesTy::iterator, bool> Pair =
00167     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
00168   RegSortData &RSD = Pair.first->second;
00169   if (Pair.second)
00170     RegSequence.push_back(Reg);
00171   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
00172   RSD.UsedByIndices.set(LUIdx);
00173 }
00174 
00175 void
00176 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
00177   RegUsesTy::iterator It = RegUsesMap.find(Reg);
00178   assert(It != RegUsesMap.end());
00179   RegSortData &RSD = It->second;
00180   assert(RSD.UsedByIndices.size() > LUIdx);
00181   RSD.UsedByIndices.reset(LUIdx);
00182 }
00183 
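/// SwapAndDropUse - For every register, move the use bit recorded at
/// LastLUIdx into position LUIdx and then shrink UsedByIndices so that the
/// LastLUIdx slot no longer exists, matching a swap-and-pop of the LSRUse
/// list.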
00184 void
00185 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
00186   assert(LUIdx <= LastLUIdx);
00187 
00188   // Update RegUses. The data structure is not optimized for this purpose;
00189   // we must iterate through it and update each of the bit vectors.
00190   for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
00191        I != E; ++I) {
00192     SmallBitVector &UsedByIndices = I->second.UsedByIndices;
00193     if (LUIdx < UsedByIndices.size())
00194       UsedByIndices[LUIdx] =
00195         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
00196     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
00197   }
00198 }
00199 
00200 bool
00201 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
00202   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
00203   if (I == RegUsesMap.end())
00204     return false;
00205   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
00206   int i = UsedByIndices.find_first();
00207   if (i == -1) return false;
00208   if ((size_t)i != LUIdx) return true;
00209   return UsedByIndices.find_next(i) != -1;
00210 }
00211 
00212 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
00213   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
00214   assert(I != RegUsesMap.end() && "Unknown register!");
00215   return I->second.UsedByIndices;
00216 }
00217 
00218 void RegUseTracker::clear() {
00219   RegUsesMap.clear();
00220   RegSequence.clear();
00221 }
00222 
00223 namespace {
00224 
00225 /// Formula - This class holds information that describes a formula for
00226 /// computing a value that satisfies a use. It may include broken-out
00227 /// immediates and scaled registers.
00228 struct Formula {
00229   /// Global base address used for complex addressing.
00230   GlobalValue *BaseGV;
00231 
00232   /// Base offset for complex addressing.
00233   int64_t BaseOffset;
00234 
00235   /// Whether any complex addressing has a base register.
00236   bool HasBaseReg;
00237 
00238   /// The scale of any complex addressing.
00239   int64_t Scale;
00240 
00241   /// BaseRegs - The list of "base" registers for this use. When this is
00242   /// non-empty, the canonical representation of a formula is
00243   /// 1. BaseRegs.size() > 1 implies ScaledReg != NULL and
00244   /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
00245   /// #1 enforces that the scaled register is always used when at least two
00246   /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
00247   /// #2 enforces that 1 * reg is reg.
00248   /// This invariant can be temporarily broken while building a formula.
00249   /// However, every formula inserted into the LSRInstance must be in canonical
00250   /// form.
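  /// For example, reg1 + reg2 + reg3 is canonically represented as
  /// BaseRegs = {reg1, reg2}, ScaledReg = reg3 with Scale = 1, while a lone
  /// 1 * reg is represented as BaseRegs = {reg} with no ScaledReg.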
00251   SmallVector<const SCEV *, 4> BaseRegs;
00252 
00253   /// ScaledReg - The 'scaled' register for this use. This should be non-null
00254   /// when Scale is not zero.
00255   const SCEV *ScaledReg;
00256 
00257   /// UnfoldedOffset - An additional constant offset which is added near the
00258   /// use. This requires a temporary register, but the offset itself can
00259   /// live in an add immediate field rather than a register.
00260   int64_t UnfoldedOffset;
00261 
00262   Formula()
00263       : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
00264         ScaledReg(nullptr), UnfoldedOffset(0) {}
00265 
00266   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
00267 
00268   bool isCanonical() const;
00269 
00270   void Canonicalize();
00271 
00272   bool Unscale();
00273 
00274   size_t getNumRegs() const;
00275   Type *getType() const;
00276 
00277   void DeleteBaseReg(const SCEV *&S);
00278 
00279   bool referencesReg(const SCEV *S) const;
00280   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
00281                                   const RegUseTracker &RegUses) const;
00282 
00283   void print(raw_ostream &OS) const;
00284   void dump() const;
00285 };
00286 
00287 }
00288 
00289 /// DoInitialMatch - Recursion helper for InitialMatch.
00290 static void DoInitialMatch(const SCEV *S, Loop *L,
00291                            SmallVectorImpl<const SCEV *> &Good,
00292                            SmallVectorImpl<const SCEV *> &Bad,
00293                            ScalarEvolution &SE) {
00294   // Collect expressions which properly dominate the loop header.
00295   if (SE.properlyDominates(S, L->getHeader())) {
00296     Good.push_back(S);
00297     return;
00298   }
00299 
00300   // Look at add operands.
00301   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00302     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
00303          I != E; ++I)
00304       DoInitialMatch(*I, L, Good, Bad, SE);
00305     return;
00306   }
00307 
00308   // Look at addrec operands.
00309   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
00310     if (!AR->getStart()->isZero()) {
00311       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
00312       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
00313                                       AR->getStepRecurrence(SE),
00314                                       // FIXME: AR->getNoWrapFlags()
00315                                       AR->getLoop(), SCEV::FlagAnyWrap),
00316                      L, Good, Bad, SE);
00317       return;
00318     }
00319 
00320   // Handle a multiplication by -1 (negation) if it didn't fold.
00321   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
00322     if (Mul->getOperand(0)->isAllOnesValue()) {
00323       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
00324       const SCEV *NewMul = SE.getMulExpr(Ops);
00325 
00326       SmallVector<const SCEV *, 4> MyGood;
00327       SmallVector<const SCEV *, 4> MyBad;
00328       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
00329       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
00330         SE.getEffectiveSCEVType(NewMul->getType())));
00331       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
00332            E = MyGood.end(); I != E; ++I)
00333         Good.push_back(SE.getMulExpr(NegOne, *I));
00334       for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
00335            E = MyBad.end(); I != E; ++I)
00336         Bad.push_back(SE.getMulExpr(NegOne, *I));
00337       return;
00338     }
00339 
00340   // Ok, we can't do anything interesting. Just stuff the whole thing into a
00341   // register and hope for the best.
00342   Bad.push_back(S);
00343 }
00344 
00345 /// InitialMatch - Incorporate loop-variant parts of S into this Formula,
00346 /// attempting to keep all loop-invariant and loop-computable values in a
00347 /// single base register.
00348 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
00349   SmallVector<const SCEV *, 4> Good;
00350   SmallVector<const SCEV *, 4> Bad;
00351   DoInitialMatch(S, L, Good, Bad, SE);
00352   if (!Good.empty()) {
00353     const SCEV *Sum = SE.getAddExpr(Good);
00354     if (!Sum->isZero())
00355       BaseRegs.push_back(Sum);
00356     HasBaseReg = true;
00357   }
00358   if (!Bad.empty()) {
00359     const SCEV *Sum = SE.getAddExpr(Bad);
00360     if (!Sum->isZero())
00361       BaseRegs.push_back(Sum);
00362     HasBaseReg = true;
00363   }
00364   Canonicalize();
00365 }
00366 
00367 /// \brief Check whether or not this formula satisfies the canonical
00368 /// representation.
00369 /// \see Formula::BaseRegs.
00370 bool Formula::isCanonical() const {
00371   if (ScaledReg)
00372     return Scale != 1 || !BaseRegs.empty();
00373   return BaseRegs.size() <= 1;
00374 }
00375 
00376 /// \brief Helper method to morph a formula into its canonical representation.
00377 /// \see Formula::BaseRegs.
00378 /// Every formula having more than one base register must use the ScaledReg
00379 /// field. Otherwise, we would have to do special cases everywhere in LSR
00380 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
00381 /// On the other hand, 1*reg should be canonicalized into reg.
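/// For example, a formula built up as BaseRegs = {reg1, reg2, reg3} with no
/// ScaledReg is canonicalized to BaseRegs = {reg1, reg2}, ScaledReg = reg3,
/// Scale = 1, preferring a loop-variant expression (an AddRec) for ScaledReg
/// when one is available among the base registers.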
00382 void Formula::Canonicalize() {
00383   if (isCanonical())
00384     return;
00385   // So far we did not need this case. It is easy to implement, but it is
00386   // useless to maintain dead code. Besides, it could hurt compile time.
00387   assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
00388   // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
00389   ScaledReg = BaseRegs.back();
00390   BaseRegs.pop_back();
00391   Scale = 1;
00392   size_t BaseRegsSize = BaseRegs.size();
00393   size_t Try = 0;
00394   // If ScaledReg is an invariant, try to find a variant expression.
00395   while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
00396     std::swap(ScaledReg, BaseRegs[Try++]);
00397 }
00398 
00399 /// \brief Get rid of the scale in the formula.
00400 /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
00401 /// \return true if it was possible to get rid of the scale, false otherwise.
00402 /// \note After this operation the formula may not be in the canonical form.
00403 bool Formula::Unscale() {
00404   if (Scale != 1)
00405     return false;
00406   Scale = 0;
00407   BaseRegs.push_back(ScaledReg);
00408   ScaledReg = nullptr;
00409   return true;
00410 }
00411 
00412 /// getNumRegs - Return the total number of register operands used by this
00413 /// formula. This does not include register uses implied by non-constant
00414 /// addrec strides.
00415 size_t Formula::getNumRegs() const {
00416   return !!ScaledReg + BaseRegs.size();
00417 }
00418 
00419 /// getType - Return the type of this formula, if it has one, or null
00420 /// otherwise. This type is meaningless except for the bit size.
00421 Type *Formula::getType() const {
00422   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
00423          ScaledReg ? ScaledReg->getType() :
00424          BaseGV ? BaseGV->getType() :
00425          nullptr;
00426 }
00427 
00428 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
00429 void Formula::DeleteBaseReg(const SCEV *&S) {
00430   if (&S != &BaseRegs.back())
00431     std::swap(S, BaseRegs.back());
00432   BaseRegs.pop_back();
00433 }
00434 
00435 /// referencesReg - Test if this formula references the given register.
00436 bool Formula::referencesReg(const SCEV *S) const {
00437   return S == ScaledReg ||
00438          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
00439 }
00440 
00441 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
00442 /// which are used by uses other than the use with the given index.
00443 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
00444                                          const RegUseTracker &RegUses) const {
00445   if (ScaledReg)
00446     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
00447       return true;
00448   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
00449        E = BaseRegs.end(); I != E; ++I)
00450     if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
00451       return true;
00452   return false;
00453 }
00454 
00455 void Formula::print(raw_ostream &OS) const {
00456   bool First = true;
00457   if (BaseGV) {
00458     if (!First) OS << " + "; else First = false;
00459     BaseGV->printAsOperand(OS, /*PrintType=*/false);
00460   }
00461   if (BaseOffset != 0) {
00462     if (!First) OS << " + "; else First = false;
00463     OS << BaseOffset;
00464   }
00465   for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
00466        E = BaseRegs.end(); I != E; ++I) {
00467     if (!First) OS << " + "; else First = false;
00468     OS << "reg(" << **I << ')';
00469   }
00470   if (HasBaseReg && BaseRegs.empty()) {
00471     if (!First) OS << " + "; else First = false;
00472     OS << "**error: HasBaseReg**";
00473   } else if (!HasBaseReg && !BaseRegs.empty()) {
00474     if (!First) OS << " + "; else First = false;
00475     OS << "**error: !HasBaseReg**";
00476   }
00477   if (Scale != 0) {
00478     if (!First) OS << " + "; else First = false;
00479     OS << Scale << "*reg(";
00480     if (ScaledReg)
00481       OS << *ScaledReg;
00482     else
00483       OS << "<unknown>";
00484     OS << ')';
00485   }
00486   if (UnfoldedOffset != 0) {
00487     if (!First) OS << " + ";
00488     OS << "imm(" << UnfoldedOffset << ')';
00489   }
00490 }
00491 
00492 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
00493 void Formula::dump() const {
00494   print(errs()); errs() << '\n';
00495 }
00496 #endif
00497 
00498 /// isAddRecSExtable - Return true if the given addrec can be sign-extended
00499 /// without changing its value.
00500 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
00501   Type *WideTy =
00502     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
00503   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
00504 }
00505 
00506 /// isAddSExtable - Return true if the given add can be sign-extended
00507 /// without changing its value.
00508 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
00509   Type *WideTy =
00510     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
00511   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
00512 }
00513 
00514 /// isMulSExtable - Return true if the given mul can be sign-extended
00515 /// without changing its value.
00516 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
00517   Type *WideTy =
00518     IntegerType::get(SE.getContext(),
00519                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
00520   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
00521 }
00522 
00523 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
00524 /// and if the remainder is known to be zero, or null otherwise. If
00525 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
00526 /// to X, ignoring that the multiplication may overflow, which is useful when
00527 /// the result will be used in a context where the most significant bits are
00528 /// ignored.
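/// For example, {8,+,4}<%L> /s 4 can yield {2,+,1}<%L> when the addrec is
/// known not to overflow, and (%x * %y) /s %y yields %x when
/// IgnoreSignificantBits is set.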
00529 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
00530                                 ScalarEvolution &SE,
00531                                 bool IgnoreSignificantBits = false) {
00532   // Handle the trivial case, which works for any SCEV type.
00533   if (LHS == RHS)
00534     return SE.getConstant(LHS->getType(), 1);
00535 
00536   // Handle a few RHS special cases.
00537   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
00538   if (RC) {
00539     const APInt &RA = RC->getValue()->getValue();
00540     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
00541     // some folding.
00542     if (RA.isAllOnesValue())
00543       return SE.getMulExpr(LHS, RC);
00544     // Handle x /s 1 as x.
00545     if (RA == 1)
00546       return LHS;
00547   }
00548 
00549   // Check for a division of a constant by a constant.
00550   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
00551     if (!RC)
00552       return nullptr;
00553     const APInt &LA = C->getValue()->getValue();
00554     const APInt &RA = RC->getValue()->getValue();
00555     if (LA.srem(RA) != 0)
00556       return nullptr;
00557     return SE.getConstant(LA.sdiv(RA));
00558   }
00559 
00560   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
00561   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
00562     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
00563       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
00564                                       IgnoreSignificantBits);
00565       if (!Step) return nullptr;
00566       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
00567                                        IgnoreSignificantBits);
00568       if (!Start) return nullptr;
00569       // FlagNW is independent of the start value and step direction, and is
00570       // preserved with smaller-magnitude steps.
00571       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
00572       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
00573     }
00574     return nullptr;
00575   }
00576 
00577   // Distribute the sdiv over add operands, if the add doesn't overflow.
00578   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
00579     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
00580       SmallVector<const SCEV *, 8> Ops;
00581       for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
00582            I != E; ++I) {
00583         const SCEV *Op = getExactSDiv(*I, RHS, SE,
00584                                       IgnoreSignificantBits);
00585         if (!Op) return nullptr;
00586         Ops.push_back(Op);
00587       }
00588       return SE.getAddExpr(Ops);
00589     }
00590     return nullptr;
00591   }
00592 
00593   // Check for a multiply operand that we can pull RHS out of.
00594   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
00595     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
00596       SmallVector<const SCEV *, 4> Ops;
00597       bool Found = false;
00598       for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
00599            I != E; ++I) {
00600         const SCEV *S = *I;
00601         if (!Found)
00602           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
00603                                            IgnoreSignificantBits)) {
00604             S = Q;
00605             Found = true;
00606           }
00607         Ops.push_back(S);
00608       }
00609       return Found ? SE.getMulExpr(Ops) : nullptr;
00610     }
00611     return nullptr;
00612   }
00613 
00614   // Otherwise we don't know.
00615   return nullptr;
00616 }
00617 
00618 /// ExtractImmediate - If S involves the addition of a constant integer value,
00619 /// return that integer value, and mutate S to point to a new SCEV with that
00620 /// value excluded.
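/// For example, given S == (4 + %x + %y), this returns 4 and rewrites S to
/// (%x + %y); if S contains no constant addend, it returns 0 and leaves S
/// unchanged.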
00621 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
00622   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
00623     if (C->getValue()->getValue().getMinSignedBits() <= 64) {
00624       S = SE.getConstant(C->getType(), 0);
00625       return C->getValue()->getSExtValue();
00626     }
00627   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00628     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
00629     int64_t Result = ExtractImmediate(NewOps.front(), SE);
00630     if (Result != 0)
00631       S = SE.getAddExpr(NewOps);
00632     return Result;
00633   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
00634     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
00635     int64_t Result = ExtractImmediate(NewOps.front(), SE);
00636     if (Result != 0)
00637       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
00638                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
00639                            SCEV::FlagAnyWrap);
00640     return Result;
00641   }
00642   return 0;
00643 }
00644 
00645 /// ExtractSymbol - If S involves the addition of a GlobalValue address,
00646 /// return that symbol, and mutate S to point to a new SCEV with that
00647 /// value excluded.
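/// For example, given S == (@g + 16), this returns @g and rewrites S to 16;
/// if no GlobalValue is involved, it returns null and leaves S unchanged.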
00648 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
00649   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
00650     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
00651       S = SE.getConstant(GV->getType(), 0);
00652       return GV;
00653     }
00654   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00655     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
00656     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
00657     if (Result)
00658       S = SE.getAddExpr(NewOps);
00659     return Result;
00660   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
00661     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
00662     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
00663     if (Result)
00664       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
00665                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
00666                            SCEV::FlagAnyWrap);
00667     return Result;
00668   }
00669   return nullptr;
00670 }
00671 
00672 /// isAddressUse - Returns true if the specified instruction is using the
00673 /// specified value as an address.
00674 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
00675   bool isAddress = isa<LoadInst>(Inst);
00676   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
00677     if (SI->getOperand(1) == OperandVal)
00678       isAddress = true;
00679   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
00680     // Addressing modes can also be folded into prefetches and a variety
00681     // of intrinsics.
00682     switch (II->getIntrinsicID()) {
00683       default: break;
00684       case Intrinsic::prefetch:
00685       case Intrinsic::x86_sse_storeu_ps:
00686       case Intrinsic::x86_sse2_storeu_pd:
00687       case Intrinsic::x86_sse2_storeu_dq:
00688       case Intrinsic::x86_sse2_storel_dq:
00689         if (II->getArgOperand(0) == OperandVal)
00690           isAddress = true;
00691         break;
00692     }
00693   }
00694   return isAddress;
00695 }
00696 
00697 /// getAccessType - Return the type of the memory being accessed.
00698 static Type *getAccessType(const Instruction *Inst) {
00699   Type *AccessTy = Inst->getType();
00700   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
00701     AccessTy = SI->getOperand(0)->getType();
00702   else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
00703     // Addressing modes can also be folded into prefetches and a variety
00704     // of intrinsics.
00705     switch (II->getIntrinsicID()) {
00706     default: break;
00707     case Intrinsic::x86_sse_storeu_ps:
00708     case Intrinsic::x86_sse2_storeu_pd:
00709     case Intrinsic::x86_sse2_storeu_dq:
00710     case Intrinsic::x86_sse2_storel_dq:
00711       AccessTy = II->getArgOperand(0)->getType();
00712       break;
00713     }
00714   }
00715 
00716   // All pointers have the same requirements, so canonicalize them to an
00717   // arbitrary pointer type to minimize variation.
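  // For example, i32* and float* in address space 0 both become i1*.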
00718   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
00719     AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
00720                                 PTy->getAddressSpace());
00721 
00722   return AccessTy;
00723 }
00724 
00725 /// isExistingPhi - Return true if this AddRec is already a phi in its loop.
00726 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
00727   for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
00728        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
00729     if (SE.isSCEVable(PN->getType()) &&
00730         (SE.getEffectiveSCEVType(PN->getType()) ==
00731          SE.getEffectiveSCEVType(AR->getType())) &&
00732         SE.getSCEV(PN) == AR)
00733       return true;
00734   }
00735   return false;
00736 }
00737 
00738 /// Check if expanding this expression is likely to incur significant cost. This
00739 /// is tricky because SCEV doesn't track which expressions are actually computed
00740 /// by the current IR.
00741 ///
00742 /// We currently allow expansion of IV increments that involve adds,
00743 /// multiplication by constants, and AddRecs from existing phis.
00744 ///
00745 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
00746 /// obvious multiple of the UDivExpr.
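///
/// For example, 4 * %n is cheap (multiplication by a constant), while
/// %a * %b is cheap only when an existing mul instruction already computes
/// that product; udiv and other expression types are treated as high cost.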
00747 static bool isHighCostExpansion(const SCEV *S,
00748                                 SmallPtrSetImpl<const SCEV*> &Processed,
00749                                 ScalarEvolution &SE) {
00750   // Zero/One operand expressions
00751   switch (S->getSCEVType()) {
00752   case scUnknown:
00753   case scConstant:
00754     return false;
00755   case scTruncate:
00756     return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
00757                                Processed, SE);
00758   case scZeroExtend:
00759     return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
00760                                Processed, SE);
00761   case scSignExtend:
00762     return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
00763                                Processed, SE);
00764   }
00765 
00766   if (!Processed.insert(S).second)
00767     return false;
00768 
00769   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00770     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
00771          I != E; ++I) {
00772       if (isHighCostExpansion(*I, Processed, SE))
00773         return true;
00774     }
00775     return false;
00776   }
00777 
00778   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
00779     if (Mul->getNumOperands() == 2) {
00780       // Multiplication by a constant is ok
00781       if (isa<SCEVConstant>(Mul->getOperand(0)))
00782         return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
00783 
00784       // If we have the value of one operand, check if an existing
00785       // multiplication already generates this expression.
00786       if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
00787         Value *UVal = U->getValue();
00788         for (User *UR : UVal->users()) {
00789           // If U is a constant, it may be used by a ConstantExpr.
00790           Instruction *UI = dyn_cast<Instruction>(UR);
00791           if (UI && UI->getOpcode() == Instruction::Mul &&
00792               SE.isSCEVable(UI->getType())) {
00793             return SE.getSCEV(UI) == Mul;
00794           }
00795         }
00796       }
00797     }
00798   }
00799 
00800   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
00801     if (isExistingPhi(AR, SE))
00802       return false;
00803   }
00804 
00805   // For now, consider any other type of expression (div/mul/min/max) high cost.
00806   return true;
00807 }
00808 
00809 /// DeleteTriviallyDeadInstructions - If any of the instructions in the
00810 /// specified set are trivially dead, delete them and see if this makes any of
00811 /// their operands subsequently dead.
00812 static bool
00813 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
00814   bool Changed = false;
00815 
00816   while (!DeadInsts.empty()) {
00817     Value *V = DeadInsts.pop_back_val();
00818     Instruction *I = dyn_cast_or_null<Instruction>(V);
00819 
00820     if (!I || !isInstructionTriviallyDead(I))
00821       continue;
00822 
00823     for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
00824       if (Instruction *U = dyn_cast<Instruction>(*OI)) {
00825         *OI = nullptr;
00826         if (U->use_empty())
00827           DeadInsts.push_back(U);
00828       }
00829 
00830     I->eraseFromParent();
00831     Changed = true;
00832   }
00833 
00834   return Changed;
00835 }
00836 
00837 namespace {
00838 class LSRUse;
00839 }
00840 
00841 /// \brief Check if the addressing mode defined by \p F is completely
00842 /// folded in \p LU at isel time.
00843 /// This includes address-mode folding and special icmp tricks.
00844 /// This function returns true if \p LU can accommodate what \p F
00845 /// defines and up to 1 base + 1 scaled + offset.
00846 /// In other words, if \p F has several base registers, this function may
00847 /// still return true. Therefore, users still need to account for
00848 /// additional base registers and/or unfolded offsets to derive an
00849 /// accurate cost model.
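/// For example, on a target whose memory operands support
/// [base + scale*index + displacement] addressing (such as x86), a formula
/// like reg1 + 4*reg2 + 8 for an Address use can typically be folded
/// completely, so this returns true for it.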
00850 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
00851                                  const LSRUse &LU, const Formula &F);
00852 // Get the cost of the scaling factor used in F for LU.
00853 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
00854                                      const LSRUse &LU, const Formula &F);
00855 
00856 namespace {
00857 
00858 /// Cost - This class is used to measure and compare candidate formulae.
00859 class Cost {
00860   /// TODO: Some of these could be merged. Also, a lexical ordering
00861   /// isn't always optimal.
00862   unsigned NumRegs;
00863   unsigned AddRecCost;
00864   unsigned NumIVMuls;
00865   unsigned NumBaseAdds;
00866   unsigned ImmCost;
00867   unsigned SetupCost;
00868   unsigned ScaleCost;
00869 
00870 public:
00871   Cost()
00872     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
00873       SetupCost(0), ScaleCost(0) {}
00874 
00875   bool operator<(const Cost &Other) const;
00876 
00877   void Lose();
00878 
00879 #ifndef NDEBUG
00880   // Once any of the metrics loses, they must all remain losers.
00881   bool isValid() {
00882     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
00883              | ImmCost | SetupCost | ScaleCost) != ~0u)
00884       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
00885            & ImmCost & SetupCost & ScaleCost) == ~0u);
00886   }
00887 #endif
00888 
00889   bool isLoser() {
00890     assert(isValid() && "invalid cost");
00891     return NumRegs == ~0u;
00892   }
00893 
00894   void RateFormula(const TargetTransformInfo &TTI,
00895                    const Formula &F,
00896                    SmallPtrSetImpl<const SCEV *> &Regs,
00897                    const DenseSet<const SCEV *> &VisitedRegs,
00898                    const Loop *L,
00899                    const SmallVectorImpl<int64_t> &Offsets,
00900                    ScalarEvolution &SE, DominatorTree &DT,
00901                    const LSRUse &LU,
00902                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
00903 
00904   void print(raw_ostream &OS) const;
00905   void dump() const;
00906 
00907 private:
00908   void RateRegister(const SCEV *Reg,
00909                     SmallPtrSetImpl<const SCEV *> &Regs,
00910                     const Loop *L,
00911                     ScalarEvolution &SE, DominatorTree &DT);
00912   void RatePrimaryRegister(const SCEV *Reg,
00913                            SmallPtrSetImpl<const SCEV *> &Regs,
00914                            const Loop *L,
00915                            ScalarEvolution &SE, DominatorTree &DT,
00916                            SmallPtrSetImpl<const SCEV *> *LoserRegs);
00917 };
00918 
00919 }
00920 
00921 /// RateRegister - Tally up interesting quantities from the given register.
00922 void Cost::RateRegister(const SCEV *Reg,
00923                         SmallPtrSetImpl<const SCEV *> &Regs,
00924                         const Loop *L,
00925                         ScalarEvolution &SE, DominatorTree &DT) {
00926   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
00927     // If this is an addrec for another loop, don't second-guess its addrec phi
00928     // nodes. LSR isn't currently smart enough to reason about more than one
00929     // loop at a time. LSR has already run on inner loops, will not run on outer
00930     // loops, and cannot be expected to change sibling loops.
00931     if (AR->getLoop() != L) {
00932       // If the AddRec exists, consider its register free and leave it alone.
00933       if (isExistingPhi(AR, SE))
00934         return;
00935 
00936       // Otherwise, do not consider this formula at all.
00937       Lose();
00938       return;
00939     }
00940     AddRecCost += 1; // TODO: This should be a function of the stride.
00941 
00942     // Add the step value register, if it needs one.
00943     // TODO: The non-affine case isn't precisely modeled here.
00944     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
00945       if (!Regs.count(AR->getOperand(1))) {
00946         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
00947         if (isLoser())
00948           return;
00949       }
00950     }
00951   }
00952   ++NumRegs;
00953 
00954   // Rough heuristic; favor registers which don't require extra setup
00955   // instructions in the preheader.
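  // For example, an existing Value (SCEVUnknown) or a constant needs no setup,
  // whereas a computed invariant expression such as (%a + %b) would require an
  // extra add in the preheader.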
00956   if (!isa<SCEVUnknown>(Reg) &&
00957       !isa<SCEVConstant>(Reg) &&
00958       !(isa<SCEVAddRecExpr>(Reg) &&
00959         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
00960          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
00961     ++SetupCost;
00962 
00963   NumIVMuls += isa<SCEVMulExpr>(Reg) &&
00964                SE.hasComputableLoopEvolution(Reg, L);
00965 }
00966 
00967 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
00968 /// before, rate it. Optional LoserRegs provides a way to declare any formula
00969 /// that refers to one of those regs an instant loser.
00970 void Cost::RatePrimaryRegister(const SCEV *Reg,
00971                                SmallPtrSetImpl<const SCEV *> &Regs,
00972                                const Loop *L,
00973                                ScalarEvolution &SE, DominatorTree &DT,
00974                                SmallPtrSetImpl<const SCEV *> *LoserRegs) {
00975   if (LoserRegs && LoserRegs->count(Reg)) {
00976     Lose();
00977     return;
00978   }
00979   if (Regs.insert(Reg).second) {
00980     RateRegister(Reg, Regs, L, SE, DT);
00981     if (LoserRegs && isLoser())
00982       LoserRegs->insert(Reg);
00983   }
00984 }
00985 
00986 void Cost::RateFormula(const TargetTransformInfo &TTI,
00987                        const Formula &F,
00988                        SmallPtrSetImpl<const SCEV *> &Regs,
00989                        const DenseSet<const SCEV *> &VisitedRegs,
00990                        const Loop *L,
00991                        const SmallVectorImpl<int64_t> &Offsets,
00992                        ScalarEvolution &SE, DominatorTree &DT,
00993                        const LSRUse &LU,
00994                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
00995   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
00996   // Tally up the registers.
00997   if (const SCEV *ScaledReg = F.ScaledReg) {
00998     if (VisitedRegs.count(ScaledReg)) {
00999       Lose();
01000       return;
01001     }
01002     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
01003     if (isLoser())
01004       return;
01005   }
01006   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
01007        E = F.BaseRegs.end(); I != E; ++I) {
01008     const SCEV *BaseReg = *I;
01009     if (VisitedRegs.count(BaseReg)) {
01010       Lose();
01011       return;
01012     }
01013     RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
01014     if (isLoser())
01015       return;
01016   }
01017 
01018   // Determine how many (unfolded) adds we'll need inside the loop.
01019   size_t NumBaseParts = F.getNumRegs();
01020   if (NumBaseParts > 1)
01021     // Do not count the base and a possible second register if the target
01022     // allows folding 2 registers into the addressing mode.
01023     NumBaseAdds +=
01024         NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
01025   NumBaseAdds += (F.UnfoldedOffset != 0);
01026 
01027   // Accumulate non-free scaling amounts.
01028   ScaleCost += getScalingFactorCost(TTI, LU, F);
01029 
01030   // Tally up the non-zero immediates.
01031   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
01032        E = Offsets.end(); I != E; ++I) {
01033     int64_t Offset = (uint64_t)*I + F.BaseOffset;
01034     if (F.BaseGV)
01035       ImmCost += 64; // Handle symbolic values conservatively.
01036                      // TODO: This should probably be the pointer size.
01037     else if (Offset != 0)
01038       ImmCost += APInt(64, Offset, true).getMinSignedBits();
01039   }
01040   assert(isValid() && "invalid cost");
01041 }
01042 
01043 /// Lose - Set this cost to a losing value.
01044 void Cost::Lose() {
01045   NumRegs = ~0u;
01046   AddRecCost = ~0u;
01047   NumIVMuls = ~0u;
01048   NumBaseAdds = ~0u;
01049   ImmCost = ~0u;
01050   SetupCost = ~0u;
01051   ScaleCost = ~0u;
01052 }
01053 
01054 /// operator< - Choose the lower cost.
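/// Costs compare lexicographically: NumRegs first, then AddRecCost, NumIVMuls,
/// NumBaseAdds, ScaleCost, ImmCost, and finally SetupCost, so fewer registers
/// always wins before the finer-grained metrics are consulted.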
01055 bool Cost::operator<(const Cost &Other) const {
01056   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
01057                   ImmCost, SetupCost) <
01058          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
01059                   Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
01060                   Other.SetupCost);
01061 }
01062 
01063 void Cost::print(raw_ostream &OS) const {
01064   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
01065   if (AddRecCost != 0)
01066     OS << ", with addrec cost " << AddRecCost;
01067   if (NumIVMuls != 0)
01068     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
01069   if (NumBaseAdds != 0)
01070     OS << ", plus " << NumBaseAdds << " base add"
01071        << (NumBaseAdds == 1 ? "" : "s");
01072   if (ScaleCost != 0)
01073     OS << ", plus " << ScaleCost << " scale cost";
01074   if (ImmCost != 0)
01075     OS << ", plus " << ImmCost << " imm cost";
01076   if (SetupCost != 0)
01077     OS << ", plus " << SetupCost << " setup cost";
01078 }
01079 
01080 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
01081 void Cost::dump() const {
01082   print(errs()); errs() << '\n';
01083 }
01084 #endif
01085 
01086 namespace {
01087 
01088 /// LSRFixup - An operand value in an instruction which is to be replaced
01089 /// with some equivalent, possibly strength-reduced, replacement.
01090 struct LSRFixup {
01091   /// UserInst - The instruction which will be updated.
01092   Instruction *UserInst;
01093 
01094   /// OperandValToReplace - The operand of the instruction which will
01095   /// be replaced. The operand may be used more than once; every instance
01096   /// will be replaced.
01097   Value *OperandValToReplace;
01098 
01099   /// PostIncLoops - If this user is to use the post-incremented value of an
01100   /// induction variable, this variable is non-null and holds the loop
01101   /// associated with the induction variable.
01102   PostIncLoopSet PostIncLoops;
01103 
01104   /// LUIdx - The index of the LSRUse describing the expression which
01105   /// this fixup needs, minus an offset (see Offset below).
01106   size_t LUIdx;
01107 
01108   /// Offset - A constant offset to be added to the LSRUse expression.
01109   /// This allows multiple fixups to share the same LSRUse with different
01110   /// offsets, for example in an unrolled loop.
01111   int64_t Offset;
01112 
01113   bool isUseFullyOutsideLoop(const Loop *L) const;
01114 
01115   LSRFixup();
01116 
01117   void print(raw_ostream &OS) const;
01118   void dump() const;
01119 };
01120 
01121 }
01122 
01123 LSRFixup::LSRFixup()
01124   : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
01125     Offset(0) {}
01126 
01127 /// isUseFullyOutsideLoop - Test whether this fixup always uses its
01128 /// value outside of the given loop.
01129 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
01130   // PHI nodes use their value in their incoming blocks.
01131   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
01132     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
01133       if (PN->getIncomingValue(i) == OperandValToReplace &&
01134           L->contains(PN->getIncomingBlock(i)))
01135         return false;
01136     return true;
01137   }
01138 
01139   return !L->contains(UserInst);
01140 }
01141 
01142 void LSRFixup::print(raw_ostream &OS) const {
01143   OS << "UserInst=";
01144   // Store is common and interesting enough to be worth special-casing.
01145   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
01146     OS << "store ";
01147     Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
01148   } else if (UserInst->getType()->isVoidTy())
01149     OS << UserInst->getOpcodeName();
01150   else
01151     UserInst->printAsOperand(OS, /*PrintType=*/false);
01152 
01153   OS << ", OperandValToReplace=";
01154   OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
01155 
01156   for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
01157        E = PostIncLoops.end(); I != E; ++I) {
01158     OS << ", PostIncLoop=";
01159     (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false);
01160   }
01161 
01162   if (LUIdx != ~size_t(0))
01163     OS << ", LUIdx=" << LUIdx;
01164 
01165   if (Offset != 0)
01166     OS << ", Offset=" << Offset;
01167 }
01168 
01169 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
01170 void LSRFixup::dump() const {
01171   print(errs()); errs() << '\n';
01172 }
01173 #endif
01174 
01175 namespace {
01176 
01177 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
01178 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
01179 struct UniquifierDenseMapInfo {
01180   static SmallVector<const SCEV *, 4> getEmptyKey() {
01181     SmallVector<const SCEV *, 4>  V;
01182     V.push_back(reinterpret_cast<const SCEV *>(-1));
01183     return V;
01184   }
01185 
01186   static SmallVector<const SCEV *, 4> getTombstoneKey() {
01187     SmallVector<const SCEV *, 4> V;
01188     V.push_back(reinterpret_cast<const SCEV *>(-2));
01189     return V;
01190   }
01191 
01192   static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
01193     return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
01194   }
01195 
01196   static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
01197                       const SmallVector<const SCEV *, 4> &RHS) {
01198     return LHS == RHS;
01199   }
01200 };
01201 
01202 /// LSRUse - This class holds the state that LSR keeps for each use in
01203 /// IVUsers, as well as uses invented by LSR itself. It includes information
01204 /// about what kinds of things can be folded into the user, information about
01205 /// the user itself, and information about how the use may be satisfied.
01206 /// TODO: Represent multiple users of the same expression in common?
01207 class LSRUse {
01208   DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
01209 
01210 public:
01211   /// KindType - An enum for a kind of use, indicating what types of
01212   /// scaled and immediate operands it might support.
01213   enum KindType {
01214     Basic,   ///< A normal use, with no folding.
01215     Special, ///< A special case of basic, allowing -1 scales.
01216     Address, ///< An address use; folding according to TargetLowering
01217     ICmpZero ///< An equality icmp with both operands folded into one.
01218     // TODO: Add a generic icmp too?
01219   };
01220 
01221   typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
01222 
01223   KindType Kind;
01224   Type *AccessTy;
01225 
01226   SmallVector<int64_t, 8> Offsets;
01227   int64_t MinOffset;
01228   int64_t MaxOffset;
01229 
01230   /// AllFixupsOutsideLoop - This records whether all of the fixups using this
01231   /// LSRUse are outside of the loop, in which case some special-case heuristics
01232   /// may be used.
01233   bool AllFixupsOutsideLoop;
01234 
01235   /// RigidFormula is set to true to guarantee that this use will be associated
01236   /// with a single formula--the one that initially matched. Some SCEV
01237   /// expressions cannot be expanded. This allows LSR to consider the registers
01238   /// used by those expressions without the need to expand them later after
01239   /// changing the formula.
01240   bool RigidFormula;
01241 
01242   /// WidestFixupType - This records the widest use type for any fixup using
01243   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
01244   /// max fixup widths to be equivalent, because the narrower one may be relying
01245   /// on the implicit truncation to truncate away bogus bits.
01246   Type *WidestFixupType;
01247 
01248   /// Formulae - A list of ways to build a value that can satisfy this user.
01249   /// After the list is populated, one of these is selected heuristically and
01250   /// used to formulate a replacement for OperandValToReplace in UserInst.
01251   SmallVector<Formula, 12> Formulae;
01252 
01253   /// Regs - The set of register candidates used by all formulae in this LSRUse.
01254   SmallPtrSet<const SCEV *, 4> Regs;
01255 
01256   LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
01257                                       MinOffset(INT64_MAX),
01258                                       MaxOffset(INT64_MIN),
01259                                       AllFixupsOutsideLoop(true),
01260                                       RigidFormula(false),
01261                                       WidestFixupType(nullptr) {}
01262 
01263   bool HasFormulaWithSameRegs(const Formula &F) const;
01264   bool InsertFormula(const Formula &F);
01265   void DeleteFormula(Formula &F);
01266   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
01267 
01268   void print(raw_ostream &OS) const;
01269   void dump() const;
01270 };
01271 
01272 }
01273 
01274 /// HasFormulaWithSameRegs - Test whether this use has a formula with the same
01275 /// registers as the given formula.
01276 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
01277   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
01278   if (F.ScaledReg) Key.push_back(F.ScaledReg);
01279   // Unstable sort by host order ok, because this is only used for uniquifying.
01280   std::sort(Key.begin(), Key.end());
01281   return Uniquifier.count(Key);
01282 }
01283 
01284 /// InsertFormula - If the given formula has not yet been inserted, add it to
01285 /// the list, and return true. Return false otherwise.
01286 /// The formula must be in canonical form.
01287 bool LSRUse::InsertFormula(const Formula &F) {
01288   assert(F.isCanonical() && "Invalid canonical representation");
01289 
01290   if (!Formulae.empty() && RigidFormula)
01291     return false;
01292 
01293   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
01294   if (F.ScaledReg) Key.push_back(F.ScaledReg);
01295   // Unstable sort by host order ok, because this is only used for uniquifying.
01296   std::sort(Key.begin(), Key.end());
01297 
01298   if (!Uniquifier.insert(Key).second)
01299     return false;
01300 
01301   // Using a register to hold the value of 0 is not profitable.
01302   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
01303          "Zero allocated in a scaled register!");
01304 #ifndef NDEBUG
01305   for (SmallVectorImpl<const SCEV *>::const_iterator I =
01306        F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
01307     assert(!(*I)->isZero() && "Zero allocated in a base register!");
01308 #endif
01309 
01310   // Add the formula to the list.
01311   Formulae.push_back(F);
01312 
01313   // Record registers now being used by this use.
01314   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
01315   if (F.ScaledReg)
01316     Regs.insert(F.ScaledReg);
01317 
01318   return true;
01319 }
01320 
01321 /// DeleteFormula - Remove the given formula from this use's list.
01322 void LSRUse::DeleteFormula(Formula &F) {
01323   if (&F != &Formulae.back())
01324     std::swap(F, Formulae.back());
01325   Formulae.pop_back();
01326 }
01327 
01328 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
01329 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
01330   // Now that we've filtered out some formulae, recompute the Regs set.
01331   SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
01332   Regs.clear();
01333   for (const Formula &F : Formulae) {
01334     if (F.ScaledReg) Regs.insert(F.ScaledReg);
01335     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
01336   }
01337 
01338   // Update the RegTracker.
01339   for (const SCEV *S : OldRegs)
01340     if (!Regs.count(S))
01341       RegUses.DropRegister(S, LUIdx);
01342 }
01343 
01344 void LSRUse::print(raw_ostream &OS) const {
01345   OS << "LSR Use: Kind=";
01346   switch (Kind) {
01347   case Basic:    OS << "Basic"; break;
01348   case Special:  OS << "Special"; break;
01349   case ICmpZero: OS << "ICmpZero"; break;
01350   case Address:
01351     OS << "Address of ";
01352     if (AccessTy->isPointerTy())
01353       OS << "pointer"; // the full pointer type could be really verbose
01354     else
01355       OS << *AccessTy;
01356   }
01357 
01358   OS << ", Offsets={";
01359   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
01360        E = Offsets.end(); I != E; ++I) {
01361     OS << *I;
01362     if (std::next(I) != E)
01363       OS << ',';
01364   }
01365   OS << '}';
01366 
01367   if (AllFixupsOutsideLoop)
01368     OS << ", all-fixups-outside-loop";
01369 
01370   if (WidestFixupType)
01371     OS << ", widest fixup type: " << *WidestFixupType;
01372 }
01373 
01374 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
01375 void LSRUse::dump() const {
01376   print(errs()); errs() << '\n';
01377 }
01378 #endif
01379 
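/// isAMCompletelyFolded - Return true if an addressing mode built from BaseGV,
/// BaseOffset, HasBaseReg, and Scale can be folded completely into a use of
/// the given Kind, i.e. it costs nothing beyond the instruction that consumes
/// it (for example, an x86 address such as [reg + 4*reg + imm]).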
01380 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01381                                  LSRUse::KindType Kind, Type *AccessTy,
01382                                  GlobalValue *BaseGV, int64_t BaseOffset,
01383                                  bool HasBaseReg, int64_t Scale) {
01384   switch (Kind) {
01385   case LSRUse::Address:
01386     return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
01387 
01391   case LSRUse::ICmpZero:
01392     // There's not even a target hook for querying whether it would be legal to
01393     // fold a GV into an ICmp.
01394     if (BaseGV)
01395       return false;
01396 
01397     // ICmp only has two operands; don't allow more than two non-trivial parts.
01398     if (Scale != 0 && HasBaseReg && BaseOffset != 0)
01399       return false;
01400 
01401     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
01402     // putting the scaled register in the other operand of the icmp.
01403     if (Scale != 0 && Scale != -1)
01404       return false;
01405 
01406     // If we have low-level target information, ask the target if it can fold an
01407     // integer immediate on an icmp.
01408     if (BaseOffset != 0) {
01409       // We have one of:
01410       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
01411       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
01412       // BaseOffset is the ICmp immediate.
01413       if (Scale == 0)
01414         // The cast does the right thing with INT64_MIN.
01415         BaseOffset = -(uint64_t)BaseOffset;
01416       return TTI.isLegalICmpImmediate(BaseOffset);
01417     }
01418 
01419     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
01420     return true;
01421 
01422   case LSRUse::Basic:
01423     // Only handle single-register values.
01424     return !BaseGV && Scale == 0 && BaseOffset == 0;
01425 
01426   case LSRUse::Special:
01427     // Special is like Basic, but also handles -1 scales.
01428     return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
01429   }
01430 
01431   llvm_unreachable("Invalid LSRUse Kind!");
01432 }
01433 
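/// isAMCompletelyFolded - Check folding legality across the whole range of
/// fixup offsets [MinOffset, MaxOffset] that this use must accommodate, by
/// testing both ends of the range added to BaseOffset. The additions are done
/// in unsigned arithmetic so that signed overflow can be detected and rejected.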
01434 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01435                                  int64_t MinOffset, int64_t MaxOffset,
01436                                  LSRUse::KindType Kind, Type *AccessTy,
01437                                  GlobalValue *BaseGV, int64_t BaseOffset,
01438                                  bool HasBaseReg, int64_t Scale) {
01439   // Check for overflow.
01440   if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
01441       (MinOffset > 0))
01442     return false;
01443   MinOffset = (uint64_t)BaseOffset + MinOffset;
01444   if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
01445       (MaxOffset > 0))
01446     return false;
01447   MaxOffset = (uint64_t)BaseOffset + MaxOffset;
01448 
01449   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
01450                               HasBaseReg, Scale) &&
01451          isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
01452                               HasBaseReg, Scale);
01453 }
01454 
01455 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01456                                  int64_t MinOffset, int64_t MaxOffset,
01457                                  LSRUse::KindType Kind, Type *AccessTy,
01458                                  const Formula &F) {
01459   // For the purpose of isAMCompletelyFolded, either having a canonical formula
01460   // or having a scale not equal to zero is correct.
01461   // Problems may arise from non-canonical formulae having a scale == 0.
01462   // Strictly speaking, it would be best to just rely on canonical formulae.
01463   // However, when we generate the scaled formulae, we first check that the
01464   // scaling factor is profitable before computing the actual ScaledReg, for
01465   // the sake of compile time.
01466   assert(F.isCanonical() || F.Scale != 0);
01467   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
01468                               F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
01469 }
01470 
01471 /// isLegalUse - Test whether we know how to expand the current formula.
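/// In addition to completely folded formulae, this accepts formulae with a
/// Scale of 1 whose scaled register can instead be treated as one more base
/// register, since a sum of base registers can always be expanded with adds.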
01472 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
01473                        int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
01474                        GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
01475                        int64_t Scale) {
01476   // We know how to expand completely foldable formulae.
01477   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
01478                               BaseOffset, HasBaseReg, Scale) ||
01479          // Or formulae that use a base register produced by a sum of base
01480          // registers.
01481          (Scale == 1 &&
01482           isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
01483                                BaseGV, BaseOffset, true, 0));
01484 }
01485 
01486 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
01487                        int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
01488                        const Formula &F) {
01489   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
01490                     F.BaseOffset, F.HasBaseReg, F.Scale);
01491 }
01492 
01493 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01494                                  const LSRUse &LU, const Formula &F) {
01495   return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
01496                               LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
01497                               F.Scale);
01498 }
01499 
01500 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
01501                                      const LSRUse &LU, const Formula &F) {
01502   if (!F.Scale)
01503     return 0;
01504 
01505   // If the use is not completely folded in that instruction, we will have to
01506   // pay an extra cost only for scale != 1.
01507   if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
01508                             LU.AccessTy, F))
01509     return F.Scale != 1;
01510 
01511   switch (LU.Kind) {
01512   case LSRUse::Address: {
01513     // Check the scaling factor cost with both the min and max offsets.
01514     int ScaleCostMinOffset =
01515       TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
01516                                F.BaseOffset + LU.MinOffset,
01517                                F.HasBaseReg, F.Scale);
01518     int ScaleCostMaxOffset =
01519       TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
01520                                F.BaseOffset + LU.MaxOffset,
01521                                F.HasBaseReg, F.Scale);
01522 
01523     assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
01524            "Legal addressing mode has an illegal cost!");
01525     return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
01526   }
01527   case LSRUse::ICmpZero:
01528   case LSRUse::Basic:
01529   case LSRUse::Special:
01530     // The use is completely folded, i.e., everything is folded into the
01531     // instruction.
01532     return 0;
01533   }
01534 
01535   llvm_unreachable("Invalid LSRUse Kind!");
01536 }
01537 
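/// isAlwaysFoldable - Return true if the given symbol and offset are foldable
/// for a use of the given kind no matter which registers end up in the
/// formula; conservatively, this assumes the worst case of a base register
/// plus a scaled register (or a -1 scale for ICmpZero uses).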
01538 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
01539                              LSRUse::KindType Kind, Type *AccessTy,
01540                              GlobalValue *BaseGV, int64_t BaseOffset,
01541                              bool HasBaseReg) {
01542   // Fast-path: zero is always foldable.
01543   if (BaseOffset == 0 && !BaseGV) return true;
01544 
01545   // Conservatively, create an address with an immediate and a
01546   // base and a scale.
01547   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
01548 
01549   // Canonicalize a scale of 1 to a base register if the formula doesn't
01550   // already have a base register.
01551   if (!HasBaseReg && Scale == 1) {
01552     Scale = 0;
01553     HasBaseReg = true;
01554   }
01555 
01556   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
01557                               HasBaseReg, Scale);
01558 }
01559 
01560 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
01561                              ScalarEvolution &SE, int64_t MinOffset,
01562                              int64_t MaxOffset, LSRUse::KindType Kind,
01563                              Type *AccessTy, const SCEV *S, bool HasBaseReg) {
01564   // Fast-path: zero is always foldable.
01565   if (S->isZero()) return true;
01566 
01567   // Conservatively, create an address with an immediate and a
01568   // base and a scale.
01569   int64_t BaseOffset = ExtractImmediate(S, SE);
01570   GlobalValue *BaseGV = ExtractSymbol(S, SE);
01571 
01572   // If there's anything else involved, it's not foldable.
01573   if (!S->isZero()) return false;
01574 
01575   // Fast-path: zero is always foldable.
01576   if (BaseOffset == 0 && !BaseGV) return true;
01577 
01578   // Conservatively, create an address with an immediate and a
01579   // base and a scale.
01580   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
01581 
01582   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
01583                               BaseOffset, HasBaseReg, Scale);
01584 }
01585 
01586 namespace {
01587 
01588 /// IVInc - An individual increment in a Chain of IV increments.
01589 /// Relate an IV user to an expression that computes the IV it uses from the IV
01590 /// used by the previous link in the Chain.
01591 ///
01592 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
01593 /// original IVOperand. The head of the chain's IVOperand is only valid during
01594 /// chain collection, before LSR replaces IV users. During chain generation,
01595 /// IncExpr can be used to find the new IVOperand that computes the same
01596 /// expression.
01597 struct IVInc {
01598   Instruction *UserInst;
01599   Value *IVOperand;
01600   const SCEV *IncExpr;
01601 
01602   IVInc(Instruction *U, Value *O, const SCEV *E):
01603     UserInst(U), IVOperand(O), IncExpr(E) {}
01604 };
01605 
01606 // IVChain - The list of IV increments in program order.
01607 // We typically add the head of a chain without finding subsequent links.
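//
// For illustration (hypothetical source), a loop body such as
//
//   ... = p[i];
//   ... = p[i + 1];
//   ... = p[i + 2];
//
// may form a single chain whose head is the use of the address of p[i], with
// two further links whose IncExpr is a constant one-element offset from the
// previous link.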
01608 struct IVChain {
01609   SmallVector<IVInc,1> Incs;
01610   const SCEV *ExprBase;
01611 
01612   IVChain() : ExprBase(nullptr) {}
01613 
01614   IVChain(const IVInc &Head, const SCEV *Base)
01615     : Incs(1, Head), ExprBase(Base) {}
01616 
01617   typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
01618 
01619   // begin - return the first increment in the chain, skipping the head.
01620   const_iterator begin() const {
01621     assert(!Incs.empty());
01622     return std::next(Incs.begin());
01623   }
01624   const_iterator end() const {
01625     return Incs.end();
01626   }
01627 
01628   // hasIncs - Returns true if this chain contains any increments.
01629   bool hasIncs() const { return Incs.size() >= 2; }
01630 
01631   // add - Add an IVInc to the end of this chain.
01632   void add(const IVInc &X) { Incs.push_back(X); }
01633 
01634   // tailUserInst - Returns the last UserInst in the chain.
01635   Instruction *tailUserInst() const { return Incs.back().UserInst; }
01636 
01637   // isProfitableIncrement - Returns true if IncExpr can be profitably added to
01638   // this chain.
01639   bool isProfitableIncrement(const SCEV *OperExpr,
01640                              const SCEV *IncExpr,
01641                              ScalarEvolution&);
01642 };
01643 
01644 /// ChainUsers - Helper for CollectChains to track multiple IV increment uses.
01645 /// Distinguish between FarUsers that definitely cross IV increments and
01646 /// NearUsers that may be used between IV increments.
01647 struct ChainUsers {
01648   SmallPtrSet<Instruction*, 4> FarUsers;
01649   SmallPtrSet<Instruction*, 4> NearUsers;
01650 };
01651 
01652 /// LSRInstance - This class holds state for the main loop strength reduction
01653 /// logic.
01654 class LSRInstance {
01655   IVUsers &IU;
01656   ScalarEvolution &SE;
01657   DominatorTree &DT;
01658   LoopInfo &LI;
01659   const TargetTransformInfo &TTI;
01660   Loop *const L;
01661   bool Changed;
01662 
01663   /// IVIncInsertPos - This is the insert position at which the current loop's
01664   /// induction variable increment should be placed. In simple loops, this is
01665   /// the latch block's terminator. But in more complicated cases, this is a
01666   /// position which will dominate all the in-loop post-increment users.
01667   Instruction *IVIncInsertPos;
01668 
01669   /// Factors - Interesting factors between use strides.
01670   SmallSetVector<int64_t, 8> Factors;
01671 
01672   /// Types - Interesting use types, to facilitate truncation reuse.
01673   SmallSetVector<Type *, 4> Types;
01674 
01675   /// Fixups - The list of operands which are to be replaced.
01676   SmallVector<LSRFixup, 16> Fixups;
01677 
01678   /// Uses - The list of interesting uses.
01679   SmallVector<LSRUse, 16> Uses;
01680 
01681   /// RegUses - Track which uses use which register candidates.
01682   RegUseTracker RegUses;
01683 
01684   // Limit the number of chains to avoid quadratic behavior. We don't expect to
01685   // have more than a few IV increment chains in a loop. Missing a Chain falls
01686   // back to normal LSR behavior for those uses.
01687   static const unsigned MaxChains = 8;
01688 
01689   /// IVChainVec - IV users can form a chain of IV increments.
01690   SmallVector<IVChain, MaxChains> IVChainVec;
01691 
01692   /// IVIncSet - IV users that belong to profitable IVChains.
01693   SmallPtrSet<Use*, MaxChains> IVIncSet;
01694 
01695   void OptimizeShadowIV();
01696   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
01697   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
01698   void OptimizeLoopTermCond();
01699 
01700   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
01701                         SmallVectorImpl<ChainUsers> &ChainUsersVec);
01702   void FinalizeChain(IVChain &Chain);
01703   void CollectChains();
01704   void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
01705                        SmallVectorImpl<WeakVH> &DeadInsts);
01706 
01707   void CollectInterestingTypesAndFactors();
01708   void CollectFixupsAndInitialFormulae();
01709 
01710   LSRFixup &getNewFixup() {
01711     Fixups.push_back(LSRFixup());
01712     return Fixups.back();
01713   }
01714 
01715   // Support for sharing of LSRUses between LSRFixups.
01716   typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
01717   UseMapTy UseMap;
01718 
01719   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
01720                           LSRUse::KindType Kind, Type *AccessTy);
01721 
01722   std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
01723                                     LSRUse::KindType Kind,
01724                                     Type *AccessTy);
01725 
01726   void DeleteUse(LSRUse &LU, size_t LUIdx);
01727 
01728   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
01729 
01730   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
01731   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
01732   void CountRegisters(const Formula &F, size_t LUIdx);
01733   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
01734 
01735   void CollectLoopInvariantFixupsAndFormulae();
01736 
01737   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
01738                               unsigned Depth = 0);
01739 
01740   void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
01741                                   const Formula &Base, unsigned Depth,
01742                                   size_t Idx, bool IsScaledReg = false);
01743   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
01744   void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
01745                                    const Formula &Base, size_t Idx,
01746                                    bool IsScaledReg = false);
01747   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
01748   void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
01749                                    const Formula &Base,
01750                                    const SmallVectorImpl<int64_t> &Worklist,
01751                                    size_t Idx, bool IsScaledReg = false);
01752   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
01753   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
01754   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
01755   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
01756   void GenerateCrossUseConstantOffsets();
01757   void GenerateAllReuseFormulae();
01758 
01759   void FilterOutUndesirableDedicatedRegisters();
01760 
01761   size_t EstimateSearchSpaceComplexity() const;
01762   void NarrowSearchSpaceByDetectingSupersets();
01763   void NarrowSearchSpaceByCollapsingUnrolledCode();
01764   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
01765   void NarrowSearchSpaceByPickingWinnerRegs();
01766   void NarrowSearchSpaceUsingHeuristics();
01767 
01768   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
01769                     Cost &SolutionCost,
01770                     SmallVectorImpl<const Formula *> &Workspace,
01771                     const Cost &CurCost,
01772                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
01773                     DenseSet<const SCEV *> &VisitedRegs) const;
01774   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
01775 
01776   BasicBlock::iterator
01777     HoistInsertPosition(BasicBlock::iterator IP,
01778                         const SmallVectorImpl<Instruction *> &Inputs) const;
01779   BasicBlock::iterator
01780     AdjustInsertPositionForExpand(BasicBlock::iterator IP,
01781                                   const LSRFixup &LF,
01782                                   const LSRUse &LU,
01783                                   SCEVExpander &Rewriter) const;
01784 
01785   Value *Expand(const LSRFixup &LF,
01786                 const Formula &F,
01787                 BasicBlock::iterator IP,
01788                 SCEVExpander &Rewriter,
01789                 SmallVectorImpl<WeakVH> &DeadInsts) const;
01790   void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
01791                      const Formula &F,
01792                      SCEVExpander &Rewriter,
01793                      SmallVectorImpl<WeakVH> &DeadInsts,
01794                      Pass *P) const;
01795   void Rewrite(const LSRFixup &LF,
01796                const Formula &F,
01797                SCEVExpander &Rewriter,
01798                SmallVectorImpl<WeakVH> &DeadInsts,
01799                Pass *P) const;
01800   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
01801                          Pass *P);
01802 
01803 public:
01804   LSRInstance(Loop *L, Pass *P);
01805 
01806   bool getChanged() const { return Changed; }
01807 
01808   void print_factors_and_types(raw_ostream &OS) const;
01809   void print_fixups(raw_ostream &OS) const;
01810   void print_uses(raw_ostream &OS) const;
01811   void print(raw_ostream &OS) const;
01812   void dump() const;
01813 };
01814 
01815 }
01816 
01817 /// OptimizeShadowIV - If IV is used in an int-to-float cast
01818 /// inside the loop then try to eliminate the cast operation.
01819 void LSRInstance::OptimizeShadowIV() {
01820   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
01821   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
01822     return;
01823 
01824   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
01825        UI != E; /* empty */) {
01826     IVUsers::const_iterator CandidateUI = UI;
01827     ++UI;
01828     Instruction *ShadowUse = CandidateUI->getUser();
01829     Type *DestTy = nullptr;
01830     bool IsSigned = false;
01831 
01832     /* If shadow use is an int->float cast then insert a second IV
01833        to eliminate this cast.
01834 
01835          for (unsigned i = 0; i < n; ++i)
01836            foo((double)i);
01837 
01838        is transformed into
01839 
01840          double d = 0.0;
01841          for (unsigned i = 0; i < n; ++i, ++d)
01842            foo(d);
01843     */
01844     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
01845       IsSigned = false;
01846       DestTy = UCast->getDestTy();
01847     }
01848     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
01849       IsSigned = true;
01850       DestTy = SCast->getDestTy();
01851     }
01852     if (!DestTy) continue;
01853 
01854     // If target does not support DestTy natively then do not apply
01855     // this transformation.
01856     if (!TTI.isTypeLegal(DestTy)) continue;
01857 
01858     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
01859     if (!PH) continue;
01860     if (PH->getNumIncomingValues() != 2) continue;
01861 
01862     Type *SrcTy = PH->getType();
01863     int Mantissa = DestTy->getFPMantissaWidth();
01864     if (Mantissa == -1) continue;
01865     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
01866       continue;
01867 
01868     unsigned Entry, Latch;
01869     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
01870       Entry = 0;
01871       Latch = 1;
01872     } else {
01873       Entry = 1;
01874       Latch = 0;
01875     }
01876 
01877     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
01878     if (!Init) continue;
01879     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
01880                                         (double)Init->getSExtValue() :
01881                                         (double)Init->getZExtValue());
01882 
01883     BinaryOperator *Incr =
01884       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
01885     if (!Incr) continue;
01886     if (Incr->getOpcode() != Instruction::Add
01887         && Incr->getOpcode() != Instruction::Sub)
01888       continue;
01889 
01890     /* Initialize new IV, double d = 0.0 in above example. */
01891     ConstantInt *C = nullptr;
01892     if (Incr->getOperand(0) == PH)
01893       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
01894     else if (Incr->getOperand(1) == PH)
01895       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
01896     else
01897       continue;
01898 
01899     if (!C) continue;
01900 
01901     // Ignore negative constants, as the code below doesn't handle them
01902     // correctly. TODO: Remove this restriction.
01903     if (!C->getValue().isStrictlyPositive()) continue;
01904 
01905     /* Add new PHINode. */
01906     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
01907 
01908     /* create new increment. '++d' in above example. */
01909     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
01910     BinaryOperator *NewIncr =
01911       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
01912                                Instruction::FAdd : Instruction::FSub,
01913                              NewPH, CFP, "IV.S.next.", Incr);
01914 
01915     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
01916     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
01917 
01918     /* Remove cast operation */
01919     ShadowUse->replaceAllUsesWith(NewPH);
01920     ShadowUse->eraseFromParent();
01921     Changed = true;
01922     break;
01923   }
01924 }
01925 
01926 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
01927 /// set the IV user and stride information and return true, otherwise return
01928 /// false.
01929 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
01930   for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
01931     if (UI->getUser() == Cond) {
01932       // NOTE: we could handle setcc instructions with multiple uses here, but
01933       // InstCombine does this as well for simple uses, and it's not clear that
01934       // it occurs often enough in real life to be worth handling.
01935       CondUse = UI;
01936       return true;
01937     }
01938   return false;
01939 }
01940 
01941 /// OptimizeMax - Rewrite the loop's terminating condition if it uses
01942 /// a max computation.
01943 ///
01944 /// This is a narrow solution to a specific, but acute, problem. For loops
01945 /// like this:
01946 ///
01947 ///   i = 0;
01948 ///   do {
01949 ///     p[i] = 0.0;
01950 ///   } while (++i < n);
01951 ///
01952 /// the trip count isn't just 'n', because 'n' might not be positive. And
01953 /// unfortunately this can come up even for loops where the user didn't use
01954 /// a C do-while loop. For example, seemingly well-behaved top-test loops
01955 /// will commonly be lowered like this:
01956 ///
01957 ///   if (n > 0) {
01958 ///     i = 0;
01959 ///     do {
01960 ///       p[i] = 0.0;
01961 ///     } while (++i < n);
01962 ///   }
01963 ///
01964 /// and then it's possible for subsequent optimization to obscure the if
01965 /// test in such a way that indvars can't find it.
01966 ///
01967 /// When indvars can't find the if test in loops like this, it creates a
01968 /// max expression, which allows it to give the loop a canonical
01969 /// induction variable:
01970 ///
01971 ///   i = 0;
01972 ///   max = n < 1 ? 1 : n;
01973 ///   do {
01974 ///     p[i] = 0.0;
01975 ///   } while (++i != max);
01976 ///
01977 /// Canonical induction variables are necessary because the loop passes
01978 /// are designed around them. The most obvious example of this is the
01979 /// LoopInfo analysis, which doesn't remember trip count values. It
01980 /// expects to be able to rediscover the trip count each time it is
01981 /// needed, and it does this using a simple analysis that only succeeds if
01982 /// the loop has a canonical induction variable.
01983 ///
01984 /// However, when it comes time to generate code, the maximum operation
01985 /// can be quite costly, especially if it's inside of an outer loop.
01986 ///
01987 /// This function solves this problem by detecting loops of this form and
01988 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and by deleting
01989 /// the instructions for the maximum computation.
01990 ///
01991 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
01992   // Check that the loop matches the pattern we're looking for.
01993   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
01994       Cond->getPredicate() != CmpInst::ICMP_NE)
01995     return Cond;
01996 
01997   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
01998   if (!Sel || !Sel->hasOneUse()) return Cond;
01999 
02000   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
02001   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
02002     return Cond;
02003   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
02004 
02005   // Add one to the backedge-taken count to get the trip count.
02006   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
02007   if (IterationCount != SE.getSCEV(Sel)) return Cond;
02008 
02009   // Check for a max calculation that matches the pattern. There's no check
02010   // for ICMP_ULE here because the comparison would be with zero, which
02011   // isn't interesting.
02012   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
02013   const SCEVNAryExpr *Max = nullptr;
02014   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
02015     Pred = ICmpInst::ICMP_SLE;
02016     Max = S;
02017   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
02018     Pred = ICmpInst::ICMP_SLT;
02019     Max = S;
02020   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
02021     Pred = ICmpInst::ICMP_ULT;
02022     Max = U;
02023   } else {
02024     // No match; bail.
02025     return Cond;
02026   }
02027 
02028   // To handle a max with more than two operands, this optimization would
02029   // require additional checking and setup.
02030   if (Max->getNumOperands() != 2)
02031     return Cond;
02032 
02033   const SCEV *MaxLHS = Max->getOperand(0);
02034   const SCEV *MaxRHS = Max->getOperand(1);
02035 
02036   // ScalarEvolution canonicalizes constants to the left. For < and >, look
02037   // for a comparison with 1. For <= and >=, a comparison with zero.
02038   if (!MaxLHS ||
02039       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
02040     return Cond;
02041 
02042   // Check the relevant induction variable for conformance to
02043   // the pattern.
02044   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
02045   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
02046   if (!AR || !AR->isAffine() ||
02047       AR->getStart() != One ||
02048       AR->getStepRecurrence(SE) != One)
02049     return Cond;
02050 
02051   assert(AR->getLoop() == L &&
02052          "Loop condition operand is an addrec in a different loop!");
02053 
02054   // Check the right operand of the select, and remember it, as it will
02055   // be used in the new comparison instruction.
02056   Value *NewRHS = nullptr;
02057   if (ICmpInst::isTrueWhenEqual(Pred)) {
02058     // Look for n+1, and grab n.
02059     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
02060       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
02061          if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
02062            NewRHS = BO->getOperand(0);
02063     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
02064       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
02065         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
02066           NewRHS = BO->getOperand(0);
02067     if (!NewRHS)
02068       return Cond;
02069   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
02070     NewRHS = Sel->getOperand(1);
02071   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
02072     NewRHS = Sel->getOperand(2);
02073   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
02074     NewRHS = SU->getValue();
02075   else
02076     // Max doesn't match expected pattern.
02077     return Cond;
02078 
02079   // Determine the new comparison opcode. It may be signed or unsigned,
02080   // and the original comparison may be either equality or inequality.
02081   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
02082     Pred = CmpInst::getInversePredicate(Pred);
02083 
02084   // Ok, everything looks ok to change the condition into an SLT or SGE and
02085   // delete the max calculation.
02086   ICmpInst *NewCond =
02087     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
02088 
02089   // Delete the max calculation instructions.
02090   Cond->replaceAllUsesWith(NewCond);
02091   CondUse->setUser(NewCond);
02092   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
02093   Cond->eraseFromParent();
02094   Sel->eraseFromParent();
02095   if (Cmp->use_empty())
02096     Cmp->eraseFromParent();
02097   return NewCond;
02098 }
02099 
02100 /// OptimizeLoopTermCond - Change the loop terminating condition to use the
02101 /// post-inc IV when possible.
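///
/// For illustration: in a loop with
///
///   %i.next = add %i, 1
///   %c = icmp ne %i, %n
///
/// the exit test is an extra pre-increment use that keeps %i live past the
/// increment; rewriting the test in terms of %i.next (with a correspondingly
/// adjusted bound) lets the pre- and post-increment values share one register.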
02102 void
02103 LSRInstance::OptimizeLoopTermCond() {
02104   SmallPtrSet<Instruction *, 4> PostIncs;
02105 
02106   BasicBlock *LatchBlock = L->getLoopLatch();
02107   SmallVector<BasicBlock*, 8> ExitingBlocks;
02108   L->getExitingBlocks(ExitingBlocks);
02109 
02110   for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
02111     BasicBlock *ExitingBlock = ExitingBlocks[i];
02112 
02113     // Get the terminating condition for the loop if possible.  If we
02114     // can, we want to change it to use a post-incremented version of its
02115     // induction variable, to allow coalescing the live ranges for the IV into
02116     // one register value.
02117 
02118     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
02119     if (!TermBr)
02120       continue;
02121     // FIXME: Overly conservative, termination condition could be an 'or' etc.
02122     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
02123       continue;
02124 
02125     // Search IVUsesByStride to find Cond's IVUse if there is one.
02126     IVStrideUse *CondUse = nullptr;
02127     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
02128     if (!FindIVUserForCond(Cond, CondUse))
02129       continue;
02130 
02131     // If the trip count is computed in terms of a max (due to ScalarEvolution
02132     // being unable to find a sufficient guard, for example), change the loop
02133     // comparison to use SLT or ULT instead of NE.
02134     // One consequence of doing this now is that it disrupts the count-down
02135     // optimization. That's not always a bad thing though, because in such
02136     // cases it may still be worthwhile to avoid a max.
02137     Cond = OptimizeMax(Cond, CondUse);
02138 
02139     // If this exiting block dominates the latch block, it may also use
02140     // the post-inc value if it won't be shared with other uses.
02141     // Check for dominance.
02142     if (!DT.dominates(ExitingBlock, LatchBlock))
02143       continue;
02144 
02145     // Conservatively avoid trying to use the post-inc value in non-latch
02146     // exits if there may be pre-inc users in intervening blocks.
02147     if (LatchBlock != ExitingBlock)
02148       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
02149         // Test if the use is reachable from the exiting block. This dominator
02150         // query is a conservative approximation of reachability.
02151         if (&*UI != CondUse &&
02152             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
02153           // Conservatively assume there may be reuse if the quotient of their
02154           // strides could be a legal scale.
02155           const SCEV *A = IU.getStride(*CondUse, L);
02156           const SCEV *B = IU.getStride(*UI, L);
02157           if (!A || !B) continue;
02158           if (SE.getTypeSizeInBits(A->getType()) !=
02159               SE.getTypeSizeInBits(B->getType())) {
02160             if (SE.getTypeSizeInBits(A->getType()) >
02161                 SE.getTypeSizeInBits(B->getType()))
02162               B = SE.getSignExtendExpr(B, A->getType());
02163             else
02164               A = SE.getSignExtendExpr(A, B->getType());
02165           }
02166           if (const SCEVConstant *D =
02167                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
02168             const ConstantInt *C = D->getValue();
02169             // Stride of one or negative one can have reuse with non-addresses.
02170             if (C->isOne() || C->isAllOnesValue())
02171               goto decline_post_inc;
02172             // Avoid weird situations.
02173             if (C->getValue().getMinSignedBits() >= 64 ||
02174                 C->getValue().isMinSignedValue())
02175               goto decline_post_inc;
02176             // Check for possible scaled-address reuse.
02177             Type *AccessTy = getAccessType(UI->getUser());
02178             int64_t Scale = C->getSExtValue();
02179             if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr,
02180                                           /*BaseOffset=*/ 0,
02181                                           /*HasBaseReg=*/ false, Scale))
02182               goto decline_post_inc;
02183             Scale = -Scale;
02184             if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr,
02185                                           /*BaseOffset=*/ 0,
02186                                           /*HasBaseReg=*/ false, Scale))
02187               goto decline_post_inc;
02188           }
02189         }
02190 
02191     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
02192                  << *Cond << '\n');
02193 
02194     // It's possible for the setcc instruction to be anywhere in the loop, and
02195     // possible for it to have multiple users.  If it is not immediately before
02196     // the exiting block branch, move it.
02197     if (&*++BasicBlock::iterator(Cond) != TermBr) {
02198       if (Cond->hasOneUse()) {
02199         Cond->moveBefore(TermBr);
02200       } else {
02201         // Clone the terminating condition and insert it just before the branch.
02202         ICmpInst *OldCond = Cond;
02203         Cond = cast<ICmpInst>(Cond->clone());
02204         Cond->setName(L->getHeader()->getName() + ".termcond");
02205         ExitingBlock->getInstList().insert(TermBr, Cond);
02206 
02207         // Clone the IVUse, as the old use still exists!
02208         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
02209         TermBr->replaceUsesOfWith(OldCond, Cond);
02210       }
02211     }
02212 
02213     // If we get to here, we know that we can transform the setcc instruction to
02214     // use the post-incremented version of the IV, allowing us to coalesce the
02215     // live ranges for the IV correctly.
02216     CondUse->transformToPostInc(L);
02217     Changed = true;
02218 
02219     PostIncs.insert(Cond);
02220   decline_post_inc:;
02221   }
02222 
02223   // Determine an insertion point for the loop induction variable increment. It
02224   // must dominate all the post-inc comparisons we just set up, and it must
02225   // dominate the loop latch edge.
02226   IVIncInsertPos = L->getLoopLatch()->getTerminator();
02227   for (Instruction *Inst : PostIncs) {
02228     BasicBlock *BB =
02229       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
02230                                     Inst->getParent());
02231     if (BB == Inst->getParent())
02232       IVIncInsertPos = Inst;
02233     else if (BB != IVIncInsertPos->getParent())
02234       IVIncInsertPos = BB->getTerminator();
02235   }
02236 }
02237 
02238 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
02239 /// at the given offset and other details. If so, update the use and
02240 /// return true.
02241 bool
02242 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
02243                                 LSRUse::KindType Kind, Type *AccessTy) {
02244   int64_t NewMinOffset = LU.MinOffset;
02245   int64_t NewMaxOffset = LU.MaxOffset;
02246   Type *NewAccessTy = AccessTy;
02247 
02248   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
02249   // something conservative; however, this can pessimize when, for example, one
02250   // of the uses has all of its fixups outside the loop.
02251   if (LU.Kind != Kind)
02252     return false;
02253 
02254   // Check for a mismatched access type, and fall back conservatively as needed.
02255   // TODO: Be less conservative when the type is similar and can use the same
02256   // addressing modes.
02257   if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
02258     NewAccessTy = Type::getVoidTy(AccessTy->getContext());
02259 
02260   // Conservatively assume HasBaseReg is true for now.
02261   if (NewOffset < LU.MinOffset) {
02262     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
02263                           LU.MaxOffset - NewOffset, HasBaseReg))
02264       return false;
02265     NewMinOffset = NewOffset;
02266   } else if (NewOffset > LU.MaxOffset) {
02267     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
02268                           NewOffset - LU.MinOffset, HasBaseReg))
02269       return false;
02270     NewMaxOffset = NewOffset;
02271   }
02272 
02273   // Update the use.
02274   LU.MinOffset = NewMinOffset;
02275   LU.MaxOffset = NewMaxOffset;
02276   LU.AccessTy = NewAccessTy;
02277   if (NewOffset != LU.Offsets.back())
02278     LU.Offsets.push_back(NewOffset);
02279   return true;
02280 }
02281 
02282 /// getUse - Return an LSRUse index and an offset value for a fixup which
02283 /// needs the given expression, with the given kind and optional access type.
02284 /// Either reuse an existing use or create a new one, as needed.
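/// For example, fixups needing {%base,+,4} and {(8 + %base),+,4} may share a
/// single LSRUse keyed on the stripped expression {%base,+,4}, returning
/// offsets of 0 and 8 respectively, provided an offset of 8 is foldable for
/// this kind of use; otherwise the immediate is left in the expression.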
02285 std::pair<size_t, int64_t>
02286 LSRInstance::getUse(const SCEV *&Expr,
02287                     LSRUse::KindType Kind, Type *AccessTy) {
02288   const SCEV *Copy = Expr;
02289   int64_t Offset = ExtractImmediate(Expr, SE);
02290 
02291   // Basic uses can't accept any offset, for example.
02292   if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
02293                         Offset, /*HasBaseReg=*/ true)) {
02294     Expr = Copy;
02295     Offset = 0;
02296   }
02297 
02298   std::pair<UseMapTy::iterator, bool> P =
02299     UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
02300   if (!P.second) {
02301     // A use already existed with this base.
02302     size_t LUIdx = P.first->second;
02303     LSRUse &LU = Uses[LUIdx];
02304     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
02305       // Reuse this use.
02306       return std::make_pair(LUIdx, Offset);
02307   }
02308 
02309   // Create a new use.
02310   size_t LUIdx = Uses.size();
02311   P.first->second = LUIdx;
02312   Uses.push_back(LSRUse(Kind, AccessTy));
02313   LSRUse &LU = Uses[LUIdx];
02314 
02315   // We don't need to track redundant offsets, but we don't need to go out
02316   // of our way here to avoid them.
02317   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
02318     LU.Offsets.push_back(Offset);
02319 
02320   LU.MinOffset = Offset;
02321   LU.MaxOffset = Offset;
02322   return std::make_pair(LUIdx, Offset);
02323 }
02324 
02325 /// DeleteUse - Delete the given use from the Uses list.
02326 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
02327   if (&LU != &Uses.back())
02328     std::swap(LU, Uses.back());
02329   Uses.pop_back();
02330 
02331   // Update RegUses.
02332   RegUses.SwapAndDropUse(LUIdx, Uses.size());
02333 }
02334 
02335 /// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
02336 /// a formula with the same registers as the given formula.
02337 LSRUse *
02338 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
02339                                        const LSRUse &OrigLU) {
02340   // Search all uses for the formula. This could be more clever.
02341   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
02342     LSRUse &LU = Uses[LUIdx];
02343     // Check whether this use is close enough to OrigLU, to see whether it's
02344     // worthwhile looking through its formulae.
02345     // Ignore ICmpZero uses because they may contain formulae generated by
02346     // GenerateICmpZeroScales, in which case adding fixup offsets may
02347     // be invalid.
02348     if (&LU != &OrigLU &&
02349         LU.Kind != LSRUse::ICmpZero &&
02350         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
02351         LU.WidestFixupType == OrigLU.WidestFixupType &&
02352         LU.HasFormulaWithSameRegs(OrigF)) {
02353       // Scan through this use's formulae.
02354       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
02355            E = LU.Formulae.end(); I != E; ++I) {
02356         const Formula &F = *I;
02357         // Check to see if this formula has the same registers and symbols
02358         // as OrigF.
02359         if (F.BaseRegs == OrigF.BaseRegs &&
02360             F.ScaledReg == OrigF.ScaledReg &&
02361             F.BaseGV == OrigF.BaseGV &&
02362             F.Scale == OrigF.Scale &&
02363             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
02364           if (F.BaseOffset == 0)
02365             return &LU;
02366           // This is the formula where all the registers and symbols matched;
02367           // there aren't going to be any others. Since we declined it, we
02368           // can skip the rest of the formulae and proceed to the next LSRUse.
02369           break;
02370         }
02371       }
02372     }
02373   }
02374 
02375   // Nothing looked good.
02376   return nullptr;
02377 }
02378 
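/// CollectInterestingTypesAndFactors - Record the effective integer types of
/// all IV expressions (to enable truncation-based reuse) and the constant
/// factors relating pairs of strides in this loop; for example, strides of 4
/// and 8 would record a factor of 2 for later scale-based formula generation.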
02379 void LSRInstance::CollectInterestingTypesAndFactors() {
02380   SmallSetVector<const SCEV *, 4> Strides;
02381 
02382   // Collect interesting types and strides.
02383   SmallVector<const SCEV *, 4> Worklist;
02384   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
02385     const SCEV *Expr = IU.getExpr(*UI);
02386 
02387     // Collect interesting types.
02388     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
02389 
02390     // Add strides for mentioned loops.
02391     Worklist.push_back(Expr);
02392     do {
02393       const SCEV *S = Worklist.pop_back_val();
02394       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
02395         if (AR->getLoop() == L)
02396           Strides.insert(AR->getStepRecurrence(SE));
02397         Worklist.push_back(AR->getStart());
02398       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
02399         Worklist.append(Add->op_begin(), Add->op_end());
02400       }
02401     } while (!Worklist.empty());
02402   }
02403 
02404   // Compute interesting factors from the set of interesting strides.
02405   for (SmallSetVector<const SCEV *, 4>::const_iterator
02406        I = Strides.begin(), E = Strides.end(); I != E; ++I)
02407     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
02408          std::next(I); NewStrideIter != E; ++NewStrideIter) {
02409       const SCEV *OldStride = *I;
02410       const SCEV *NewStride = *NewStrideIter;
02411 
02412       if (SE.getTypeSizeInBits(OldStride->getType()) !=
02413           SE.getTypeSizeInBits(NewStride->getType())) {
02414         if (SE.getTypeSizeInBits(OldStride->getType()) >
02415             SE.getTypeSizeInBits(NewStride->getType()))
02416           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
02417         else
02418           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
02419       }
02420       if (const SCEVConstant *Factor =
02421             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
02422                                                         SE, true))) {
02423         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
02424           Factors.insert(Factor->getValue()->getValue().getSExtValue());
02425       } else if (const SCEVConstant *Factor =
02426                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
02427                                                                NewStride,
02428                                                                SE, true))) {
02429         if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
02430           Factors.insert(Factor->getValue()->getValue().getSExtValue());
02431       }
02432     }
02433 
02434   // If all uses use the same type, don't bother looking for truncation-based
02435   // reuse.
02436   if (Types.size() == 1)
02437     Types.clear();
02438 
02439   DEBUG(print_factors_and_types(dbgs()));
02440 }
02441 
02442 /// findIVOperand - Helper for CollectChains that finds an IV operand (computed
02443 /// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped
02444 /// Instructions to IVStrideUses, we could partially skip this.
02445 static User::op_iterator
02446 findIVOperand(User::op_iterator OI, User::op_iterator OE,
02447               Loop *L, ScalarEvolution &SE) {
02448   for (; OI != OE; ++OI) {
02449     if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
02450       if (!SE.isSCEVable(Oper->getType()))
02451         continue;
02452 
02453       if (const SCEVAddRecExpr *AR =
02454           dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
02455         if (AR->getLoop() == L)
02456           break;
02457       }
02458     }
02459   }
02460   return OI;
02461 }
02462 
02463 /// getWideOperand - IVChain logic must consistently peek base TruncInst
02464 /// operands, so wrap it in a convenient helper.
02465 static Value *getWideOperand(Value *Oper) {
02466   if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
02467     return Trunc->getOperand(0);
02468   return Oper;
02469 }
02470 
02471 /// isCompatibleIVType - Return true if we allow an IV chain to include both
02472 /// types.
02473 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
02474   Type *LType = LVal->getType();
02475   Type *RType = RVal->getType();
02476   return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
02477 }
02478 
02479 /// getExprBase - Return an approximation of this SCEV expression's "base", or
02480 /// NULL for any constant. Returning the expression itself is
02481 /// conservative. Returning a deeper subexpression is more precise and valid as
02482 /// long as it isn't less complex than another subexpression. For expressions
02483 /// involving multiple unscaled values, we need to return the pointer-type
02484 /// SCEVUnknown. This avoids forming chains across objects, such as:
02485 /// PrevOper==a[i], IVOper==b[i], IVInc==b-a.
02486 ///
02487 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
02488 /// SCEVUnknown, we simply return the rightmost SCEV operand.
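///
/// For example (illustrative), getExprBase of {(4 + %ptr),+,8} follows the
/// addrec's start into the add expression and returns the pointer-typed
/// operand %ptr, while getExprBase of a plain constant returns null.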
02489 static const SCEV *getExprBase(const SCEV *S) {
02490   switch (S->getSCEVType()) {
02491   default: // including scUnknown.
02492     return S;
02493   case scConstant:
02494     return nullptr;
02495   case scTruncate:
02496     return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
02497   case scZeroExtend:
02498     return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
02499   case scSignExtend:
02500     return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
02501   case scAddExpr: {
02502     // Skip over scaled operands (scMulExpr) to follow add operands as long as
02503     // there's nothing more complex.
02504     // FIXME: not sure if we want to recognize negation.
02505     const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
02506     for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
02507            E(Add->op_begin()); I != E; ++I) {
02508       const SCEV *SubExpr = *I;
02509       if (SubExpr->getSCEVType() == scAddExpr)
02510         return getExprBase(SubExpr);
02511 
02512       if (SubExpr->getSCEVType() != scMulExpr)
02513         return SubExpr;
02514     }
02515     return S; // all operands are scaled, be conservative.
02516   }
02517   case scAddRecExpr:
02518     return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
02519   }
02520 }
02521 
02522 /// Return true if the chain increment is profitable to expand into a loop
02523 /// invariant value, which may require its own register. A profitable chain
02524 /// increment will be an offset relative to the same base. We allow such offsets
02525 /// to potentially be used as chain increments as long as they aren't obviously
02526 /// expensive to expand using real instructions.
02527 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
02528                                     const SCEV *IncExpr,
02529                                     ScalarEvolution &SE) {
02530   // Aggressively form chains when -stress-ivchain.
02531   if (StressIVChain)
02532     return true;
02533 
02534   // Do not replace a constant offset from IV head with a nonconstant IV
02535   // increment.
02536   if (!isa<SCEVConstant>(IncExpr)) {
02537     const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
02538     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
02539       return false;
02540   }
02541 
02542   SmallPtrSet<const SCEV*, 8> Processed;
02543   return !isHighCostExpansion(IncExpr, Processed, SE);
02544 }
02545 
02546 /// Return true if the number of registers needed for the chain is estimated to
02547 /// be less than the number required for the individual IV users. First prohibit
02548 /// any IV users that keep the IV live across increments (the Users set should
02549 /// be empty). Next count the number and type of increments in the chain.
02550 ///
02551 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
02552 /// effectively use postinc addressing modes. Only consider it profitable if the
02553 /// increments can be computed in fewer registers when chained.
02554 ///
02555 /// TODO: Consider IVInc free if it's already used in another chain.
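///
/// For example (illustrative numbers only): the cost starts at 1 for the
/// chain's own base register. A chain that closes back on the loop-header phi
/// (so the phi computes the head expression) and whose increments are two or
/// more non-zero constants earns a credit for each, reaching -1 and being
/// kept, while a chain of constant increments that does not close back on the
/// phi only reaches 0 and is rejected.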
02556 static bool
02557 isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
02558                   ScalarEvolution &SE, const TargetTransformInfo &TTI) {
02559   if (StressIVChain)
02560     return true;
02561 
02562   if (!Chain.hasIncs())
02563     return false;
02564 
02565   if (!Users.empty()) {
02566     DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
02567           for (Instruction *Inst : Users) {
02568             dbgs() << "  " << *Inst << "\n";
02569           });
02570     return false;
02571   }
02572   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
02573 
02574   // The chain itself may require a register, so initialize cost to 1.
02575   int cost = 1;
02576 
02577   // A complete chain likely eliminates the need for keeping the original IV in
02578   // a register. LSR does not currently know how to form a complete chain unless
02579   // the header phi already exists.
02580   if (isa<PHINode>(Chain.tailUserInst())
02581       && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
02582     --cost;
02583   }
02584   const SCEV *LastIncExpr = nullptr;
02585   unsigned NumConstIncrements = 0;
02586   unsigned NumVarIncrements = 0;
02587   unsigned NumReusedIncrements = 0;
02588   for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
02589        I != E; ++I) {
02590 
02591     if (I->IncExpr->isZero())
02592       continue;
02593 
02594     // Incrementing by zero or some constant is neutral. We assume constants can
02595     // be folded into an addressing mode or an add's immediate operand.
02596     if (isa<SCEVConstant>(I->IncExpr)) {
02597       ++NumConstIncrements;
02598       continue;
02599     }
02600 
02601     if (I->IncExpr == LastIncExpr)
02602       ++NumReusedIncrements;
02603     else
02604       ++NumVarIncrements;
02605 
02606     LastIncExpr = I->IncExpr;
02607   }
02608   // An IV chain with a single increment is handled by LSR's postinc
02609   // uses. However, a chain with multiple increments requires keeping the IV's
02610   // value live longer than it needs to be if chained.
02611   if (NumConstIncrements > 1)
02612     --cost;
02613 
02614   // Materializing increment expressions in the preheader that didn't exist in
02615   // the original code may cost a register. For example, sign-extended array
02616   // indices can produce ridiculous increments like this:
02617   // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
02618   cost += NumVarIncrements;
02619 
02620   // Reusing variable increments likely saves a register to hold the multiple of
02621   // the stride.
02622   cost -= NumReusedIncrements;
02623 
02624   DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
02625                << "\n");
02626 
02627   return cost < 0;
02628 }
02629 
02630 /// ChainInstruction - Add this IV user to an existing chain or make it the head
02631 /// of a new chain.
02632 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
02633                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
02634   // When IVs are used as types of varying widths, they are generally converted
02635   // to a wider type with some uses remaining narrow under a (free) trunc.
02636   Value *const NextIV = getWideOperand(IVOper);
02637   const SCEV *const OperExpr = SE.getSCEV(NextIV);
02638   const SCEV *const OperExprBase = getExprBase(OperExpr);
02639 
02640   // Visit all existing chains. Check if this user's IVOper can be computed as
02641   // a profitable loop invariant increment from the last link in the Chain.
02642   unsigned ChainIdx = 0, NChains = IVChainVec.size();
02643   const SCEV *LastIncExpr = nullptr;
02644   for (; ChainIdx < NChains; ++ChainIdx) {
02645     IVChain &Chain = IVChainVec[ChainIdx];
02646 
02647     // Prune the solution space aggressively by checking that both IV operands
02648     // are expressions that operate on the same unscaled SCEVUnknown. This
02649     // "base" will be canceled by the subsequent getMinusSCEV call. Checking
02650     // first avoids creating extra SCEV expressions.
02651     if (!StressIVChain && Chain.ExprBase != OperExprBase)
02652       continue;
02653 
02654     Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
02655     if (!isCompatibleIVType(PrevIV, NextIV))
02656       continue;
02657 
02658     // A phi node terminates a chain.
02659     if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
02660       continue;
02661 
02662     // The increment must be loop-invariant so it can be kept in a register.
02663     const SCEV *PrevExpr = SE.getSCEV(PrevIV);
02664     const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
02665     if (!SE.isLoopInvariant(IncExpr, L))
02666       continue;
02667 
02668     if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
02669       LastIncExpr = IncExpr;
02670       break;
02671     }
02672   }
02673   // If we haven't found a chain, create a new one, unless we hit the max. Don't
02674   // bother for phi nodes, because they must be last in the chain.
02675   if (ChainIdx == NChains) {
02676     if (isa<PHINode>(UserInst))
02677       return;
02678     if (NChains >= MaxChains && !StressIVChain) {
02679       DEBUG(dbgs() << "IV Chain Limit\n");
02680       return;
02681     }
02682     LastIncExpr = OperExpr;
02683     // IVUsers may have skipped over sign/zero extensions. We don't currently
02684     // attempt to form chains involving extensions unless they can be hoisted
02685     // into this loop's AddRec.
02686     if (!isa<SCEVAddRecExpr>(LastIncExpr))
02687       return;
02688     ++NChains;
02689     IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
02690                                  OperExprBase));
02691     ChainUsersVec.resize(NChains);
02692     DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
02693                  << ") IV=" << *LastIncExpr << "\n");
02694   } else {
02695     DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
02696                  << ") IV+" << *LastIncExpr << "\n");
02697     // Add this IV user to the end of the chain.
02698     IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
02699   }
02700   IVChain &Chain = IVChainVec[ChainIdx];
02701 
02702   SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
02703   // This chain's NearUsers become FarUsers.
02704   if (!LastIncExpr->isZero()) {
02705     ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
02706                                             NearUsers.end());
02707     NearUsers.clear();
02708   }
02709 
02710   // All other uses of IVOperand become near uses of the chain.
02711   // We currently ignore intermediate values within SCEV expressions, assuming
02712   // they will eventually be used by the current chain, or can be computed
02713   // from one of the chain increments. To be more precise we could
02714   // transitively follow its users and only add leaf IV users to the set.
02715   for (User *U : IVOper->users()) {
02716     Instruction *OtherUse = dyn_cast<Instruction>(U);
02717     if (!OtherUse)
02718       continue;
02719     // Uses in the chain will no longer be uses if the chain is formed.
02720     // Include the head of the chain in this iteration (not Chain.begin()).
02721     IVChain::const_iterator IncIter = Chain.Incs.begin();
02722     IVChain::const_iterator IncEnd = Chain.Incs.end();
02723     for (; IncIter != IncEnd; ++IncIter) {
02724       if (IncIter->UserInst == OtherUse)
02725         break;
02726     }
02727     if (IncIter != IncEnd)
02728       continue;
02729 
02730     if (SE.isSCEVable(OtherUse->getType())
02731         && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
02732         && IU.isIVUserOrOperand(OtherUse)) {
02733       continue;
02734     }
02735     NearUsers.insert(OtherUse);
02736   }
02737 
02738   // Since this user is part of the chain, it's no longer considered a use
02739   // of the chain.
02740   ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
02741 }
02742 
02743 /// CollectChains - Populate the vector of Chains.
02744 ///
02745 /// This decreases ILP at the architecture level. Targets with ample registers,
02746 /// multiple memory ports, and no register renaming probably don't want
02747 /// this. However, such targets should probably disable LSR altogether.
02748 ///
02749 /// The job of LSR is to make a reasonable choice of induction variables across
02750 /// the loop. Subsequent passes can easily "unchain" computation exposing more
02751 /// ILP *within the loop* if the target wants it.
02752 ///
02753 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
02754 /// will not reorder memory operations, it will recognize this as a chain, but
02755 /// will generate redundant IV increments. Ideally this would be corrected later
02756 /// by a smart scheduler:
02757 ///        = A[i]
02758 ///        = A[i+x]
02759 /// A[i]   =
02760 /// A[i+x] =
02761 ///
02762 /// TODO: Walk the entire domtree within this loop, not just the path to the
02763 /// loop latch. This will discover chains on side paths, but requires
02764 /// maintaining multiple copies of the Chains state.
02765 void LSRInstance::CollectChains() {
02766   DEBUG(dbgs() << "Collecting IV Chains.\n");
02767   SmallVector<ChainUsers, 8> ChainUsersVec;
02768 
02769   SmallVector<BasicBlock *,8> LatchPath;
02770   BasicBlock *LoopHeader = L->getHeader();
02771   for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
02772        Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
02773     LatchPath.push_back(Rung->getBlock());
02774   }
02775   LatchPath.push_back(LoopHeader);
02776 
02777   // Walk the instruction stream from the loop header to the loop latch.
02778   for (SmallVectorImpl<BasicBlock *>::reverse_iterator
02779          BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
02780        BBIter != BBEnd; ++BBIter) {
02781     for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
02782          I != E; ++I) {
02783       // Skip instructions that weren't seen by IVUsers analysis.
02784       if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I))
02785         continue;
02786 
02787       // Ignore users that are part of a SCEV expression. This way we only
02788       // consider leaf IV Users. This effectively rediscovers a portion of
02789       // IVUsers analysis but in program order this time.
02790       if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I)))
02791         continue;
02792 
02793       // Remove this instruction from any NearUsers set it may be in.
02794       for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
02795            ChainIdx < NChains; ++ChainIdx) {
02796         ChainUsersVec[ChainIdx].NearUsers.erase(I);
02797       }
02798       // Search for operands that can be chained.
02799       SmallPtrSet<Instruction*, 4> UniqueOperands;
02800       User::op_iterator IVOpEnd = I->op_end();
02801       User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
02802       while (IVOpIter != IVOpEnd) {
02803         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
02804         if (UniqueOperands.insert(IVOpInst).second)
02805           ChainInstruction(I, IVOpInst, ChainUsersVec);
02806         IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
02807       }
02808     } // Continue walking down the instructions.
02809   } // Continue walking down the domtree.
02810   // Visit phi backedges to determine if the chain can generate the IV postinc.
02811   for (BasicBlock::iterator I = L->getHeader()->begin();
02812        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
02813     if (!SE.isSCEVable(PN->getType()))
02814       continue;
02815 
02816     Instruction *IncV =
02817       dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
02818     if (IncV)
02819       ChainInstruction(PN, IncV, ChainUsersVec);
02820   }
02821   // Remove any unprofitable chains.
02822   unsigned ChainIdx = 0;
02823   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
02824        UsersIdx < NChains; ++UsersIdx) {
02825     if (!isProfitableChain(IVChainVec[UsersIdx],
02826                            ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
02827       continue;
02828     // Preserve the chain at UsersIdx.
02829     if (ChainIdx != UsersIdx)
02830       IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
02831     FinalizeChain(IVChainVec[ChainIdx]);
02832     ++ChainIdx;
02833   }
02834   IVChainVec.resize(ChainIdx);
02835 }
02836 
02837 void LSRInstance::FinalizeChain(IVChain &Chain) {
02838   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
02839   DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
02840 
02841   for (IVChain::const_iterator I = Chain.begin(), E = Chain.end();
02842        I != E; ++I) {
02843     DEBUG(dbgs() << "        Inc: " << *I->UserInst << "\n");
02844     User::op_iterator UseI =
02845       std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand);
02846     assert(UseI != I->UserInst->op_end() && "cannot find IV operand");
02847     IVIncSet.insert(UseI);
02848   }
02849 }
02850 
02851 /// Return true if the IVInc can be folded into an addressing mode.
02852 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
02853                              Value *Operand, const TargetTransformInfo &TTI) {
02854   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
02855   if (!IncConst || !isAddressUse(UserInst, Operand))
02856     return false;
02857 
02858   if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
02859     return false;
02860 
02861   int64_t IncOffset = IncConst->getValue()->getSExtValue();
02862   if (!isAlwaysFoldable(TTI, LSRUse::Address,
02863                         getAccessType(UserInst), /*BaseGV=*/ nullptr,
02864                         IncOffset, /*HasBaseReg=*/ false))
02865     return false;
02866 
02867   return true;
02868 }
02869 
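// For example (illustrative only), an IVInc whose IncExpr is the constant 8
// feeding a load can usually be folded into a [reg + 8] addressing mode when
// the target reports that offset as legal for the access type, so no separate
// add needs to be emitted for that link of the chain; whether this holds is
// entirely up to TTI.
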
02870 /// GenerateIVChain - Generate an add or subtract for each IVInc in a chain to
02871 /// materialize the IV user's operand from the previous IV user's operand.
02872 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
02873                                   SmallVectorImpl<WeakVH> &DeadInsts) {
02874   // Find the new IVOperand for the head of the chain. It may have been replaced
02875   // by LSR.
02876   const IVInc &Head = Chain.Incs[0];
02877   User::op_iterator IVOpEnd = Head.UserInst->op_end();
02878   // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
02879   User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
02880                                              IVOpEnd, L, SE);
02881   Value *IVSrc = nullptr;
02882   while (IVOpIter != IVOpEnd) {
02883     IVSrc = getWideOperand(*IVOpIter);
02884 
02885     // If this operand computes the expression that the chain needs, we may use
02886     // it. (Check this after setting IVSrc which is used below.)
02887     //
02888     // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
02889     // narrow for the chain, so we can no longer use it. We do allow using a
02890     // wider phi, assuming LSR checked for free truncation. In that case we
02891     // should already have a truncate on this operand such that
02892     // getSCEV(IVSrc) == IncExpr.
02893     if (SE.getSCEV(*IVOpIter) == Head.IncExpr
02894         || SE.getSCEV(IVSrc) == Head.IncExpr) {
02895       break;
02896     }
02897     IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
02898   }
02899   if (IVOpIter == IVOpEnd) {
02900     // Gracefully give up on this chain.
02901     DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
02902     return;
02903   }
02904 
02905   DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
02906   Type *IVTy = IVSrc->getType();
02907   Type *IntTy = SE.getEffectiveSCEVType(IVTy);
02908   const SCEV *LeftOverExpr = nullptr;
02909   for (IVChain::const_iterator IncI = Chain.begin(),
02910          IncE = Chain.end(); IncI != IncE; ++IncI) {
02911 
02912     Instruction *InsertPt = IncI->UserInst;
02913     if (isa<PHINode>(InsertPt))
02914       InsertPt = L->getLoopLatch()->getTerminator();
02915 
02916     // IVOper will replace the current IV User's operand. IVSrc is the IV
02917     // value currently held in a register.
02918     Value *IVOper = IVSrc;
02919     if (!IncI->IncExpr->isZero()) {
02920       // IncExpr was the result of subtracting two narrow values, so it must
02921       // be sign-extended.
02922       const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy);
02923       LeftOverExpr = LeftOverExpr ?
02924         SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
02925     }
02926     if (LeftOverExpr && !LeftOverExpr->isZero()) {
02927       // Expand the IV increment.
02928       Rewriter.clearPostInc();
02929       Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
02930       const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
02931                                              SE.getUnknown(IncV));
02932       IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
02933 
02934       // If an IV increment can't be folded, use it as the next IV value.
02935       if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand,
02936                             TTI)) {
02937         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
02938         IVSrc = IVOper;
02939         LeftOverExpr = nullptr;
02940       }
02941     }
02942     Type *OperTy = IncI->IVOperand->getType();
02943     if (IVTy != OperTy) {
02944       assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
02945              "cannot extend a chained IV");
02946       IRBuilder<> Builder(InsertPt);
02947       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
02948     }
02949     IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper);
02950     DeadInsts.push_back(IncI->IVOperand);
02951   }
02952   // If LSR created a new, wider phi, we may also replace its postinc. We only
02953   // do this if we also found a wide value for the head of the chain.
02954   if (isa<PHINode>(Chain.tailUserInst())) {
02955     for (BasicBlock::iterator I = L->getHeader()->begin();
02956          PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
02957       if (!isCompatibleIVType(Phi, IVSrc))
02958         continue;
02959       Instruction *PostIncV = dyn_cast<Instruction>(
02960         Phi->getIncomingValueForBlock(L->getLoopLatch()));
02961       if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
02962         continue;
02963       Value *IVOper = IVSrc;
02964       Type *PostIncTy = PostIncV->getType();
02965       if (IVTy != PostIncTy) {
02966         assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
02967         IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
02968         Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
02969         IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
02970       }
02971       Phi->replaceUsesOfWith(PostIncV, IVOper);
02972       DeadInsts.push_back(PostIncV);
02973     }
02974   }
02975 }
02976 
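// As an illustrative sketch (hypothetical IR, not from any test case), for a
// chain of two loads whose second increment is a loop-invariant value %n that
// cannot be folded, the rewrite above roughly produces:
//
//   %v1 = load %iv1
//   %iv2 = add %iv1, %n      ; expanded from LeftOverExpr at the user
//   %v2 = load %iv2
//
// so each user's operand is materialized from the previous user's value rather
// than recomputed from the original induction variable.
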
02977 void LSRInstance::CollectFixupsAndInitialFormulae() {
02978   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
02979     Instruction *UserInst = UI->getUser();
02980     // Skip IV users that are part of profitable IV Chains.
02981     User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
02982                                        UI->getOperandValToReplace());
02983     assert(UseI != UserInst->op_end() && "cannot find IV operand");
02984     if (IVIncSet.count(UseI))
02985       continue;
02986 
02987     // Record the uses.
02988     LSRFixup &LF = getNewFixup();
02989     LF.UserInst = UserInst;
02990     LF.OperandValToReplace = UI->getOperandValToReplace();
02991     LF.PostIncLoops = UI->getPostIncLoops();
02992 
02993     LSRUse::KindType Kind = LSRUse::Basic;
02994     Type *AccessTy = nullptr;
02995     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
02996       Kind = LSRUse::Address;
02997       AccessTy = getAccessType(LF.UserInst);
02998     }
02999 
03000     const SCEV *S = IU.getExpr(*UI);
03001 
03002     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
03003     // (N - i == 0), and this allows (N - i) to be the expression that we work
03004     // with rather than just N or i, so we can consider the register
03005     // requirements for both N and i at the same time. Limiting this code to
03006     // equality icmps is not a problem because all interesting loops use
03007     // equality icmps, thanks to IndVarSimplify.
03008     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
03009       if (CI->isEquality()) {
03010         // Swap the operands if needed to put the OperandValToReplace on the
03011         // left, for consistency.
03012         Value *NV = CI->getOperand(1);
03013         if (NV == LF.OperandValToReplace) {
03014           CI->setOperand(1, CI->getOperand(0));
03015           CI->setOperand(0, NV);
03016           NV = CI->getOperand(1);
03017           Changed = true;
03018         }
03019 
03020         // x == y  -->  x - y == 0
03021         const SCEV *N = SE.getSCEV(NV);
03022         if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
03023           // S is normalized, so normalize N before folding it into S
03024           // to keep the result normalized.
03025           N = TransformForPostIncUse(Normalize, N, CI, nullptr,
03026                                      LF.PostIncLoops, SE, DT);
03027           Kind = LSRUse::ICmpZero;
03028           S = SE.getMinusSCEV(N, S);
03029         }
03030 
03031         // -1 and the negations of all interesting strides (except the negation
03032         // of -1) are now also interesting.
03033         for (size_t i = 0, e = Factors.size(); i != e; ++i)
03034           if (Factors[i] != -1)
03035             Factors.insert(-(uint64_t)Factors[i]);
03036         Factors.insert(-1);
03037       }
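
    // As an illustrative sketch (hypothetical names): for an exit test like
    // "icmp eq i64 %iv.next, %len" with %len loop-invariant, the use becomes
    // an ICmpZero use whose expression is roughly (%len - %iv.next), so the
    // solver can weigh the register requirements of %len and the IV together.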
03038 
03039     // Set up the initial formula for this use.
03040     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
03041     LF.LUIdx = P.first;
03042     LF.Offset = P.second;
03043     LSRUse &LU = Uses[LF.LUIdx];
03044     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
03045     if (!LU.WidestFixupType ||
03046         SE.getTypeSizeInBits(LU.WidestFixupType) <
03047         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
03048       LU.WidestFixupType = LF.OperandValToReplace->getType();
03049 
03050     // If this is the first use of this LSRUse, give it a formula.
03051     if (LU.Formulae.empty()) {
03052       InsertInitialFormula(S, LU, LF.LUIdx);
03053       CountRegisters(LU.Formulae.back(), LF.LUIdx);
03054     }
03055   }
03056 
03057   DEBUG(print_fixups(dbgs()));
03058 }
03059 
03060 /// InsertInitialFormula - Insert a formula for the given expression into
03061 /// the given use, separating out loop-variant portions from loop-invariant
03062 /// and loop-computable portions.
03063 void
03064 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
03065   // Mark uses whose expressions cannot be expanded.
03066   if (!isSafeToExpand(S, SE))
03067     LU.RigidFormula = true;
03068 
03069   Formula F;
03070   F.InitialMatch(S, L, SE);
03071   bool Inserted = InsertFormula(LU, LUIdx, F);
03072   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
03073 }
03074 
03075 /// InsertSupplementalFormula - Insert a simple single-register formula for
03076 /// the given expression into the given use.
03077 void
03078 LSRInstance::InsertSupplementalFormula(const SCEV *S,
03079                                        LSRUse &LU, size_t LUIdx) {
03080   Formula F;
03081   F.BaseRegs.push_back(S);
03082   F.HasBaseReg = true;
03083   bool Inserted = InsertFormula(LU, LUIdx, F);
03084   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
03085 }
03086 
03087 /// CountRegisters - Note which registers are used by the given formula,
03088 /// updating RegUses.
03089 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
03090   if (F.ScaledReg)
03091     RegUses.CountRegister(F.ScaledReg, LUIdx);
03092   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
03093        E = F.BaseRegs.end(); I != E; ++I)
03094     RegUses.CountRegister(*I, LUIdx);
03095 }
03096 
03097 /// InsertFormula - If the given formula has not yet been inserted, add it to
03098 /// the list, and return true. Return false otherwise.
03099 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
03100   // Do not insert a formula that we will not be able to expand.
03101   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
03102          "Formula is illegal");
03103   if (!LU.InsertFormula(F))
03104     return false;
03105 
03106   CountRegisters(F, LUIdx);
03107   return true;
03108 }
03109 
03110 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
03111 /// loop-invariant values which we're tracking. These other uses will pin these
03112 /// values in registers, making them less profitable for elimination.
03113 /// TODO: This currently misses non-constant addrec step registers.
03114 /// TODO: Should this give more weight to users inside the loop?
03115 void
03116 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
03117   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
03118   SmallPtrSet<const SCEV *, 32> Visited;
03119 
03120   while (!Worklist.empty()) {
03121     const SCEV *S = Worklist.pop_back_val();
03122 
03123     // Don't process the same SCEV twice
03124     if (!Visited.insert(S).second)
03125       continue;
03126 
03127     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
03128       Worklist.append(N->op_begin(), N->op_end());
03129     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
03130       Worklist.push_back(C->getOperand());
03131     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
03132       Worklist.push_back(D->getLHS());
03133       Worklist.push_back(D->getRHS());
03134     } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
03135       const Value *V = US->getValue();
03136       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
03137         // Look for instructions defined outside the loop.
03138         if (L->contains(Inst)) continue;
03139       } else if (isa<UndefValue>(V))
03140         // Undef doesn't have a live range, so it doesn't matter.
03141         continue;
03142       for (const Use &U : V->uses()) {
03143         const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
03144         // Ignore non-instructions.
03145         if (!UserInst)
03146           continue;
03147         // Ignore instructions in other functions (as can happen with
03148         // Constants).
03149         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
03150           continue;
03151         // Ignore instructions not dominated by the loop.
03152         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
03153           UserInst->getParent() :
03154           cast<PHINode>(UserInst)->getIncomingBlock(
03155             PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
03156         if (!DT.dominates(L->getHeader(), UseBB))
03157           continue;
03158         // Ignore uses which are part of other SCEV expressions, to avoid
03159         // analyzing them multiple times.
03160         if (SE.isSCEVable(UserInst->getType())) {
03161           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
03162           // If the user is a no-op, look through to its uses.
03163           if (!isa<SCEVUnknown>(UserS))
03164             continue;
03165           if (UserS == US) {
03166             Worklist.push_back(
03167               SE.getUnknown(const_cast<Instruction *>(UserInst)));
03168             continue;
03169           }
03170         }
03171         // Ignore icmp instructions which are already being analyzed.
03172         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
03173           unsigned OtherIdx = !U.getOperandNo();
03174           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
03175           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
03176             continue;
03177         }
03178 
03179         LSRFixup &LF = getNewFixup();
03180         LF.UserInst = const_cast<Instruction *>(UserInst);
03181         LF.OperandValToReplace = U;
03182         std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr);
03183         LF.LUIdx = P.first;
03184         LF.Offset = P.second;
03185         LSRUse &LU = Uses[LF.LUIdx];
03186         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
03187         if (!LU.WidestFixupType ||
03188             SE.getTypeSizeInBits(LU.WidestFixupType) <
03189             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
03190           LU.WidestFixupType = LF.OperandValToReplace->getType();
03191         InsertSupplementalFormula(US, LU, LF.LUIdx);
03192         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
03193         break;
03194       }
03195     }
03196   }
03197 }
03198 
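// As an illustrative sketch (hypothetical IR): if a base pointer %base defined
// outside the loop appears in the tracked registers and is also used by a store
// after the loop, that outside use gets a Basic fixup with a single-register
// supplemental formula, telling the solver that %base stays live in a register
// no matter which formulae are chosen inside the loop.
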
03199 /// CollectSubexprs - Split S into subexpressions which can be pulled out into
03200 /// separate registers. If C is non-null, multiply each subexpression by C.
03201 ///
03202 /// Return remainder expression after factoring the subexpressions captured by
03203 /// Ops. If Ops is complete, return NULL.
03204 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
03205                                    SmallVectorImpl<const SCEV *> &Ops,
03206                                    const Loop *L,
03207                                    ScalarEvolution &SE,
03208                                    unsigned Depth = 0) {
03209   // Arbitrarily cap recursion to protect compile time.
03210   if (Depth >= 3)
03211     return S;
03212 
03213   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
03214     // Break out add operands.
03215     for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
03216          I != E; ++I) {
03217       const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1);
03218       if (Remainder)
03219         Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
03220     }
03221     return nullptr;
03222   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
03223     // Split a non-zero base out of an addrec.
03224     if (AR->getStart()->isZero())
03225       return S;
03226 
03227     const SCEV *Remainder = CollectSubexprs(AR->getStart(),
03228                                             C, Ops, L, SE, Depth+1);
03229     // Split the non-zero AddRec unless it is part of a nested recurrence that
03230     // does not pertain to this loop.
03231     if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
03232       Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
03233       Remainder = nullptr;
03234     }
03235     if (Remainder != AR->getStart()) {
03236       if (!Remainder)
03237         Remainder = SE.getConstant(AR->getType(), 0);
03238       return SE.getAddRecExpr(Remainder,
03239                               AR->getStepRecurrence(SE),
03240                               AR->getLoop(),
03241                               //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
03242                               SCEV::FlagAnyWrap);
03243     }
03244   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
03245     // Break (C * (a + b + c)) into C*a + C*b + C*c.
03246     if (Mul->getNumOperands() != 2)
03247       return S;
03248     if (const SCEVConstant *Op0 =
03249         dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
03250       C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
03251       const SCEV *Remainder =
03252         CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
03253       if (Remainder)
03254         Ops.push_back(SE.getMulExpr(C, Remainder));
03255       return nullptr;
03256     }
03257   }
03258   return S;
03259 }
03260 
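// For example (illustrative SCEVs): given S = {(4 + %a + %b),+,8}<%L>,
// CollectSubexprs pushes 4, %a and %b onto Ops and returns the stripped
// recurrence {0,+,8}<%L> as the remainder, letting the caller consider
// formulae that hold each piece in its own register.
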
03261 /// \brief Helper function for LSRInstance::GenerateReassociations.
03262 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
03263                                              const Formula &Base,
03264                                              unsigned Depth, size_t Idx,
03265                                              bool IsScaledReg) {
03266   const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
03267   SmallVector<const SCEV *, 8> AddOps;
03268   const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
03269   if (Remainder)
03270     AddOps.push_back(Remainder);
03271 
03272   if (AddOps.size() == 1)
03273     return;
03274 
03275   for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
03276                                                      JE = AddOps.end();
03277        J != JE; ++J) {
03278 
03279     // Loop-variant "unknown" values are uninteresting; we won't be able to
03280     // do anything meaningful with them.
03281     if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
03282       continue;
03283 
03284     // Don't pull a constant into a register if the constant could be folded
03285     // into an immediate field.
03286     if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
03287                          LU.AccessTy, *J, Base.getNumRegs() > 1))
03288       continue;
03289 
03290     // Collect all operands except *J.
03291     SmallVector<const SCEV *, 8> InnerAddOps(
03292         ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
03293     InnerAddOps.append(std::next(J),
03294                        ((const SmallVector<const SCEV *, 8> &)AddOps).end());
03295 
03296     // Don't leave just a constant behind in a register if the constant could
03297     // be folded into an immediate field.
03298     if (InnerAddOps.size() == 1 &&
03299         isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
03300                          LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
03301       continue;
03302 
03303     const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
03304     if (InnerSum->isZero())
03305       continue;
03306     Formula F = Base;
03307 
03308     // Add the remaining pieces of the add back into the new formula.
03309     const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
03310     if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
03311         TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
03312                                 InnerSumSC->getValue()->getZExtValue())) {
03313       F.UnfoldedOffset =
03314           (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
03315       if (IsScaledReg)
03316         F.ScaledReg = nullptr;
03317       else
03318         F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
03319     } else if (IsScaledReg)
03320       F.ScaledReg = InnerSum;
03321     else
03322       F.BaseRegs[Idx] = InnerSum;
03323 
03324     // Add J as its own register, or an unfolded immediate.
03325     const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
03326     if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
03327         TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
03328                                 SC->getValue()->getZExtValue()))
03329       F.UnfoldedOffset =
03330           (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
03331     else
03332       F.BaseRegs.push_back(*J);
03333     // We may have changed the number of registers in the base regs; adjust the
03334     // formula accordingly.
03335     F.Canonicalize();
03336 
03337     if (InsertFormula(LU, LUIdx, F))
03338       // If that formula hadn't been seen before, recurse to find more like
03339       // it.
03340       GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
03341   }
03342 }
03343 
03344 /// GenerateReassociations - Split out subexpressions from adds and the bases of
03345 /// addrecs.
03346 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
03347                                          Formula Base, unsigned Depth) {
03348   assert(Base.isCanonical() && "Input must be in the canonical form");
03349   // Arbitrarily cap recursion to protect compile time.
03350   if (Depth >= 3)
03351     return;
03352 
03353   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03354     GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
03355 
03356   if (Base.Scale == 1)
03357     GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
03358                                /* Idx */ -1, /* IsScaledReg */ true);
03359 }
03360 
03361 /// GenerateCombinations - Generate a formula consisting of all of the
03362 /// loop-dominating registers added into a single register.
03363 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
03364                                        Formula Base) {
03365   // This method is only interesting on a plurality of registers.
03366   if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
03367     return;
03368 
03369   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
03370   // processing the formula.
03371   Base.Unscale();
03372   Formula F = Base;
03373   F.BaseRegs.clear();
03374   SmallVector<const SCEV *, 4> Ops;
03375   for (SmallVectorImpl<const SCEV *>::const_iterator
03376        I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
03377     const SCEV *BaseReg = *I;
03378     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
03379         !SE.hasComputableLoopEvolution(BaseReg, L))
03380       Ops.push_back(BaseReg);
03381     else
03382       F.BaseRegs.push_back(BaseReg);
03383   }
03384   if (Ops.size() > 1) {
03385     const SCEV *Sum = SE.getAddExpr(Ops);
03386     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
03387     // opportunity to fold something. For now, just ignore such cases
03388     // rather than proceed with zero in a register.
03389     if (!Sum->isZero()) {
03390       F.BaseRegs.push_back(Sum);
03391       F.Canonicalize();
03392       (void)InsertFormula(LU, LUIdx, F);
03393     }
03394   }
03395 }
03396 
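// For example (illustrative registers): for a formula with base registers
// {%inv1, %inv2, {0,+,4}<%L>} where %inv1 and %inv2 are loop-invariant and
// dominate the header, the combined formula keeps the recurrence as a base
// register and folds the two invariant registers into the single register
// (%inv1 + %inv2).
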
03397 /// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
03398 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
03399                                               const Formula &Base, size_t Idx,
03400                                               bool IsScaledReg) {
03401   const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
03402   GlobalValue *GV = ExtractSymbol(G, SE);
03403   if (G->isZero() || !GV)
03404     return;
03405   Formula F = Base;
03406   F.BaseGV = GV;
03407   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
03408     return;
03409   if (IsScaledReg)
03410     F.ScaledReg = G;
03411   else
03412     F.BaseRegs[Idx] = G;
03413   (void)InsertFormula(LU, LUIdx, F);
03414 }
03415 
03416 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
03417 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
03418                                           Formula Base) {
03419   // We can't add a symbolic offset if the address already contains one.
03420   if (Base.BaseGV) return;
03421 
03422   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03423     GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
03424   if (Base.Scale == 1)
03425     GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
03426                                 /* IsScaledReg */ true);
03427 }
03428 
03429 /// \brief Helper function for LSRInstance::GenerateConstantOffsets.
03430 void LSRInstance::GenerateConstantOffsetsImpl(
03431     LSRUse &LU, unsigned LUIdx, const Formula &Base,
03432     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
03433   const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
03434   for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
03435                                                 E = Worklist.end();
03436        I != E; ++I) {
03437     Formula F = Base;
03438     F.BaseOffset = (uint64_t)Base.BaseOffset - *I;
03439     if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind,
03440                    LU.AccessTy, F)) {
03441       // Add the offset to the base register.
03442       const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G);
03443       // If it cancelled out, drop the base register, otherwise update it.
03444       if (NewG->isZero()) {
03445         if (IsScaledReg) {
03446           F.Scale = 0;
03447           F.ScaledReg = nullptr;
03448         } else
03449           F.DeleteBaseReg(F.BaseRegs[Idx]);
03450         F.Canonicalize();
03451       } else if (IsScaledReg)
03452         F.ScaledReg = NewG;
03453       else
03454         F.BaseRegs[Idx] = NewG;
03455 
03456       (void)InsertFormula(LU, LUIdx, F);
03457     }
03458   }
03459 
03460   int64_t Imm = ExtractImmediate(G, SE);
03461   if (G->isZero() || Imm == 0)
03462     return;
03463   Formula F = Base;
03464   F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
03465   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
03466     return;
03467   if (IsScaledReg)
03468     F.ScaledReg = G;
03469   else
03470     F.BaseRegs[Idx] = G;
03471   (void)InsertFormula(LU, LUIdx, F);
03472 }
03473 
03474 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
03475 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
03476                                           Formula Base) {
03477   // TODO: For now, just add the min and max offset, because it usually isn't
03478   // worthwhile looking at everything in between.
03479   SmallVector<int64_t, 2> Worklist;
03480   Worklist.push_back(LU.MinOffset);
03481   if (LU.MaxOffset != LU.MinOffset)
03482     Worklist.push_back(LU.MaxOffset);
03483 
03484   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03485     GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
03486   if (Base.Scale == 1)
03487     GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
03488                                 /* IsScaledReg */ true);
03489 }
03490 
03491 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
03492 /// the comparison. For example, x == y -> x*c == y*c.
03493 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
03494                                          Formula Base) {
03495   if (LU.Kind != LSRUse::ICmpZero) return;
03496 
03497   // Determine the integer type for the base formula.
03498   Type *IntTy = Base.getType();
03499   if (!IntTy) return;
03500   if (SE.getTypeSizeInBits(IntTy) > 64) return;
03501 
03502   // Don't do this if there is more than one offset.
03503   if (LU.MinOffset != LU.MaxOffset) return;
03504 
03505   assert(!Base.BaseGV && "ICmpZero use is not legal!");
03506 
03507   // Check each interesting stride.
03508   for (SmallSetVector<int64_t, 8>::const_iterator
03509        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
03510     int64_t Factor = *I;
03511 
03512     // Check that the multiplication doesn't overflow.
03513     if (Base.BaseOffset == INT64_MIN && Factor == -1)
03514       continue;
03515     int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
03516     if (NewBaseOffset / Factor != Base.BaseOffset)
03517       continue;
03518     // If the offset will be truncated at this use, check that it is in bounds.
03519     if (!IntTy->isPointerTy() &&
03520         !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
03521       continue;
03522 
03523     // Check that multiplying with the use offset doesn't overflow.
03524     int64_t Offset = LU.MinOffset;
03525     if (Offset == INT64_MIN && Factor == -1)
03526       continue;
03527     Offset = (uint64_t)Offset * Factor;
03528     if (Offset / Factor != LU.MinOffset)
03529       continue;
03530     // If the offset will be truncated at this use, check that it is in bounds.
03531     if (!IntTy->isPointerTy() &&
03532         !ConstantInt::isValueValidForType(IntTy, Offset))
03533       continue;
03534 
03535     Formula F = Base;
03536     F.BaseOffset = NewBaseOffset;
03537 
03538     // Check that this scale is legal.
03539     if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
03540       continue;
03541 
03542     // Compensate for the use having MinOffset built into it.
03543     F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
03544 
03545     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
03546 
03547     // Check that multiplying with each base register doesn't overflow.
03548     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
03549       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
03550       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
03551         goto next;
03552     }
03553 
03554     // Check that multiplying with the scaled register doesn't overflow.
03555     if (F.ScaledReg) {
03556       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
03557       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
03558         continue;
03559     }
03560 
03561     // Check that multiplying with the unfolded offset doesn't overflow.
03562     if (F.UnfoldedOffset != 0) {
03563       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
03564         continue;
03565       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
03566       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
03567         continue;
03568       // If the offset will be truncated, check that it is in bounds.
03569       if (!IntTy->isPointerTy() &&
03570           !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
03571         continue;
03572     }
03573 
03574     // If we make it here and it's legal, add it.
03575     (void)InsertFormula(LU, LUIdx, F);
03576   next:;
03577   }
03578 }
03579 
03580 /// GenerateScales - Generate stride factor reuse formulae by making use of
03581 /// scaled-offset address modes, for example.
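/// For example (illustrative), with an interesting stride factor of 4, a
/// formula whose base register is {0,+,4}<%L> can be rewritten to use
/// {0,+,1}<%L> as its scaled register with Scale = 4, matching a
/// base + 4*index addressing mode where the target supports one.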
03582 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
03583   // Determine the integer type for the base formula.
03584   Type *IntTy = Base.getType();
03585   if (!IntTy) return;
03586 
03587   // If this Formula already has a scaled register, we can't add another one.
03588   // Try to unscale the formula to generate a better scale.
03589   if (Base.Scale != 0 && !Base.Unscale())
03590     return;
03591 
03592   assert(Base.Scale == 0 && "Unscale did not do its job!");
03593 
03594   // Check each interesting stride.
03595   for (SmallSetVector<int64_t, 8>::const_iterator
03596        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
03597     int64_t Factor = *I;
03598 
03599     Base.Scale = Factor;
03600     Base.HasBaseReg = Base.BaseRegs.size() > 1;
03601     // Check whether this scale is going to be legal.
03602     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
03603                     Base)) {
03604       // As a special case, handle out-of-loop Basic users specially.
03605       // TODO: Reconsider this special case.
03606       if (LU.Kind == LSRUse::Basic &&
03607           isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
03608                      LU.AccessTy, Base) &&
03609           LU.AllFixupsOutsideLoop)
03610         LU.Kind = LSRUse::Special;
03611       else
03612         continue;
03613     }
03614     // For an ICmpZero, negating a solitary base register won't lead to
03615     // new solutions.
03616     if (LU.Kind == LSRUse::ICmpZero &&
03617         !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
03618       continue;
03619     // For each addrec base reg, apply the scale, if possible.
03620     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03621       if (const SCEVAddRecExpr *AR =
03622             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
03623         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
03624         if (FactorS->isZero())
03625           continue;
03626         // Divide out the factor, ignoring high bits, since we'll be
03627         // scaling the value back up in the end.
03628         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
03629           // TODO: This could be optimized to avoid all the copying.
03630           Formula F = Base;
03631           F.ScaledReg = Quotient;
03632           F.DeleteBaseReg(F.BaseRegs[i]);
03633           // The canonical representation of 1*reg is reg, which is already in
03634           // Base. In that case, do not try to insert the formula, it will be
03635           // rejected anyway.
03636           if (F.Scale == 1 && F.BaseRegs.empty())
03637             continue;
03638           (void)InsertFormula(LU, LUIdx, F);
03639         }
03640       }
03641   }
03642 }
03643 
03644 /// GenerateTruncates - Generate reuse formulae from different IV types.
03645 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
03646   // Don't bother truncating symbolic values.
03647   if (Base.BaseGV) return;
03648 
03649   // Determine the integer type for the base formula.
03650   Type *DstTy = Base.getType();
03651   if (!DstTy) return;
03652   DstTy = SE.getEffectiveSCEVType(DstTy);
03653 
03654   for (SmallSetVector<Type *, 4>::const_iterator
03655        I = Types.begin(), E = Types.end(); I != E; ++I) {
03656     Type *SrcTy = *I;
03657     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
03658       Formula F = Base;
03659 
03660       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
03661       for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
03662            JE = F.BaseRegs.end(); J != JE; ++J)
03663         *J = SE.getAnyExtendExpr(*J, SrcTy);
03664 
03665       // TODO: This assumes we've done basic processing on all uses and
03666       // have an idea what the register usage is.
03667       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
03668         continue;
03669 
03670       (void)InsertFormula(LU, LUIdx, F);
03671     }
03672   }
03673 }
03674 
03675 namespace {
03676 
03677 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
03678 /// defer modifications so that the search phase doesn't have to worry about
03679 /// the data structures moving underneath it.
03680 struct WorkItem {
03681   size_t LUIdx;
03682   int64_t Imm;
03683   const SCEV *OrigReg;
03684 
03685   WorkItem(size_t LI, int64_t I, const SCEV *R)
03686     : LUIdx(LI), Imm(I), OrigReg(R) {}
03687 
03688   void print(raw_ostream &OS) const;
03689   void dump() const;
03690 };
03691 
03692 }
03693 
03694 void WorkItem::print(raw_ostream &OS) const {
03695   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
03696      << ", add offset " << Imm;
03697 }
03698 
03699 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
03700 void WorkItem::dump() const {
03701   print(errs()); errs() << '\n';
03702 }
03703 #endif
03704 
03705 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant
03706 /// distance apart and try to form reuse opportunities between them.
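/// For example (illustrative), if one use references the register {0,+,4}<%L>
/// and another references {8,+,4}<%L>, the latter can be expressed as the
/// former plus a constant offset of 8, potentially letting both uses share a
/// single register.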
03707 void LSRInstance::GenerateCrossUseConstantOffsets() {
03708   // Group the registers by their value without any added constant offset.
03709   typedef std::map<int64_t, const SCEV *> ImmMapTy;
03710   typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
03711   RegMapTy Map;
03712   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
03713   SmallVector<const SCEV *, 8> Sequence;
03714   for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
03715        I != E; ++I) {
03716     const SCEV *Reg = *I;
03717     int64_t Imm = ExtractImmediate(Reg, SE);
03718     std::pair<RegMapTy::iterator, bool> Pair =
03719       Map.insert(std::make_pair(Reg, ImmMapTy()));
03720     if (Pair.second)
03721       Sequence.push_back(Reg);
03722     Pair.first->second.insert(std::make_pair(Imm, *I));
03723     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
03724   }
03725 
03726   // Now examine each set of registers with the same base value. Build up
03727   // a list of work to do and do the work in a separate step so that we're
03728   // not adding formulae and register counts while we're searching.
03729   SmallVector<WorkItem, 32> WorkItems;
03730   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
03731   for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
03732        E = Sequence.end(); I != E; ++I) {
03733     const SCEV *Reg = *I;
03734     const ImmMapTy &Imms = Map.find(Reg)->second;
03735 
03736     // It's not worthwhile looking for reuse if there's only one offset.
03737     if (Imms.size() == 1)
03738       continue;
03739 
03740     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
03741           for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
03742                J != JE; ++J)
03743             dbgs() << ' ' << J->first;
03744           dbgs() << '\n');
03745 
03746     // Examine each offset.
03747     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
03748          J != JE; ++J) {
03749       const SCEV *OrigReg = J->second;
03750 
03751       int64_t JImm = J->first;
03752       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
03753 
03754       if (!isa<SCEVConstant>(OrigReg) &&
03755           UsedByIndicesMap[Reg].count() == 1) {
03756         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
03757         continue;
03758       }
03759 
03760       // Conservatively examine offsets between this orig reg and a few selected
03761       // other orig regs.
03762       ImmMapTy::const_iterator OtherImms[] = {
03763         Imms.begin(), std::prev(Imms.end()),
03764         Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
03765                          2)
03766       };
03767       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
03768         ImmMapTy::const_iterator M = OtherImms[i];
03769         if (M == J || M == JE) continue;
03770 
03771         // Compute the difference between the two.
03772         int64_t Imm = (uint64_t)JImm - M->first;
03773         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
03774              LUIdx = UsedByIndices.find_next(LUIdx))
03775           // Make a memo of this use, offset, and register tuple.
03776           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
03777             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
03778       }
03779     }
03780   }
03781 
03782   Map.clear();
03783   Sequence.clear();
03784   UsedByIndicesMap.clear();
03785   UniqueItems.clear();
03786 
03787   // Now iterate through the worklist and add new formulae.
03788   for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
03789        E = WorkItems.end(); I != E; ++I) {
03790     const WorkItem &WI = *I;
03791     size_t LUIdx = WI.LUIdx;
03792     LSRUse &LU = Uses[LUIdx];
03793     int64_t Imm = WI.Imm;
03794     const SCEV *OrigReg = WI.OrigReg;
03795 
03796     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
03797     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
03798     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
03799 
03800     // TODO: Use a more targeted data structure.
03801     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
03802       Formula F = LU.Formulae[L];
03803       // FIXME: The code for the scaled and unscaled registers looks
03804       // very similar but slightly different. Investigate if they
03805       // could be merged. That way, we would not have to unscale the
03806       // Formula.
03807       F.Unscale();
03808       // Use the immediate in the scaled register.
03809       if (F.ScaledReg == OrigReg) {
03810         int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
03811         // Don't create 50 + reg(-50).
03812         if (F.referencesReg(SE.getSCEV(
03813                    ConstantInt::get(IntTy, -(uint64_t)Offset))))
03814           continue;
03815         Formula NewF = F;
03816         NewF.BaseOffset = Offset;
03817         if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
03818                         NewF))
03819           continue;
03820         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
03821 
03822         // If the new scale is a constant in a register, and adding the constant
03823         // value to the immediate would produce a value closer to zero than the
03824         // immediate itself, then the formula isn't worthwhile.
03825         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
03826           if (C->getValue()->isNegative() !=
03827                 (NewF.BaseOffset < 0) &&
03828               (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale))
03829                 .ule(std::abs(NewF.BaseOffset)))
03830             continue;
03831 
03832         // OK, looks good.
03833         NewF.Canonicalize();
03834         (void)InsertFormula(LU, LUIdx, NewF);
03835       } else {
03836         // Use the immediate in a base register.
03837         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
03838           const SCEV *BaseReg = F.BaseRegs[N];
03839           if (BaseReg != OrigReg)
03840             continue;
03841           Formula NewF = F;
03842           NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
03843           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
03844                           LU.Kind, LU.AccessTy, NewF)) {
03845             if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
03846               continue;
03847             NewF = F;
03848             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
03849           }
03850           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
03851 
03852           // If the new formula has a constant in a register, and adding the
03853           // constant value to the immediate would produce a value closer to
03854           // zero than the immediate itself, then the formula isn't worthwhile.
03855           for (SmallVectorImpl<const SCEV *>::const_iterator
03856                J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
03857                J != JE; ++J)
03858             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
03859               if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt(
03860                    std::abs(NewF.BaseOffset)) &&
03861                   (C->getValue()->getValue() +
03862                    NewF.BaseOffset).countTrailingZeros() >=
03863                    countTrailingZeros<uint64_t>(NewF.BaseOffset))
03864                 goto skip_formula;
03865 
03866           // Ok, looks good.
03867           NewF.Canonicalize();
03868           (void)InsertFormula(LU, LUIdx, NewF);
03869           break;
03870         skip_formula:;
03871         }
03872       }
03873     }
03874   }
03875 }
03876 
03877 /// GenerateAllReuseFormulae - Generate formulae for each use.
03878 void
03879 LSRInstance::GenerateAllReuseFormulae() {
03880   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
03881   // queries are more precise.
03882   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03883     LSRUse &LU = Uses[LUIdx];
03884     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03885       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
03886     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03887       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
03888   }
03889   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03890     LSRUse &LU = Uses[LUIdx];
03891     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03892       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
03893     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03894       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
03895     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03896       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
03897     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03898       GenerateScales(LU, LUIdx, LU.Formulae[i]);
03899   }
03900   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03901     LSRUse &LU = Uses[LUIdx];
03902     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03903       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
03904   }
03905 
03906   GenerateCrossUseConstantOffsets();
03907 
03908   DEBUG(dbgs() << "\n"
03909                   "After generating reuse formulae:\n";
03910         print_uses(dbgs()));
03911 }
03912 
03913 /// If there are multiple formulae with the same set of registers used
03914 /// by other uses, pick the best one and delete the others.
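      /// For example, if two formulae for the same use expose exactly the same
      /// registers to other uses, only the one RateFormula rates as cheaper is
      /// kept.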
03915 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
03916   DenseSet<const SCEV *> VisitedRegs;
03917   SmallPtrSet<const SCEV *, 16> Regs;
03918   SmallPtrSet<const SCEV *, 16> LoserRegs;
03919 #ifndef NDEBUG
03920   bool ChangedFormulae = false;
03921 #endif
03922 
03923   // Collect the best formula for each unique set of shared registers. This
03924   // is reset for each use.
03925   typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
03926     BestFormulaeTy;
03927   BestFormulaeTy BestFormulae;
03928 
03929   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03930     LSRUse &LU = Uses[LUIdx];
03931     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
03932 
03933     bool Any = false;
03934     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
03935          FIdx != NumForms; ++FIdx) {
03936       Formula &F = LU.Formulae[FIdx];
03937 
03938       // Some formulae are instant losers. For example, they may depend on
03939       // nonexistent AddRecs from other loops. These need to be filtered
03940       // immediately; otherwise, heuristics could choose them over others, leading
03941       // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
03942       // avoids the need to recompute this information across formulae using the
03943       // same bad AddRec. Passing LoserRegs is also essential unless we remove
03944       // the corresponding bad register from the Regs set.
03945       Cost CostF;
03946       Regs.clear();
03947       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
03948                         &LoserRegs);
03949       if (CostF.isLoser()) {
03950         // During initial formula generation, undesirable formulae are generated
03951         // by uses within other loops that have some non-trivial address mode or
03952         // use the postinc form of the IV. LSR needs to provide these formulae
03953         // as the basis of rediscovering the desired formula that uses an AddRec
03954         // corresponding to the existing phi. Once all formulae have been
03955         // generated, these initial losers may be pruned.
03956         DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
03957               dbgs() << "\n");
03958       }
03959       else {
03960         SmallVector<const SCEV *, 4> Key;
03961         for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
03962                JE = F.BaseRegs.end(); J != JE; ++J) {
03963           const SCEV *Reg = *J;
03964           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
03965             Key.push_back(Reg);
03966         }
03967         if (F.ScaledReg &&
03968             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
03969           Key.push_back(F.ScaledReg);
03970         // Unstable sort by host order is OK, because this is only used for
03971         // uniquifying.
03972         std::sort(Key.begin(), Key.end());
03973 
03974         std::pair<BestFormulaeTy::const_iterator, bool> P =
03975           BestFormulae.insert(std::make_pair(Key, FIdx));
03976         if (P.second)
03977           continue;
03978 
03979         Formula &Best = LU.Formulae[P.first->second];
03980 
03981         Cost CostBest;
03982         Regs.clear();
03983         CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
03984                              DT, LU);
03985         if (CostF < CostBest)
03986           std::swap(F, Best);
03987         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
03988               dbgs() << "\n"
03989                         "    in favor of formula "; Best.print(dbgs());
03990               dbgs() << '\n');
03991       }
03992 #ifndef NDEBUG
03993       ChangedFormulae = true;
03994 #endif
03995       LU.DeleteFormula(F);
03996       --FIdx;
03997       --NumForms;
03998       Any = true;
03999     }
04000 
04001     // Now that we've filtered out some formulae, recompute the Regs set.
04002     if (Any)
04003       LU.RecomputeRegs(LUIdx, RegUses);
04004 
04005     // Reset this to prepare for the next use.
04006     BestFormulae.clear();
04007   }
04008 
04009   DEBUG(if (ChangedFormulae) {
04010           dbgs() << "\n"
04011                     "After filtering out undesirable candidates:\n";
04012           print_uses(dbgs());
04013         });
04014 }
04015 
04016 // This is a rough guess that seems to work fairly well.
04017 static const size_t ComplexityLimit = UINT16_MAX;
04018 
04019 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of
04020 /// solutions the solver might have to consider. It almost never considers
04021 /// this many solutions because it prunes the search space, but the pruning
04022 /// isn't always sufficient.
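      /// For example, three uses with 4, 5, and 6 candidate formulae yield a
      /// worst-case estimate of 4 * 5 * 6 == 120 solutions; the estimate stops
      /// growing once it reaches ComplexityLimit.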
04023 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
04024   size_t Power = 1;
04025   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
04026        E = Uses.end(); I != E; ++I) {
04027     size_t FSize = I->Formulae.size();
04028     if (FSize >= ComplexityLimit) {
04029       Power = ComplexityLimit;
04030       break;
04031     }
04032     Power *= FSize;
04033     if (Power >= ComplexityLimit)
04034       break;
04035   }
04036   return Power;
04037 }
04038 
04039 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
04040 /// of the registers of another formula, the superset formula won't help
04041 /// reduce register pressure (though it may not necessarily hurt it either);
04042 /// remove it to simplify the system.
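      /// For example, a formula whose base registers are {reg(A), reg(4)} is
      /// deleted when the same use also has a formula that uses just reg(A)
      /// with the constant 4 folded into its immediate field.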
04043 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
04044   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
04045     DEBUG(dbgs() << "The search space is too complex.\n");
04046 
04047     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
04048                     "which use a superset of registers used by other "
04049                     "formulae.\n");
04050 
04051     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
04052       LSRUse &LU = Uses[LUIdx];
04053       bool Any = false;
04054       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
04055         Formula &F = LU.Formulae[i];
04056         // Look for a formula with a constant or GV in a register. If the use
04057         // also has a formula with that same value in an immediate field,
04058         // delete the one that uses a register.
04059         for (SmallVectorImpl<const SCEV *>::const_iterator
04060              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
04061           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
04062             Formula NewF = F;
04063             NewF.BaseOffset += C->getValue()->getSExtValue();
04064             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
04065                                 (I - F.BaseRegs.begin()));
04066             if (LU.HasFormulaWithSameRegs(NewF)) {
04067               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
04068               LU.DeleteFormula(F);
04069               --i;
04070               --e;
04071               Any = true;
04072               break;
04073             }
04074           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
04075             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
04076               if (!F.BaseGV) {
04077                 Formula NewF = F;
04078                 NewF.BaseGV = GV;
04079                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
04080                                     (I - F.BaseRegs.begin()));
04081                 if (LU.HasFormulaWithSameRegs(NewF)) {
04082                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
04083                         dbgs() << '\n');
04084                   LU.DeleteFormula(F);
04085                   --i;
04086                   --e;
04087                   Any = true;
04088                   break;
04089                 }
04090               }
04091           }
04092         }
04093       }
04094       if (Any)
04095         LU.RecomputeRegs(LUIdx, RegUses);
04096     }
04097 
04098     DEBUG(dbgs() << "After pre-selection:\n";
04099           print_uses(dbgs()));
04100   }
04101 }
04102 
04103 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
04104 /// for expressions like A, A+1, A+2, etc., allocate a single register for
04105 /// them.
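      /// For example, after unrolling, separate uses may address A, A+4, and
      /// A+8; when they differ only by a constant offset that the target can
      /// fold, the uses are merged into one (the offsets move into the fixups)
      /// so the solver only needs one register for A.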
04106 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
04107   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
04108     return;
04109 
04110   DEBUG(dbgs() << "The search space is too complex.\n"
04111                   "Narrowing the search space by assuming that uses separated "
04112                   "by a constant offset will use the same registers.\n");
04113 
04114   // This is especially useful for unrolled loops.
04115 
04116   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
04117     LSRUse &LU = Uses[LUIdx];
04118     for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
04119          E = LU.Formulae.end(); I != E; ++I) {
04120       const Formula &F = *I;
04121       if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
04122         continue;
04123 
04124       LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
04125       if (!LUThatHas)
04126         continue;
04127 
04128       if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
04129                               LU.Kind, LU.AccessTy))
04130         continue;
04131 
04132       DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
04133 
04134       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
04135 
04136       // Update the fixups to reference the new use.
04137       for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
04138            E = Fixups.end(); I != E; ++I) {
04139         LSRFixup &Fixup = *I;
04140         if (Fixup.LUIdx == LUIdx) {
04141           Fixup.LUIdx = LUThatHas - &Uses.front();
04142           Fixup.Offset += F.BaseOffset;
04143           // Add the new offset to LUThatHas' offset list.
04144           if (LUThatHas->Offsets.back() != Fixup.Offset) {
04145             LUThatHas->Offsets.push_back(Fixup.Offset);
04146             if (Fixup.Offset > LUThatHas->MaxOffset)
04147               LUThatHas->MaxOffset = Fixup.Offset;
04148             if (Fixup.Offset < LUThatHas->MinOffset)
04149               LUThatHas->MinOffset = Fixup.Offset;
04150           }
04151           DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
04152         }
04153         if (Fixup.LUIdx == NumUses-1)
04154           Fixup.LUIdx = LUIdx;
04155       }
04156 
04157       // Delete formulae from the new use which are no longer legal.
04158       bool Any = false;
04159       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
04160         Formula &F = LUThatHas->Formulae[i];
04161         if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
04162                         LUThatHas->Kind, LUThatHas->AccessTy, F)) {
04163           DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
04164                 dbgs() << '\n');
04165           LUThatHas->DeleteFormula(F);
04166           --i;
04167           --e;
04168           Any = true;
04169         }
04170       }
04171 
04172       if (Any)
04173         LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
04174 
04175       // Delete the old use.
04176       DeleteUse(LU, LUIdx);
04177       --LUIdx;
04178       --NumUses;
04179       break;
04180     }
04181   }
04182 
04183   DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
04184 }
04185 
04186 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
04187 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
04188 /// we've done more filtering, as it may be able to find more formulae to
04189 /// eliminate.
04190 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
04191   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
04192     DEBUG(dbgs() << "The search space is too complex.\n");
04193 
04194     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
04195                     "undesirable dedicated registers.\n");
04196 
04197     FilterOutUndesirableDedicatedRegisters();
04198 
04199     DEBUG(dbgs() << "After pre-selection:\n";
04200           print_uses(dbgs()));
04201   }
04202 }
04203 
04204 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
04205 /// to be profitable, and then in any use which has any reference to that
04206 /// register, delete all formulae which do not reference that register.
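      /// For example, if reg({A,+,4}) is referenced by more uses than any other
      /// register, it is assumed to be a profitable reuse candidate, and every
      /// formula in those uses that does not reference it is deleted.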
04207 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
04208   // With all other options exhausted, loop until the system is simple
04209   // enough to handle.
04210   SmallPtrSet<const SCEV *, 4> Taken;
04211   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
04212     // Ok, we have too many formulae on our hands to conveniently handle.
04213     // Use a rough heuristic to thin out the list.
04214     DEBUG(dbgs() << "The search space is too complex.\n");
04215 
04216     // Pick the register which is used by the most LSRUses, which is likely
04217     // to be a good reuse register candidate.
04218     const SCEV *Best = nullptr;
04219     unsigned BestNum = 0;
04220     for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
04221          I != E; ++I) {
04222       const SCEV *Reg = *I;
04223       if (Taken.count(Reg))
04224         continue;
04225       if (!Best)
04226         Best = Reg;
04227       else {
04228         unsigned Count = RegUses.getUsedByIndices(Reg).count();
04229         if (Count > BestNum) {
04230           Best = Reg;
04231           BestNum = Count;
04232         }
04233       }
04234     }
04235 
04236     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
04237                  << " will yield profitable reuse.\n");
04238     Taken.insert(Best);
04239 
04240     // In any use with formulae which reference this register, delete formulae
04241     // which don't reference it.
04242     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
04243       LSRUse &LU = Uses[LUIdx];
04244       if (!LU.Regs.count(Best)) continue;
04245 
04246       bool Any = false;
04247       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
04248         Formula &F = LU.Formulae[i];
04249         if (!F.referencesReg(Best)) {
04250           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
04251           LU.DeleteFormula(F);
04252           --e;
04253           --i;
04254           Any = true;
04255           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
04256           continue;
04257         }
04258       }
04259 
04260       if (Any)
04261         LU.RecomputeRegs(LUIdx, RegUses);
04262     }
04263 
04264     DEBUG(dbgs() << "After pre-selection:\n";
04265           print_uses(dbgs()));
04266   }
04267 }
04268 
04269 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
04270 /// formulae to choose from, use some rough heuristics to prune down the number
04271 /// of formulae. This keeps the main solver from taking an extraordinary amount
04272 /// of time in some worst-case scenarios.
04273 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
04274   NarrowSearchSpaceByDetectingSupersets();
04275   NarrowSearchSpaceByCollapsingUnrolledCode();
04276   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
04277   NarrowSearchSpaceByPickingWinnerRegs();
04278 }
04279 
04280 /// SolveRecurse - This is the recursive solver.
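      /// It performs a depth-first search, choosing one formula for
      /// Uses[Workspace.size()] at each level and pruning any branch whose
      /// accumulated cost is already no better than the best complete solution
      /// found so far.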
04281 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
04282                                Cost &SolutionCost,
04283                                SmallVectorImpl<const Formula *> &Workspace,
04284                                const Cost &CurCost,
04285                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
04286                                DenseSet<const SCEV *> &VisitedRegs) const {
04287   // Some ideas:
04288   //  - prune more:
04289   //    - use more aggressive filtering
04290   //    - sort the formulae so that the most profitable solutions are found first
04291   //    - sort the uses too
04292   //  - search faster:
04293   //    - don't compute a cost and then compare; compare while computing the
04294   //      cost and bail out early.
04295   //    - track register sets with SmallBitVector
04296 
04297   const LSRUse &LU = Uses[Workspace.size()];
04298 
04299   // If this use references any register that's already a part of the
04300   // in-progress solution, consider it a requirement that a formula must
04301   // reference that register in order to be considered. This prunes out
04302   // unprofitable searching.
04303   SmallSetVector<const SCEV *, 4> ReqRegs;
04304   for (const SCEV *S : CurRegs)
04305     if (LU.Regs.count(S))
04306       ReqRegs.insert(S);
04307 
04308   SmallPtrSet<const SCEV *, 16> NewRegs;
04309   Cost NewCost;
04310   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
04311        E = LU.Formulae.end(); I != E; ++I) {
04312     const Formula &F = *I;
04313 
04314     // Ignore formulae which may not be ideal in terms of register reuse of
04315     // ReqRegs.  The formula should use all required registers before
04316     // introducing new ones.
04317     int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
04318     for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
04319          JE = ReqRegs.end(); J != JE; ++J) {
04320       const SCEV *Reg = *J;
04321       if ((F.ScaledReg && F.ScaledReg == Reg) ||
04322           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
04323           F.BaseRegs.end()) {
04324         --NumReqRegsToFind;
04325         if (NumReqRegsToFind == 0)
04326           break;
04327       }
04328     }
04329     if (NumReqRegsToFind != 0) {
04330       // If none of the formulae satisfied the required registers, then we could
04331       // clear ReqRegs and try again. Currently, we simply give up in this case.
04332       continue;
04333     }
04334 
04335     // Evaluate the cost of the current formula. If it's already worse than
04336     // the current best, prune the search at that point.
04337     NewCost = CurCost;
04338     NewRegs = CurRegs;
04339     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
04340                         LU);
04341     if (NewCost < SolutionCost) {
04342       Workspace.push_back(&F);
04343       if (Workspace.size() != Uses.size()) {
04344         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
04345                      NewRegs, VisitedRegs);
04346         if (F.getNumRegs() == 1 && Workspace.size() == 1)
04347           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
04348       } else {
04349         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
04350               dbgs() << ".\n Regs:";
04351               for (const SCEV *S : NewRegs)
04352                 dbgs() << ' ' << *S;
04353               dbgs() << '\n');
04354 
04355         SolutionCost = NewCost;
04356         Solution = Workspace;
04357       }
04358       Workspace.pop_back();
04359     }
04360   }
04361 }
04362 
04363 /// Solve - Choose one formula from each use. Return the results in the given
04364 /// Solution vector.
04365 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
04366   SmallVector<const Formula *, 8> Workspace;
04367   Cost SolutionCost;
04368   SolutionCost.Lose();
04369   Cost CurCost;
04370   SmallPtrSet<const SCEV *, 16> CurRegs;
04371   DenseSet<const SCEV *> VisitedRegs;
04372   Workspace.reserve(Uses.size());
04373 
04374   // SolveRecurse does all the work.
04375   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
04376                CurRegs, VisitedRegs);
04377   if (Solution.empty()) {
04378     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
04379     return;
04380   }
04381 
04382   // Ok, we've now made all our decisions.
04383   DEBUG(dbgs() << "\n"
04384                   "The chosen solution requires "; SolutionCost.print(dbgs());
04385         dbgs() << ":\n";
04386         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
04387           dbgs() << "  ";
04388           Uses[i].print(dbgs());
04389           dbgs() << "\n"
04390                     "    ";
04391           Solution[i]->print(dbgs());
04392           dbgs() << '\n';
04393         });
04394 
04395   assert(Solution.size() == Uses.size() && "Malformed solution!");
04396 }
04397 
04398 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
04399 /// the dominator tree as far as we can go while still being dominated by the
04400 /// input positions. This helps canonicalize the insert position, which
04401 /// encourages sharing.
04402 BasicBlock::iterator
04403 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
04404                                  const SmallVectorImpl<Instruction *> &Inputs)
04405                                                                          const {
04406   for (;;) {
04407     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
04408     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
04409 
04410     BasicBlock *IDom;
04411     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
04412       if (!Rung) return IP;
04413       Rung = Rung->getIDom();
04414       if (!Rung) return IP;
04415       IDom = Rung->getBlock();
04416 
04417       // Don't climb into a loop though.
04418       const Loop *IDomLoop = LI.getLoopFor(IDom);
04419       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
04420       if (IDomDepth <= IPLoopDepth &&
04421           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
04422         break;
04423     }
04424 
04425     bool AllDominate = true;
04426     Instruction *BetterPos = nullptr;
04427     Instruction *Tentative = IDom->getTerminator();
04428     for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
04429          E = Inputs.end(); I != E; ++I) {
04430       Instruction *Inst = *I;
04431       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
04432         AllDominate = false;
04433         break;
04434       }
04435       // Attempt to find an insert position in the middle of the block,
04436       // instead of at the end, so that it can be used for other expansions.
04437       if (IDom == Inst->getParent() &&
04438           (!BetterPos || !DT.dominates(Inst, BetterPos)))
04439         BetterPos = std::next(BasicBlock::iterator(Inst));
04440     }
04441     if (!AllDominate)
04442       break;
04443     if (BetterPos)
04444       IP = BetterPos;
04445     else
04446       IP = Tentative;
04447   }
04448 
04449   return IP;
04450 }
04451 
04452 /// AdjustInsertPositionForExpand - Determine an insert position which will be
04453 /// dominated by the operands and which will dominate the result.
04454 BasicBlock::iterator
04455 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
04456                                            const LSRFixup &LF,
04457                                            const LSRUse &LU,
04458                                            SCEVExpander &Rewriter) const {
04459   // Collect some instructions which must be dominated by the
04460   // expanding replacement. These must be dominated by any operands that
04461   // will be required in the expansion.
04462   SmallVector<Instruction *, 4> Inputs;
04463   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
04464     Inputs.push_back(I);
04465   if (LU.Kind == LSRUse::ICmpZero)
04466     if (Instruction *I =
04467           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
04468       Inputs.push_back(I);
04469   if (LF.PostIncLoops.count(L)) {
04470     if (LF.isUseFullyOutsideLoop(L))
04471       Inputs.push_back(L->getLoopLatch()->getTerminator());
04472     else
04473       Inputs.push_back(IVIncInsertPos);
04474   }
04475   // The expansion must also be dominated by the increment positions of any
04476   // loops for which it is using post-inc mode.
04477   for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
04478        E = LF.PostIncLoops.end(); I != E; ++I) {
04479     const Loop *PIL = *I;
04480     if (PIL == L) continue;
04481 
04482     // Be dominated by the loop exit.
04483     SmallVector<BasicBlock *, 4> ExitingBlocks;
04484     PIL->getExitingBlocks(ExitingBlocks);
04485     if (!ExitingBlocks.empty()) {
04486       BasicBlock *BB = ExitingBlocks[0];
04487       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
04488         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
04489       Inputs.push_back(BB->getTerminator());
04490     }
04491   }
04492 
04493   assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP)
04494          && !isa<DbgInfoIntrinsic>(LowestIP) &&
04495          "Insertion point must be a normal instruction");
04496 
04497   // Then, climb up the immediate dominator tree as far as we can go while
04498   // still being dominated by the input positions.
04499   BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
04500 
04501   // Don't insert instructions before PHI nodes.
04502   while (isa<PHINode>(IP)) ++IP;
04503 
04504   // Ignore landingpad instructions.
04505   while (isa<LandingPadInst>(IP)) ++IP;
04506 
04507   // Ignore debug intrinsics.
04508   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
04509 
04510   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
04511   // IP consistent across expansions and allows the previously inserted
04512   // instructions to be reused by subsequent expansions.
04513   while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP;
04514 
04515   return IP;
04516 }
04517 
04518 /// Expand - Emit instructions for the leading candidate expression for this
04519 /// LSRUse (this is called "expanding").
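      /// For illustration, a hypothetical formula reg(A) + 4*reg({0,+,1}) + 16
      /// is expanded by emitting the base registers first, then the scaled
      /// register multiplied by its scale, then any global base, and finally
      /// the folded and unfolded immediates, flushing the operand list between
      /// groups so SCEVExpander does not hoist parts of the address computation
      /// away from the use.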
04520 Value *LSRInstance::Expand(const LSRFixup &LF,
04521                            const Formula &F,
04522                            BasicBlock::iterator IP,
04523                            SCEVExpander &Rewriter,
04524                            SmallVectorImpl<WeakVH> &DeadInsts) const {
04525   const LSRUse &LU = Uses[LF.LUIdx];
04526   if (LU.RigidFormula)
04527     return LF.OperandValToReplace;
04528 
04529   // Determine an input position which will be dominated by the operands and
04530   // which will dominate the result.
04531   IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
04532 
04533   // Inform the Rewriter if we have a post-increment use, so that it can
04534   // perform an advantageous expansion.
04535   Rewriter.setPostInc(LF.PostIncLoops);
04536 
04537   // This is the type that the user actually needs.
04538   Type *OpTy = LF.OperandValToReplace->getType();
04539   // This will be the type that we'll initially expand to.
04540   Type *Ty = F.getType();
04541   if (!Ty)
04542     // No type known; just expand directly to the ultimate type.
04543     Ty = OpTy;
04544   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
04545     // Expand directly to the ultimate type if it's the right size.
04546     Ty = OpTy;
04547   // This is the type to do integer arithmetic in.
04548   Type *IntTy = SE.getEffectiveSCEVType(Ty);
04549 
04550   // Build up a list of operands to add together to form the full base.
04551   SmallVector<const SCEV *, 8> Ops;
04552 
04553   // Expand the BaseRegs portion.
04554   for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
04555        E = F.BaseRegs.end(); I != E; ++I) {
04556     const SCEV *Reg = *I;
04557     assert(!Reg->isZero() && "Zero allocated in a base register!");
04558 
04559     // If we're expanding for a post-inc user, make the post-inc adjustment.
04560     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
04561     Reg = TransformForPostIncUse(Denormalize, Reg,
04562                                  LF.UserInst, LF.OperandValToReplace,
04563                                  Loops, SE, DT);
04564 
04565     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP)));
04566   }
04567 
04568   // Expand the ScaledReg portion.
04569   Value *ICmpScaledV = nullptr;
04570   if (F.Scale != 0) {
04571     const SCEV *ScaledS = F.ScaledReg;
04572 
04573     // If we're expanding for a post-inc user, make the post-inc adjustment.
04574     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
04575     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
04576                                      LF.UserInst, LF.OperandValToReplace,
04577                                      Loops, SE, DT);
04578 
04579     if (LU.Kind == LSRUse::ICmpZero) {
04580       // Expand ScaledReg as if it were part of the base regs.
04581       if (F.Scale == 1)
04582         Ops.push_back(
04583             SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)));
04584       else {
04585         // An interesting way of "folding" with an icmp is to use a negated
04586         // scale, which we'll implement by inserting it into the other operand
04587         // of the icmp.
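              // For example, rather than materializing (%x + -1*%y) and
              // comparing the sum against zero, the expansion places the
              // expanded %y into the icmp's second operand so %x is compared
              // directly against %y.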
04588         assert(F.Scale == -1 &&
04589                "The only scale supported by ICmpZero uses is -1!");
04590         ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP);
04591       }
04592     } else {
04593       // Otherwise just expand the scaled register and an explicit scale,
04594       // which is expected to be matched as part of the address.
04595 
04596       // Flush the operand list to suppress SCEVExpander hoisting address modes,
04597       // unless the addressing mode will not be folded.
04598       if (!Ops.empty() && LU.Kind == LSRUse::Address &&
04599           isAMCompletelyFolded(TTI, LU, F)) {
04600         Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
04601         Ops.clear();
04602         Ops.push_back(SE.getUnknown(FullV));
04603       }
04604       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP));
04605       if (F.Scale != 1)
04606         ScaledS =
04607             SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
04608       Ops.push_back(ScaledS);
04609     }
04610   }
04611 
04612   // Expand the GV portion.
04613   if (F.BaseGV) {
04614     // Flush the operand list to suppress SCEVExpander hoisting.
04615     if (!Ops.empty()) {
04616       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
04617       Ops.clear();
04618       Ops.push_back(SE.getUnknown(FullV));
04619     }
04620     Ops.push_back(SE.getUnknown(F.BaseGV));
04621   }
04622 
04623   // Flush the operand list to suppress SCEVExpander hoisting of both folded and
04624   // unfolded offsets. LSR assumes they both live next to their uses.
04625   if (!Ops.empty()) {
04626     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
04627     Ops.clear();
04628     Ops.push_back(SE.getUnknown(FullV));
04629   }
04630 
04631   // Expand the immediate portion.
04632   int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
04633   if (Offset != 0) {
04634     if (LU.Kind == LSRUse::ICmpZero) {
04635       // The other interesting way of "folding" with an ICmpZero is to use a
04636       // negated immediate.
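            // For example, instead of materializing (%x + 4) and comparing it
            // against zero, the icmp's second operand is set to -4 so %x is
            // compared directly against the negated immediate.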
04637       if (!ICmpScaledV)
04638         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
04639       else {
04640         Ops.push_back(SE.getUnknown(ICmpScaledV));
04641         ICmpScaledV = ConstantInt::get(IntTy, Offset);
04642       }
04643     } else {
04644       // Just add the immediate values. These again are expected to be matched
04645       // as part of the address.
04646       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
04647     }
04648   }
04649 
04650   // Expand the unfolded offset portion.
04651   int64_t UnfoldedOffset = F.UnfoldedOffset;
04652   if (UnfoldedOffset != 0) {
04653     // Just add the immediate values.
04654     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
04655                                                        UnfoldedOffset)));
04656   }
04657 
04658   // Emit instructions summing all the operands.
04659   const SCEV *FullS = Ops.empty() ?
04660                       SE.getConstant(IntTy, 0) :
04661                       SE.getAddExpr(Ops);
04662   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);
04663 
04664   // We're done expanding now, so reset the rewriter.
04665   Rewriter.clearPostInc();
04666 
04667   // An ICmpZero Formula represents an ICmp which we're handling as a
04668   // comparison against zero. Now that we've expanded an expression for that
04669   // form, update the ICmp's other operand.
04670   if (LU.Kind == LSRUse::ICmpZero) {
04671     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
04672     DeadInsts.push_back(CI->getOperand(1));
04673     assert(!F.BaseGV && "ICmp does not support folding a global value and "
04674                            "a scale at the same time!");
04675     if (F.Scale == -1) {
04676       if (ICmpScaledV->getType() != OpTy) {
04677         Instruction *Cast =
04678           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
04679                                                    OpTy, false),
04680                            ICmpScaledV, OpTy, "tmp", CI);
04681         ICmpScaledV = Cast;
04682       }
04683       CI->setOperand(1, ICmpScaledV);
04684     } else {
04685       // A scale of 1 means that the scale has been expanded as part of the
04686       // base regs.
04687       assert((F.Scale == 0 || F.Scale == 1) &&
04688              "ICmpZero uses should have a scale of 0 or 1 "
04689              "at this point!");
04690       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
04691                                            -(uint64_t)Offset);
04692       if (C->getType() != OpTy)
04693         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
04694                                                           OpTy, false),
04695                                   C, OpTy);
04696 
04697       CI->setOperand(1, C);
04698     }
04699   }
04700 
04701   return FullV;
04702 }
04703 
04704 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
04705 /// of their operands effectively happens in their predecessor blocks, so the
04706 /// expression may need to be expanded in multiple places.
04707 void LSRInstance::RewriteForPHI(PHINode *PN,
04708                                 const LSRFixup &LF,
04709                                 const Formula &F,
04710                                 SCEVExpander &Rewriter,
04711                                 SmallVectorImpl<WeakVH> &DeadInsts,
04712                                 Pass *P) const {
04713   DenseMap<BasicBlock *, Value *> Inserted;
04714   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
04715     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
04716       BasicBlock *BB = PN->getIncomingBlock(i);
04717 
04718       // If this is a critical edge, split the edge so that we do not insert
04719       // the code on all predecessor/successor paths.  We do this unless this
04720       // is the canonical backedge for this loop, which complicates post-inc
04721       // users.
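            // For example, when the incoming block ends in a conditional branch
            // whose other successor leaves the loop, splitting gives the PHI a
            // dedicated predecessor, so the expanded value is only computed on
            // the path that actually reaches the PHI.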
04722       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
04723           !isa<IndirectBrInst>(BB->getTerminator())) {
04724         BasicBlock *Parent = PN->getParent();
04725         Loop *PNLoop = LI.getLoopFor(Parent);
04726         if (!PNLoop || Parent != PNLoop->getHeader()) {
04727           // Split the critical edge.
04728           BasicBlock *NewBB = nullptr;
04729           if (!Parent->isLandingPad()) {
04730             NewBB = SplitCriticalEdge(BB, Parent,
04731                                       CriticalEdgeSplittingOptions(&DT, &LI)
04732                                           .setMergeIdenticalEdges()
04733                                           .setDontDeleteUselessPHIs());
04734           } else {
04735             SmallVector<BasicBlock*, 2> NewBBs;
04736             SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs,
04737                                         /*AliasAnalysis*/ nullptr, &DT, &LI);
04738             NewBB = NewBBs[0];
04739           }
04740           // If NewBB==NULL, then SplitCriticalEdge refused to split because all
04741           // phi predecessors are identical. The simple thing to do is skip
04742           // splitting in this case rather than complicate the API.
04743           if (NewBB) {
04744             // If PN is outside of the loop and BB is in the loop, we want to
04745             // move the block to be immediately before the PHI block, not
04746             // immediately after BB.
04747             if (L->contains(BB) && !L->contains(PN))
04748               NewBB->moveBefore(PN->getParent());
04749 
04750             // Splitting the edge can reduce the number of PHI entries we have.
04751             e = PN->getNumIncomingValues();
04752             BB = NewBB;
04753             i = PN->getBasicBlockIndex(BB);
04754           }
04755         }
04756       }
04757 
04758       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
04759         Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
04760       if (!Pair.second)
04761         PN->setIncomingValue(i, Pair.first->second);
04762       else {
04763         Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
04764 
04765         // If this is reuse-by-noop-cast, insert the noop cast.
04766         Type *OpTy = LF.OperandValToReplace->getType();
04767         if (FullV->getType() != OpTy)
04768           FullV =
04769             CastInst::Create(CastInst::getCastOpcode(FullV, false,
04770                                                      OpTy, false),
04771                              FullV, LF.OperandValToReplace->getType(),
04772                              "tmp", BB->getTerminator());
04773 
04774         PN->setIncomingValue(i, FullV);
04775         Pair.first->second = FullV;
04776       }
04777     }
04778 }
04779 
04780 /// Rewrite - Emit instructions for the leading candidate expression for this
04781 /// LSRUse (this is called "expanding"), and update the UserInst to reference
04782 /// the newly expanded value.
04783 void LSRInstance::Rewrite(const LSRFixup &LF,
04784                           const Formula &F,
04785                           SCEVExpander &Rewriter,
04786                           SmallVectorImpl<WeakVH> &DeadInsts,
04787                           Pass *P) const {
04788   // First, find an insertion point that dominates UserInst. For PHI nodes,
04789   // find the nearest block which dominates all the relevant uses.
04790   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
04791     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
04792   } else {
04793     Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
04794 
04795     // If this is reuse-by-noop-cast, insert the noop cast.
04796     Type *OpTy = LF.OperandValToReplace->getType();
04797     if (FullV->getType() != OpTy) {
04798       Instruction *Cast =
04799         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
04800                          FullV, OpTy, "tmp", LF.UserInst);
04801       FullV = Cast;
04802     }
04803 
04804     // Update the user. ICmpZero is handled specially here (for now) because
04805     // Expand may have updated one of the operands of the icmp already, and
04806     // its new value may happen to be equal to LF.OperandValToReplace, in
04807     // which case doing replaceUsesOfWith leads to replacing both operands
04808     // with the same value. TODO: Reorganize this.
04809     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
04810       LF.UserInst->setOperand(0, FullV);
04811     else
04812       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
04813   }
04814 
04815   DeadInsts.push_back(LF.OperandValToReplace);
04816 }
04817 
04818 /// ImplementSolution - Rewrite all the fixup locations with new values,
04819 /// following the chosen solution.
04820 void
04821 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
04822                                Pass *P) {
04823   // Keep track of instructions we may have made dead, so that
04824   // we can remove them after we are done working.
04825   SmallVector<WeakVH, 16> DeadInsts;
04826 
04827   SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
04828                         "lsr");
04829 #ifndef NDEBUG
04830   Rewriter.setDebugType(DEBUG_TYPE);
04831 #endif
04832   Rewriter.disableCanonicalMode();
04833   Rewriter.enableLSRMode();
04834   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
04835 
04836   // Mark phi nodes that terminate chains so the expander tries to reuse them.
04837   for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
04838          ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
04839     if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst()))
04840       Rewriter.setChainedPhi(PN);
04841   }
04842 
04843   // Expand the new value definitions and update the users.
04844   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
04845        E = Fixups.end(); I != E; ++I) {
04846     const LSRFixup &Fixup = *I;
04847 
04848     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);
04849 
04850     Changed = true;
04851   }
04852 
04853   for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(),
04854          ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) {
04855     GenerateIVChain(*ChainI, Rewriter, DeadInsts);
04856     Changed = true;
04857   }
04858   // Clean up after ourselves. This must be done before deleting any
04859   // instructions.
04860   Rewriter.clear();
04861 
04862   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
04863 }
04864 
04865 LSRInstance::LSRInstance(Loop *L, Pass *P)
04866     : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()),
04867       DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()),
04868       LI(P->getAnalysis<LoopInfoWrapperPass>().getLoopInfo()),
04869       TTI(P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
04870           *L->getHeader()->getParent())),
04871       L(L), Changed(false), IVIncInsertPos(nullptr) {
04872   // If LoopSimplify form is not available, stay out of trouble.
04873   if (!L->isLoopSimplifyForm())
04874     return;
04875 
04876   // If there's no interesting work to be done, bail early.
04877   if (IU.empty()) return;
04878 
04879   // If there's too much analysis to be done, bail early. We won't be able to
04880   // model the problem anyway.
04881   unsigned NumUsers = 0;
04882   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
04883     if (++NumUsers > MaxIVUsers) {
04884       DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L
04885             << "\n");
04886       return;
04887     }
04888   }
04889 
04890 #ifndef NDEBUG
04891   // All dominating loops must have preheaders, or SCEVExpander may not be able
04892   // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
04893   //
04894   // IVUsers analysis should only create users that are dominated by simple loop
04895   // headers. Since this loop should dominate all of its users, its user list
04896   // should be empty if this loop itself is not within a simple loop nest.
04897   for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
04898        Rung; Rung = Rung->getIDom()) {
04899     BasicBlock *BB = Rung->getBlock();
04900     const Loop *DomLoop = LI.getLoopFor(BB);
04901     if (DomLoop && DomLoop->getHeader() == BB) {
04902       assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
04903     }
04904   }
04905 #endif // NDEBUG
04906 
04907   DEBUG(dbgs() << "\nLSR on loop ";
04908         L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
04909         dbgs() << ":\n");
04910 
04911   // First, perform some low-level loop optimizations.
04912   OptimizeShadowIV();
04913   OptimizeLoopTermCond();
04914 
04915   // If loop preparation eliminates all interesting IV users, bail.
04916   if (IU.empty()) return;
04917 
04918   // Skip outer loops until we can model them better with formulae.
04919   if (!L->empty()) {
04920     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
04921     return;
04922   }
04923 
04924   // Start collecting data and preparing for the solver.
04925   CollectChains();
04926   CollectInterestingTypesAndFactors();
04927   CollectFixupsAndInitialFormulae();
04928   CollectLoopInvariantFixupsAndFormulae();
04929 
04930   assert(!Uses.empty() && "IVUsers reported at least one use");
04931   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
04932         print_uses(dbgs()));
04933 
04934   // Now use the reuse data to generate a bunch of interesting ways
04935   // to formulate the values needed for the uses.
04936   GenerateAllReuseFormulae();
04937 
04938   FilterOutUndesirableDedicatedRegisters();
04939   NarrowSearchSpaceUsingHeuristics();
04940 
04941   SmallVector<const Formula *, 8> Solution;
04942   Solve(Solution);
04943 
04944   // Release memory that is no longer needed.
04945   Factors.clear();
04946   Types.clear();
04947   RegUses.clear();
04948 
04949   if (Solution.empty())
04950     return;
04951 
04952 #ifndef NDEBUG
04953   // Formulae should be legal.
04954   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end();
04955        I != E; ++I) {
04956     const LSRUse &LU = *I;
04957     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
04958                                                   JE = LU.Formulae.end();
04959          J != JE; ++J)
04960       assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
04961                         *J) && "Illegal formula generated!");
04962   }
04963 #endif
04964 
04965   // Now that we've decided what we want, make it so.
04966   ImplementSolution(Solution, P);
04967 }
04968 
04969 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
04970   if (Factors.empty() && Types.empty()) return;
04971 
04972   OS << "LSR has identified the following interesting factors and types: ";
04973   bool First = true;
04974 
04975   for (SmallSetVector<int64_t, 8>::const_iterator
04976        I = Factors.begin(), E = Factors.end(); I != E; ++I) {
04977     if (!First) OS << ", ";
04978     First = false;
04979     OS << '*' << *I;
04980   }
04981 
04982   for (SmallSetVector<Type *, 4>::const_iterator
04983        I = Types.begin(), E = Types.end(); I != E; ++I) {
04984     if (!First) OS << ", ";
04985     First = false;
04986     OS << '(' << **I << ')';
04987   }
04988   OS << '\n';
04989 }
04990 
04991 void LSRInstance::print_fixups(raw_ostream &OS) const {
04992   OS << "LSR is examining the following fixup sites:\n";
04993   for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
04994        E = Fixups.end(); I != E; ++I) {
04995     dbgs() << "  ";
04996     I->print(OS);
04997     OS << '\n';
04998   }
04999 }
05000 
05001 void LSRInstance::print_uses(raw_ostream &OS) const {
05002   OS << "LSR is examining the following uses:\n";
05003   for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
05004        E = Uses.end(); I != E; ++I) {
05005     const LSRUse &LU = *I;
05006     dbgs() << "  ";
05007     LU.print(OS);
05008     OS << '\n';
05009     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
05010          JE = LU.Formulae.end(); J != JE; ++J) {
05011       OS << "    ";
05012       J->print(OS);
05013       OS << '\n';
05014     }
05015   }
05016 }
05017 
05018 void LSRInstance::print(raw_ostream &OS) const {
05019   print_factors_and_types(OS);
05020   print_fixups(OS);
05021   print_uses(OS);
05022 }
05023 
05024 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
05025 void LSRInstance::dump() const {
05026   print(errs()); errs() << '\n';
05027 }
05028 #endif
05029 
05030 namespace {
05031 
05032 class LoopStrengthReduce : public LoopPass {
05033 public:
05034   static char ID; // Pass ID, replacement for typeid
05035   LoopStrengthReduce();
05036 
05037 private:
05038   bool runOnLoop(Loop *L, LPPassManager &LPM) override;
05039   void getAnalysisUsage(AnalysisUsage &AU) const override;
05040 };
05041 
05042 }
05043 
05044 char LoopStrengthReduce::ID = 0;
05045 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
05046                 "Loop Strength Reduction", false, false)
05047 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
05048 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
05049 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
05050 INITIALIZE_PASS_DEPENDENCY(IVUsers)
05051 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
05052 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
05053 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
05054                 "Loop Strength Reduction", false, false)
05055 
05056 
05057 Pass *llvm::createLoopStrengthReducePass() {
05058   return new LoopStrengthReduce();
05059 }
05060 
05061 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
05062   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
05063 }
05064 
05065 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
05066   // We split critical edges, so we change the CFG.  However, we do update
05067   // many analyses if they are around.
05068   AU.addPreservedID(LoopSimplifyID);
05069 
05070   AU.addRequired<LoopInfoWrapperPass>();
05071   AU.addPreserved<LoopInfoWrapperPass>();
05072   AU.addRequiredID(LoopSimplifyID);
05073   AU.addRequired<DominatorTreeWrapperPass>();
05074   AU.addPreserved<DominatorTreeWrapperPass>();
05075   AU.addRequired<ScalarEvolution>();
05076   AU.addPreserved<ScalarEvolution>();
05077   // Requiring LoopSimplify a second time here prevents IVUsers from running
05078   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
05079   AU.addRequiredID(LoopSimplifyID);
05080   AU.addRequired<IVUsers>();
05081   AU.addPreserved<IVUsers>();
05082   AU.addRequired<TargetTransformInfoWrapperPass>();
05083 }
05084 
05085 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
05086   if (skipOptnoneFunction(L))
05087     return false;
05088 
05089   bool Changed = false;
05090 
05091   // Run the main LSR transformation.
05092   Changed |= LSRInstance(L, this).getChanged();
05093 
05094   // Remove any extra phis created by processing inner loops.
05095   Changed |= DeleteDeadPHIs(L->getHeader());
05096   if (EnablePhiElim && L->isLoopSimplifyForm()) {
05097     SmallVector<WeakVH, 16> DeadInsts;
05098     const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
05099     SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), DL, "lsr");
05100 #ifndef NDEBUG
05101     Rewriter.setDebugType(DEBUG_TYPE);
05102 #endif
05103     unsigned numFolded = Rewriter.replaceCongruentIVs(
05104         L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
05105         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
05106             *L->getHeader()->getParent()));
05107     if (numFolded) {
05108       Changed = true;
05109       DeleteTriviallyDeadInstructions(DeadInsts);
05110       DeleteDeadPHIs(L->getHeader());
05111     }
05112   }
05113   return Changed;
05114 }