LoopStrengthReduce.cpp
00001 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This transformation analyzes and transforms the induction variables (and
00011 // computations derived from them) into forms suitable for efficient execution
00012 // on the target.
00013 //
00014 // This pass performs strength reduction on array references inside loops that
00015 // have the loop induction variable as one or more of their components. It
00016 // rewrites expressions to take advantage of scaled-index addressing modes
00017 // available on the target, and it performs a variety of other optimizations
00018 // related to loop induction variables.
00019 //
00020 // Terminology note: this code has a lot of handling for "post-increment" or
00021 // "post-inc" users. This is not talking about post-increment addressing modes;
00022 // it is instead talking about code like this:
00023 //
00024 //   %i = phi [ 0, %entry ], [ %i.next, %latch ]
00025 //   ...
00026 //   %i.next = add %i, 1
00027 //   %c = icmp eq %i.next, %n
00028 //
00029 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
00030 // it's useful to think about these as the same register, with some uses using
00031 // the value of the register before the add and some using it after. In this
00032 // example, the icmp is a post-increment user, since it uses %i.next, which is
00033 // the value of the induction variable after the increment. The other common
00034 // case of post-increment users is users outside the loop.
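//
// For example (hypothetical IR), a use of %i.next in the loop's exit block:
//
//   exit:
//     %total = add %i.next, 10
//
// is also a post-increment user, since only the post-incremented value of the
// induction variable is available once the loop has finished.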
00035 //
00036 // TODO: More sophistication in the way Formulae are generated and filtered.
00037 //
00038 // TODO: Handle multiple loops at a time.
00039 //
00040 // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
00041 //       of a GlobalValue?
00042 //
00043 // TODO: When truncation is free, truncate ICmp users' operands to make it a
00044 //       smaller encoding (on x86 at least).
00045 //
00046 // TODO: When a negated register is used by an add (such as in a list of
00047 //       multiple base registers, or as the increment expression in an addrec),
00048 //       we may not actually need both reg and (-1 * reg) in registers; the
00049 //       negation can be implemented by using a sub instead of an add. The
00050 //       lack of support for taking this into consideration when making
00051 //       register pressure decisions is partly worked around by the "Special"
00052 //       use kind.
00053 //
00054 //===----------------------------------------------------------------------===//
00055 
00056 #include "llvm/Transforms/Scalar.h"
00057 #include "llvm/ADT/DenseSet.h"
00058 #include "llvm/ADT/Hashing.h"
00059 #include "llvm/ADT/STLExtras.h"
00060 #include "llvm/ADT/SetVector.h"
00061 #include "llvm/ADT/SmallBitVector.h"
00062 #include "llvm/Analysis/IVUsers.h"
00063 #include "llvm/Analysis/LoopPass.h"
00064 #include "llvm/Analysis/ScalarEvolutionExpander.h"
00065 #include "llvm/Analysis/TargetTransformInfo.h"
00066 #include "llvm/IR/Constants.h"
00067 #include "llvm/IR/DerivedTypes.h"
00068 #include "llvm/IR/Dominators.h"
00069 #include "llvm/IR/Instructions.h"
00070 #include "llvm/IR/IntrinsicInst.h"
00071 #include "llvm/IR/Module.h"
00072 #include "llvm/IR/ValueHandle.h"
00073 #include "llvm/Support/CommandLine.h"
00074 #include "llvm/Support/Debug.h"
00075 #include "llvm/Support/raw_ostream.h"
00076 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
00077 #include "llvm/Transforms/Utils/Local.h"
00078 #include <algorithm>
00079 using namespace llvm;
00080 
00081 #define DEBUG_TYPE "loop-reduce"
00082 
00083 /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
00084 /// bail out. This threshold is far beyond the number of users that LSR can
00085 /// conceivably solve, so it should not affect generated code, but it catches
00086 /// the worst cases before LSR burns too much compile time and stack space.
00087 static const unsigned MaxIVUsers = 200;
00088 
00089 // Temporary flag to clean up congruent phis after LSR phi expansion.
00090 // It was originally slated for removal after the v3.0 release once its
00091 // usefulness had been determined, but it is now needed for IV chains and
00092 // so defaults to enabled.
00093 static cl::opt<bool> EnablePhiElim(
00094   "enable-lsr-phielim", cl::Hidden, cl::init(true),
00095   cl::desc("Enable LSR phi elimination"));
00096 
00097 #ifndef NDEBUG
00098 // Stress test IV chain generation.
00099 static cl::opt<bool> StressIVChain(
00100   "stress-ivchain", cl::Hidden, cl::init(false),
00101   cl::desc("Stress test LSR IV chains"));
00102 #else
00103 static bool StressIVChain = false;
00104 #endif
00105 
00106 namespace {
00107 
00108 struct MemAccessTy {
00109   /// Used in situations where the accessed memory type is unknown.
00110   static const unsigned UnknownAddressSpace = ~0u;
00111 
00112   Type *MemTy;
00113   unsigned AddrSpace;
00114 
00115   MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
00116 
00117   MemAccessTy(Type *Ty, unsigned AS) :
00118     MemTy(Ty), AddrSpace(AS) {}
00119 
00120   bool operator==(MemAccessTy Other) const {
00121     return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
00122   }
00123 
00124   bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
00125 
00126   static MemAccessTy getUnknown(LLVMContext &Ctx) {
00127     return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace);
00128   }
00129 };
00130 
00131 /// This class holds data which is used to order reuse candidates.
00132 class RegSortData {
00133 public:
00134   /// This represents the set of LSRUse indices which reference
00135   /// a particular register.
00136   SmallBitVector UsedByIndices;
00137 
00138   void print(raw_ostream &OS) const;
00139   void dump() const;
00140 };
00141 
00142 }
00143 
00144 void RegSortData::print(raw_ostream &OS) const {
00145   OS << "[NumUses=" << UsedByIndices.count() << ']';
00146 }
00147 
00148 LLVM_DUMP_METHOD
00149 void RegSortData::dump() const {
00150   print(errs()); errs() << '\n';
00151 }
00152 
00153 namespace {
00154 
00155 /// Map register candidates to information about how they are used.
00156 class RegUseTracker {
00157   typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
00158 
00159   RegUsesTy RegUsesMap;
00160   SmallVector<const SCEV *, 16> RegSequence;
00161 
00162 public:
00163   void countRegister(const SCEV *Reg, size_t LUIdx);
00164   void dropRegister(const SCEV *Reg, size_t LUIdx);
00165   void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
00166 
00167   bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
00168 
00169   const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
00170 
00171   void clear();
00172 
00173   typedef SmallVectorImpl<const SCEV *>::iterator iterator;
00174   typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
00175   iterator begin() { return RegSequence.begin(); }
00176   iterator end()   { return RegSequence.end(); }
00177   const_iterator begin() const { return RegSequence.begin(); }
00178   const_iterator end() const   { return RegSequence.end(); }
00179 };
00180 
00181 }
00182 
00183 void
00184 RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
00185   std::pair<RegUsesTy::iterator, bool> Pair =
00186     RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
00187   RegSortData &RSD = Pair.first->second;
00188   if (Pair.second)
00189     RegSequence.push_back(Reg);
00190   RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
00191   RSD.UsedByIndices.set(LUIdx);
00192 }
00193 
00194 void
00195 RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
00196   RegUsesTy::iterator It = RegUsesMap.find(Reg);
00197   assert(It != RegUsesMap.end());
00198   RegSortData &RSD = It->second;
00199   assert(RSD.UsedByIndices.size() > LUIdx);
00200   RSD.UsedByIndices.reset(LUIdx);
00201 }
00202 
00203 void
00204 RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
00205   assert(LUIdx <= LastLUIdx);
00206 
00207   // Update RegUses. The data structure is not optimized for this purpose;
00208   // we must iterate through it and update each of the bit vectors.
00209   for (auto &Pair : RegUsesMap) {
00210     SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
00211     if (LUIdx < UsedByIndices.size())
00212       UsedByIndices[LUIdx] =
00213         LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
00214     UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
00215   }
00216 }
00217 
00218 bool
00219 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
00220   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
00221   if (I == RegUsesMap.end())
00222     return false;
00223   const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
00224   int i = UsedByIndices.find_first();
00225   if (i == -1) return false;
00226   if ((size_t)i != LUIdx) return true;
00227   return UsedByIndices.find_next(i) != -1;
00228 }
00229 
00230 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
00231   RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
00232   assert(I != RegUsesMap.end() && "Unknown register!");
00233   return I->second.UsedByIndices;
00234 }
00235 
00236 void RegUseTracker::clear() {
00237   RegUsesMap.clear();
00238   RegSequence.clear();
00239 }
00240 
00241 namespace {
00242 
00243 /// This class holds information that describes a formula for computing a value
00244 /// that satisfies a use. It may include broken-out immediates and scaled registers.
00245 struct Formula {
00246   /// Global base address used for complex addressing.
00247   GlobalValue *BaseGV;
00248 
00249   /// Base offset for complex addressing.
00250   int64_t BaseOffset;
00251 
00252   /// Whether any complex addressing has a base register.
00253   bool HasBaseReg;
00254 
00255   /// The scale of any complex addressing.
00256   int64_t Scale;
00257 
00258   /// The list of "base" registers for this use. When this is non-empty, the
00259   /// canonical representation of a formula is
00260   /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
00261   /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
00262   /// #1 enforces that the scaled register is always used when at least two
00263   /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
00264   /// #2 enforces that 1 * reg is reg.
00265   /// This invariant can be temporarily broken while building a formula.
00266   /// However, every formula inserted into the LSRInstance must be in canonical
00267   /// form.
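  /// For example (hypothetical registers %a and %b), the canonical form of
  /// %a + %b is BaseRegs = { %a }, ScaledReg = %b, Scale = 1 rather than
  /// BaseRegs = { %a, %b }, and a lone 1 * %a is represented simply as
  /// BaseRegs = { %a } with no ScaledReg.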
00268   SmallVector<const SCEV *, 4> BaseRegs;
00269 
00270   /// The 'scaled' register for this use. This should be non-null when Scale is
00271   /// not zero.
00272   const SCEV *ScaledReg;
00273 
00274   /// An additional constant offset which is added near the use. This requires a
00275   /// temporary register, but the offset itself can live in an add immediate
00276   /// field rather than a register.
00277   int64_t UnfoldedOffset;
00278 
00279   Formula()
00280       : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
00281         ScaledReg(nullptr), UnfoldedOffset(0) {}
00282 
00283   void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
00284 
00285   bool isCanonical() const;
00286 
00287   void canonicalize();
00288 
00289   bool unscale();
00290 
00291   size_t getNumRegs() const;
00292   Type *getType() const;
00293 
00294   void deleteBaseReg(const SCEV *&S);
00295 
00296   bool referencesReg(const SCEV *S) const;
00297   bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
00298                                   const RegUseTracker &RegUses) const;
00299 
00300   void print(raw_ostream &OS) const;
00301   void dump() const;
00302 };
00303 
00304 }
00305 
00306 /// Recursion helper for initialMatch.
00307 static void DoInitialMatch(const SCEV *S, Loop *L,
00308                            SmallVectorImpl<const SCEV *> &Good,
00309                            SmallVectorImpl<const SCEV *> &Bad,
00310                            ScalarEvolution &SE) {
00311   // Collect expressions which properly dominate the loop header.
00312   if (SE.properlyDominates(S, L->getHeader())) {
00313     Good.push_back(S);
00314     return;
00315   }
00316 
00317   // Look at add operands.
00318   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00319     for (const SCEV *S : Add->operands())
00320       DoInitialMatch(S, L, Good, Bad, SE);
00321     return;
00322   }
00323 
00324   // Look at addrec operands.
00325   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
00326     if (!AR->getStart()->isZero()) {
00327       DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
00328       DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
00329                                       AR->getStepRecurrence(SE),
00330                                       // FIXME: AR->getNoWrapFlags()
00331                                       AR->getLoop(), SCEV::FlagAnyWrap),
00332                      L, Good, Bad, SE);
00333       return;
00334     }
00335 
00336   // Handle a multiplication by -1 (negation) if it didn't fold.
00337   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
00338     if (Mul->getOperand(0)->isAllOnesValue()) {
00339       SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
00340       const SCEV *NewMul = SE.getMulExpr(Ops);
00341 
00342       SmallVector<const SCEV *, 4> MyGood;
00343       SmallVector<const SCEV *, 4> MyBad;
00344       DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
00345       const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
00346         SE.getEffectiveSCEVType(NewMul->getType())));
00347       for (const SCEV *S : MyGood)
00348         Good.push_back(SE.getMulExpr(NegOne, S));
00349       for (const SCEV *S : MyBad)
00350         Bad.push_back(SE.getMulExpr(NegOne, S));
00351       return;
00352     }
00353 
00354   // Ok, we can't do anything interesting. Just stuff the whole thing into a
00355   // register and hope for the best.
00356   Bad.push_back(S);
00357 }
00358 
00359 /// Incorporate loop-variant parts of S into this Formula, attempting to keep
00360 /// all loop-invariant and loop-computable values in a single base register.
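/// For example (hypothetical SCEVs), matching ((4 + %n) + {0,+,8}<%L>) puts the
/// loop-invariant (4 + %n) in one register and the addrec {0,+,8}<%L> in
/// another; canonicalize() then turns the addrec into the ScaledReg with a
/// scale of 1.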
00361 void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
00362   SmallVector<const SCEV *, 4> Good;
00363   SmallVector<const SCEV *, 4> Bad;
00364   DoInitialMatch(S, L, Good, Bad, SE);
00365   if (!Good.empty()) {
00366     const SCEV *Sum = SE.getAddExpr(Good);
00367     if (!Sum->isZero())
00368       BaseRegs.push_back(Sum);
00369     HasBaseReg = true;
00370   }
00371   if (!Bad.empty()) {
00372     const SCEV *Sum = SE.getAddExpr(Bad);
00373     if (!Sum->isZero())
00374       BaseRegs.push_back(Sum);
00375     HasBaseReg = true;
00376   }
00377   canonicalize();
00378 }
00379 
00380 /// \brief Check whether or not this formula satisfies the canonical
00381 /// representation.
00382 /// \see Formula::BaseRegs.
00383 bool Formula::isCanonical() const {
00384   if (ScaledReg)
00385     return Scale != 1 || !BaseRegs.empty();
00386   return BaseRegs.size() <= 1;
00387 }
00388 
00389 /// \brief Helper method to morph a formula into its canonical representation.
00390 /// \see Formula::BaseRegs.
00391 /// Every formula having more than one base register must use the ScaledReg
00392 /// field. Otherwise, we would have to do special cases everywhere in LSR
00393 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
00394 /// On the other hand, 1*reg should be canonicalized into reg.
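/// For example (hypothetical registers), BaseRegs = { %a, %b, {0,+,4}<%L> }
/// with no ScaledReg is morphed into BaseRegs = { %a, %b },
/// ScaledReg = {0,+,4}<%L>, Scale = 1; when possible, a loop-variant addrec is
/// preferred for ScaledReg over a loop-invariant register.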
00395 void Formula::canonicalize() {
00396   if (isCanonical())
00397     return;
00398   // So far we have not needed this case. It would be easy to implement, but
00399   // it is useless to maintain dead code, and it could hurt compile time.
00400   assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
00401   // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
00402   ScaledReg = BaseRegs.back();
00403   BaseRegs.pop_back();
00404   Scale = 1;
00405   size_t BaseRegsSize = BaseRegs.size();
00406   size_t Try = 0;
00407   // If ScaledReg is an invariant, try to find a variant expression.
00408   while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
00409     std::swap(ScaledReg, BaseRegs[Try++]);
00410 }
00411 
00412 /// \brief Get rid of the scale in the formula.
00413 /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
00414 /// \return true if it was possible to get rid of the scale, false otherwise.
00415 /// \note After this operation the formula may not be in the canonical form.
00416 bool Formula::unscale() {
00417   if (Scale != 1)
00418     return false;
00419   Scale = 0;
00420   BaseRegs.push_back(ScaledReg);
00421   ScaledReg = nullptr;
00422   return true;
00423 }
00424 
00425 /// Return the total number of register operands used by this formula. This does
00426 /// not include register uses implied by non-constant addrec strides.
00427 size_t Formula::getNumRegs() const {
00428   return !!ScaledReg + BaseRegs.size();
00429 }
00430 
00431 /// Return the type of this formula, if it has one, or null otherwise. This type
00432 /// is meaningless except for the bit size.
00433 Type *Formula::getType() const {
00434   return !BaseRegs.empty() ? BaseRegs.front()->getType() :
00435          ScaledReg ? ScaledReg->getType() :
00436          BaseGV ? BaseGV->getType() :
00437          nullptr;
00438 }
00439 
00440 /// Delete the given base reg from the BaseRegs list.
00441 void Formula::deleteBaseReg(const SCEV *&S) {
00442   if (&S != &BaseRegs.back())
00443     std::swap(S, BaseRegs.back());
00444   BaseRegs.pop_back();
00445 }
00446 
00447 /// Test if this formula references the given register.
00448 bool Formula::referencesReg(const SCEV *S) const {
00449   return S == ScaledReg ||
00450          std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
00451 }
00452 
00453 /// Test whether this formula uses registers which are used by uses other than
00454 /// the use with the given index.
00455 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
00456                                          const RegUseTracker &RegUses) const {
00457   if (ScaledReg)
00458     if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
00459       return true;
00460   for (const SCEV *BaseReg : BaseRegs)
00461     if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
00462       return true;
00463   return false;
00464 }
00465 
00466 void Formula::print(raw_ostream &OS) const {
00467   bool First = true;
00468   if (BaseGV) {
00469     if (!First) OS << " + "; else First = false;
00470     BaseGV->printAsOperand(OS, /*PrintType=*/false);
00471   }
00472   if (BaseOffset != 0) {
00473     if (!First) OS << " + "; else First = false;
00474     OS << BaseOffset;
00475   }
00476   for (const SCEV *BaseReg : BaseRegs) {
00477     if (!First) OS << " + "; else First = false;
00478     OS << "reg(" << *BaseReg << ')';
00479   }
00480   if (HasBaseReg && BaseRegs.empty()) {
00481     if (!First) OS << " + "; else First = false;
00482     OS << "**error: HasBaseReg**";
00483   } else if (!HasBaseReg && !BaseRegs.empty()) {
00484     if (!First) OS << " + "; else First = false;
00485     OS << "**error: !HasBaseReg**";
00486   }
00487   if (Scale != 0) {
00488     if (!First) OS << " + "; else First = false;
00489     OS << Scale << "*reg(";
00490     if (ScaledReg)
00491       OS << *ScaledReg;
00492     else
00493       OS << "<unknown>";
00494     OS << ')';
00495   }
00496   if (UnfoldedOffset != 0) {
00497     if (!First) OS << " + ";
00498     OS << "imm(" << UnfoldedOffset << ')';
00499   }
00500 }
00501 
00502 LLVM_DUMP_METHOD
00503 void Formula::dump() const {
00504   print(errs()); errs() << '\n';
00505 }
00506 
00507 /// Return true if the given addrec can be sign-extended without changing its
00508 /// value.
00509 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
00510   Type *WideTy =
00511     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
00512   return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
00513 }
00514 
00515 /// Return true if the given add can be sign-extended without changing its
00516 /// value.
00517 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
00518   Type *WideTy =
00519     IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
00520   return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
00521 }
00522 
00523 /// Return true if the given mul can be sign-extended without changing its
00524 /// value.
00525 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
00526   Type *WideTy =
00527     IntegerType::get(SE.getContext(),
00528                      SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
00529   return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
00530 }
00531 
00532 /// Return an expression for LHS /s RHS, if it can be determined and if the
00533 /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
00534 /// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
00535 /// the multiplication may overflow, which is useful when the result will be
00536 /// used in a context where the most significant bits are ignored.
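/// For example (hypothetical SCEVs), {8,+,4}<%L> /s 4 yields {2,+,1}<%L>, while
/// (3 + %x) /s 2 yields null because neither 3 nor %x is known to be evenly
/// divisible by 2.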
00537 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
00538                                 ScalarEvolution &SE,
00539                                 bool IgnoreSignificantBits = false) {
00540   // Handle the trivial case, which works for any SCEV type.
00541   if (LHS == RHS)
00542     return SE.getConstant(LHS->getType(), 1);
00543 
00544   // Handle a few RHS special cases.
00545   const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
00546   if (RC) {
00547     const APInt &RA = RC->getAPInt();
00548     // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
00549     // some folding.
00550     if (RA.isAllOnesValue())
00551       return SE.getMulExpr(LHS, RC);
00552     // Handle x /s 1 as x.
00553     if (RA == 1)
00554       return LHS;
00555   }
00556 
00557   // Check for a division of a constant by a constant.
00558   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
00559     if (!RC)
00560       return nullptr;
00561     const APInt &LA = C->getAPInt();
00562     const APInt &RA = RC->getAPInt();
00563     if (LA.srem(RA) != 0)
00564       return nullptr;
00565     return SE.getConstant(LA.sdiv(RA));
00566   }
00567 
00568   // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
00569   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
00570     if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
00571       const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
00572                                       IgnoreSignificantBits);
00573       if (!Step) return nullptr;
00574       const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
00575                                        IgnoreSignificantBits);
00576       if (!Start) return nullptr;
00577       // FlagNW is independent of the start value and step direction, and is
00578       // preserved with smaller magnitude steps.
00579       // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
00580       return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
00581     }
00582     return nullptr;
00583   }
00584 
00585   // Distribute the sdiv over add operands, if the add doesn't overflow.
00586   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
00587     if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
00588       SmallVector<const SCEV *, 8> Ops;
00589       for (const SCEV *S : Add->operands()) {
00590         const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
00591         if (!Op) return nullptr;
00592         Ops.push_back(Op);
00593       }
00594       return SE.getAddExpr(Ops);
00595     }
00596     return nullptr;
00597   }
00598 
00599   // Check for a multiply operand that we can pull RHS out of.
00600   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
00601     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
00602       SmallVector<const SCEV *, 4> Ops;
00603       bool Found = false;
00604       for (const SCEV *S : Mul->operands()) {
00605         if (!Found)
00606           if (const SCEV *Q = getExactSDiv(S, RHS, SE,
00607                                            IgnoreSignificantBits)) {
00608             S = Q;
00609             Found = true;
00610           }
00611         Ops.push_back(S);
00612       }
00613       return Found ? SE.getMulExpr(Ops) : nullptr;
00614     }
00615     return nullptr;
00616   }
00617 
00618   // Otherwise we don't know.
00619   return nullptr;
00620 }
00621 
00622 /// If S involves the addition of a constant integer value, return that integer
00623 /// value, and mutate S to point to a new SCEV with that value excluded.
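/// For example (hypothetical SCEV), given S = (4 + %x), this returns 4 and
/// leaves S pointing at %x (after the zero constant folds away).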
00624 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
00625   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
00626     if (C->getAPInt().getMinSignedBits() <= 64) {
00627       S = SE.getConstant(C->getType(), 0);
00628       return C->getValue()->getSExtValue();
00629     }
00630   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00631     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
00632     int64_t Result = ExtractImmediate(NewOps.front(), SE);
00633     if (Result != 0)
00634       S = SE.getAddExpr(NewOps);
00635     return Result;
00636   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
00637     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
00638     int64_t Result = ExtractImmediate(NewOps.front(), SE);
00639     if (Result != 0)
00640       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
00641                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
00642                            SCEV::FlagAnyWrap);
00643     return Result;
00644   }
00645   return 0;
00646 }
00647 
00648 /// If S involves the addition of a GlobalValue address, return that symbol, and
00649 /// mutate S to point to a new SCEV with that value excluded.
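/// For example (hypothetical SCEV), given S = (%x + @gv), this returns @gv and
/// mutates S to a new SCEV covering just %x.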
00650 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
00651   if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
00652     if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
00653       S = SE.getConstant(GV->getType(), 0);
00654       return GV;
00655     }
00656   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00657     SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
00658     GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
00659     if (Result)
00660       S = SE.getAddExpr(NewOps);
00661     return Result;
00662   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
00663     SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
00664     GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
00665     if (Result)
00666       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
00667                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
00668                            SCEV::FlagAnyWrap);
00669     return Result;
00670   }
00671   return nullptr;
00672 }
00673 
00674 /// Returns true if the specified instruction is using the specified value as an
00675 /// address.
00676 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
00677   bool isAddress = isa<LoadInst>(Inst);
00678   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
00679     if (SI->getOperand(1) == OperandVal)
00680       isAddress = true;
00681   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
00682     // Addressing modes can also be folded into prefetches and a variety
00683     // of intrinsics.
00684     switch (II->getIntrinsicID()) {
00685       default: break;
00686       case Intrinsic::prefetch:
00687       case Intrinsic::x86_sse_storeu_ps:
00688       case Intrinsic::x86_sse2_storeu_pd:
00689       case Intrinsic::x86_sse2_storeu_dq:
00690       case Intrinsic::x86_sse2_storel_dq:
00691         if (II->getArgOperand(0) == OperandVal)
00692           isAddress = true;
00693         break;
00694     }
00695   }
00696   return isAddress;
00697 }
00698 
00699 /// Return the type of the memory being accessed.
00700 static MemAccessTy getAccessType(const Instruction *Inst) {
00701   MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
00702   if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
00703     AccessTy.MemTy = SI->getOperand(0)->getType();
00704     AccessTy.AddrSpace = SI->getPointerAddressSpace();
00705   } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
00706     AccessTy.AddrSpace = LI->getPointerAddressSpace();
00707   } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
00708     // Addressing modes can also be folded into prefetches and a variety
00709     // of intrinsics.
00710     switch (II->getIntrinsicID()) {
00711     default: break;
00712     case Intrinsic::x86_sse_storeu_ps:
00713     case Intrinsic::x86_sse2_storeu_pd:
00714     case Intrinsic::x86_sse2_storeu_dq:
00715     case Intrinsic::x86_sse2_storel_dq:
00716       AccessTy.MemTy = II->getArgOperand(0)->getType();
00717       break;
00718     }
00719   }
00720 
00721   // All pointers have the same requirements, so canonicalize them to an
00722   // arbitrary pointer type to minimize variation.
00723   if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
00724     AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
00725                                       PTy->getAddressSpace());
00726 
00727   return AccessTy;
00728 }
00729 
00730 /// Return true if this AddRec is already a phi in its loop.
00731 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
00732   for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
00733        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
00734     if (SE.isSCEVable(PN->getType()) &&
00735         (SE.getEffectiveSCEVType(PN->getType()) ==
00736          SE.getEffectiveSCEVType(AR->getType())) &&
00737         SE.getSCEV(PN) == AR)
00738       return true;
00739   }
00740   return false;
00741 }
00742 
00743 /// Check if expanding this expression is likely to incur significant cost. This
00744 /// is tricky because SCEV doesn't track which expressions are actually computed
00745 /// by the current IR.
00746 ///
00747 /// We currently allow expansion of IV increments that involve adds,
00748 /// multiplication by constants, and AddRecs from existing phis.
00749 ///
00750 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
00751 /// obvious multiple of the UDivExpr.
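/// For example (hypothetical SCEVs), (4 * %x) is cheap, {0,+,4}<%L> is cheap if
/// a phi already computes it, and (%a udiv %b) is treated as high cost.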
00752 static bool isHighCostExpansion(const SCEV *S,
00753                                 SmallPtrSetImpl<const SCEV*> &Processed,
00754                                 ScalarEvolution &SE) {
00755   // Zero/One operand expressions
00756   switch (S->getSCEVType()) {
00757   case scUnknown:
00758   case scConstant:
00759     return false;
00760   case scTruncate:
00761     return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
00762                                Processed, SE);
00763   case scZeroExtend:
00764     return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
00765                                Processed, SE);
00766   case scSignExtend:
00767     return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
00768                                Processed, SE);
00769   }
00770 
00771   if (!Processed.insert(S).second)
00772     return false;
00773 
00774   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
00775     for (const SCEV *S : Add->operands()) {
00776       if (isHighCostExpansion(S, Processed, SE))
00777         return true;
00778     }
00779     return false;
00780   }
00781 
00782   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
00783     if (Mul->getNumOperands() == 2) {
00784       // Multiplication by a constant is ok
00785       if (isa<SCEVConstant>(Mul->getOperand(0)))
00786         return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
00787 
00788       // If we have the value of one operand, check if an existing
00789       // multiplication already generates this expression.
00790       if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
00791         Value *UVal = U->getValue();
00792         for (User *UR : UVal->users()) {
00793           // If U is a constant, it may be used by a ConstantExpr.
00794           Instruction *UI = dyn_cast<Instruction>(UR);
00795           if (UI && UI->getOpcode() == Instruction::Mul &&
00796               SE.isSCEVable(UI->getType())) {
00797             return SE.getSCEV(UI) == Mul;
00798           }
00799         }
00800       }
00801     }
00802   }
00803 
00804   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
00805     if (isExistingPhi(AR, SE))
00806       return false;
00807   }
00808 
00809   // For now, consider any other type of expression (div/mul/min/max) high cost.
00810   return true;
00811 }
00812 
00813 /// If any of the instructions in the specified set are trivially dead, delete
00814 /// them and see if this makes any of their operands subsequently dead.
00815 static bool
00816 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
00817   bool Changed = false;
00818 
00819   while (!DeadInsts.empty()) {
00820     Value *V = DeadInsts.pop_back_val();
00821     Instruction *I = dyn_cast_or_null<Instruction>(V);
00822 
00823     if (!I || !isInstructionTriviallyDead(I))
00824       continue;
00825 
00826     for (Use &O : I->operands())
00827       if (Instruction *U = dyn_cast<Instruction>(O)) {
00828         O = nullptr;
00829         if (U->use_empty())
00830           DeadInsts.emplace_back(U);
00831       }
00832 
00833     I->eraseFromParent();
00834     Changed = true;
00835   }
00836 
00837   return Changed;
00838 }
00839 
00840 namespace {
00841 class LSRUse;
00842 }
00843 
00844 /// \brief Check if the addressing mode defined by \p F is completely
00845 /// folded in \p LU at isel time.
00846 /// This includes address-mode folding and special icmp tricks.
00847 /// This function returns true if \p LU can accommodate what \p F
00848 /// defines and up to 1 base + 1 scaled + offset.
00849 /// In other words, if \p F has several base registers, this function may
00850 /// still return true. Therefore, users still need to account for
00851 /// additional base registers and/or unfolded offsets to derive an
00852 /// accurate cost model.
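/// For example, on a target whose addressing modes accept
/// [base + scale * index + offset] (hypothetical), a formula with one base
/// register, one scaled register, and a small offset is completely folded,
/// whereas any additional base register or unfolded offset costs an extra add
/// that callers must account for separately.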
00853 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
00854                                  const LSRUse &LU, const Formula &F);
00855 // Get the cost of the scaling factor used in F for LU.
00856 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
00857                                      const LSRUse &LU, const Formula &F);
00858 
00859 namespace {
00860 
00861 /// This class is used to measure and compare candidate formulae.
00862 class Cost {
00863   /// TODO: Some of these could be merged. Also, a lexical ordering
00864   /// isn't always optimal.
00865   unsigned NumRegs;
00866   unsigned AddRecCost;
00867   unsigned NumIVMuls;
00868   unsigned NumBaseAdds;
00869   unsigned ImmCost;
00870   unsigned SetupCost;
00871   unsigned ScaleCost;
00872 
00873 public:
00874   Cost()
00875     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
00876       SetupCost(0), ScaleCost(0) {}
00877 
00878   bool operator<(const Cost &Other) const;
00879 
00880   void Lose();
00881 
00882 #ifndef NDEBUG
00883   // Once any of the metrics loses, they must all remain losers.
00884   bool isValid() {
00885     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
00886              | ImmCost | SetupCost | ScaleCost) != ~0u)
00887       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
00888            & ImmCost & SetupCost & ScaleCost) == ~0u);
00889   }
00890 #endif
00891 
00892   bool isLoser() {
00893     assert(isValid() && "invalid cost");
00894     return NumRegs == ~0u;
00895   }
00896 
00897   void RateFormula(const TargetTransformInfo &TTI,
00898                    const Formula &F,
00899                    SmallPtrSetImpl<const SCEV *> &Regs,
00900                    const DenseSet<const SCEV *> &VisitedRegs,
00901                    const Loop *L,
00902                    const SmallVectorImpl<int64_t> &Offsets,
00903                    ScalarEvolution &SE, DominatorTree &DT,
00904                    const LSRUse &LU,
00905                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
00906 
00907   void print(raw_ostream &OS) const;
00908   void dump() const;
00909 
00910 private:
00911   void RateRegister(const SCEV *Reg,
00912                     SmallPtrSetImpl<const SCEV *> &Regs,
00913                     const Loop *L,
00914                     ScalarEvolution &SE, DominatorTree &DT);
00915   void RatePrimaryRegister(const SCEV *Reg,
00916                            SmallPtrSetImpl<const SCEV *> &Regs,
00917                            const Loop *L,
00918                            ScalarEvolution &SE, DominatorTree &DT,
00919                            SmallPtrSetImpl<const SCEV *> *LoserRegs);
00920 };
00921 
00922 }
00923 
00924 /// Tally up interesting quantities from the given register.
00925 void Cost::RateRegister(const SCEV *Reg,
00926                         SmallPtrSetImpl<const SCEV *> &Regs,
00927                         const Loop *L,
00928                         ScalarEvolution &SE, DominatorTree &DT) {
00929   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
00930     // If this is an addrec for another loop, don't second-guess its addrec phi
00931     // nodes. LSR isn't currently smart enough to reason about more than one
00932     // loop at a time. LSR has already run on inner loops, will not run on outer
00933     // loops, and cannot be expected to change sibling loops.
00934     if (AR->getLoop() != L) {
00935       // If the AddRec exists, consider its register free and leave it alone.
00936       if (isExistingPhi(AR, SE))
00937         return;
00938 
00939       // Otherwise, do not consider this formula at all.
00940       Lose();
00941       return;
00942     }
00943     AddRecCost += 1; /// TODO: This should be a function of the stride.
00944 
00945     // Add the step value register, if it needs one.
00946     // TODO: The non-affine case isn't precisely modeled here.
00947     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
00948       if (!Regs.count(AR->getOperand(1))) {
00949         RateRegister(AR->getOperand(1), Regs, L, SE, DT);
00950         if (isLoser())
00951           return;
00952       }
00953     }
00954   }
00955   ++NumRegs;
00956 
00957   // Rough heuristic; favor registers which don't require extra setup
00958   // instructions in the preheader.
00959   if (!isa<SCEVUnknown>(Reg) &&
00960       !isa<SCEVConstant>(Reg) &&
00961       !(isa<SCEVAddRecExpr>(Reg) &&
00962         (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
00963          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
00964     ++SetupCost;
00965 
00966   NumIVMuls += isa<SCEVMulExpr>(Reg) &&
00967                SE.hasComputableLoopEvolution(Reg, L);
00968 }
00969 
00970 /// Record this register in the set. If we haven't seen it before, rate
00971 /// it. Optional LoserRegs provides a way to declare any formula that refers to
00972 /// one of those regs an instant loser.
00973 void Cost::RatePrimaryRegister(const SCEV *Reg,
00974                                SmallPtrSetImpl<const SCEV *> &Regs,
00975                                const Loop *L,
00976                                ScalarEvolution &SE, DominatorTree &DT,
00977                                SmallPtrSetImpl<const SCEV *> *LoserRegs) {
00978   if (LoserRegs && LoserRegs->count(Reg)) {
00979     Lose();
00980     return;
00981   }
00982   if (Regs.insert(Reg).second) {
00983     RateRegister(Reg, Regs, L, SE, DT);
00984     if (LoserRegs && isLoser())
00985       LoserRegs->insert(Reg);
00986   }
00987 }
00988 
00989 void Cost::RateFormula(const TargetTransformInfo &TTI,
00990                        const Formula &F,
00991                        SmallPtrSetImpl<const SCEV *> &Regs,
00992                        const DenseSet<const SCEV *> &VisitedRegs,
00993                        const Loop *L,
00994                        const SmallVectorImpl<int64_t> &Offsets,
00995                        ScalarEvolution &SE, DominatorTree &DT,
00996                        const LSRUse &LU,
00997                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
00998   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
00999   // Tally up the registers.
01000   if (const SCEV *ScaledReg = F.ScaledReg) {
01001     if (VisitedRegs.count(ScaledReg)) {
01002       Lose();
01003       return;
01004     }
01005     RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
01006     if (isLoser())
01007       return;
01008   }
01009   for (const SCEV *BaseReg : F.BaseRegs) {
01010     if (VisitedRegs.count(BaseReg)) {
01011       Lose();
01012       return;
01013     }
01014     RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
01015     if (isLoser())
01016       return;
01017   }
01018 
01019   // Determine how many (unfolded) adds we'll need inside the loop.
01020   size_t NumBaseParts = F.getNumRegs();
01021   if (NumBaseParts > 1)
01022     // Do not count the base and a possible second register if the target
01023     // allows folding two registers.
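    // For example (hypothetical): with three registers and a foldable scale,
    // the addressing mode absorbs two of them, leaving 3 - 2 = 1 unfolded add.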
01024     NumBaseAdds +=
01025         NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
01026   NumBaseAdds += (F.UnfoldedOffset != 0);
01027 
01028   // Accumulate non-free scaling amounts.
01029   ScaleCost += getScalingFactorCost(TTI, LU, F);
01030 
01031   // Tally up the non-zero immediates.
01032   for (int64_t O : Offsets) {
01033     int64_t Offset = (uint64_t)O + F.BaseOffset;
01034     if (F.BaseGV)
01035       ImmCost += 64; // Handle symbolic values conservatively.
01036                      // TODO: This should probably be the pointer size.
01037     else if (Offset != 0)
01038       ImmCost += APInt(64, Offset, true).getMinSignedBits();
01039   }
01040   assert(isValid() && "invalid cost");
01041 }
01042 
01043 /// Set this cost to a losing value.
01044 void Cost::Lose() {
01045   NumRegs = ~0u;
01046   AddRecCost = ~0u;
01047   NumIVMuls = ~0u;
01048   NumBaseAdds = ~0u;
01049   ImmCost = ~0u;
01050   SetupCost = ~0u;
01051   ScaleCost = ~0u;
01052 }
01053 
01054 /// Choose the lower cost.
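/// Costs are compared lexicographically, so, for example, a formula needing
/// three registers (hypothetical counts) is always preferred over one needing
/// four, regardless of their immediate or setup costs.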
01055 bool Cost::operator<(const Cost &Other) const {
01056   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
01057                   ImmCost, SetupCost) <
01058          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
01059                   Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
01060                   Other.SetupCost);
01061 }
01062 
01063 void Cost::print(raw_ostream &OS) const {
01064   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
01065   if (AddRecCost != 0)
01066     OS << ", with addrec cost " << AddRecCost;
01067   if (NumIVMuls != 0)
01068     OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
01069   if (NumBaseAdds != 0)
01070     OS << ", plus " << NumBaseAdds << " base add"
01071        << (NumBaseAdds == 1 ? "" : "s");
01072   if (ScaleCost != 0)
01073     OS << ", plus " << ScaleCost << " scale cost";
01074   if (ImmCost != 0)
01075     OS << ", plus " << ImmCost << " imm cost";
01076   if (SetupCost != 0)
01077     OS << ", plus " << SetupCost << " setup cost";
01078 }
01079 
01080 LLVM_DUMP_METHOD
01081 void Cost::dump() const {
01082   print(errs()); errs() << '\n';
01083 }
01084 
01085 namespace {
01086 
01087 /// An operand value in an instruction which is to be replaced with some
01088 /// equivalent, possibly strength-reduced, replacement.
01089 struct LSRFixup {
01090   /// The instruction which will be updated.
01091   Instruction *UserInst;
01092 
01093   /// The operand of the instruction which will be replaced. The operand may be
01094   /// used more than once; every instance will be replaced.
01095   Value *OperandValToReplace;
01096 
01097   /// If this user is to use the post-incremented value of an induction
01098   /// variable, this set is non-empty and holds the loops associated with the
01099   /// induction variable.
01100   PostIncLoopSet PostIncLoops;
01101 
01102   /// The index of the LSRUse describing the expression which this fixup needs,
01103   /// minus an offset (below).
01104   size_t LUIdx;
01105 
01106   /// A constant offset to be added to the LSRUse expression.  This allows
01107   /// multiple fixups to share the same LSRUse with different offsets, for
01108   /// example in an unrolled loop.
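  /// For example (hypothetical), in a loop unrolled by four over 4-byte
  /// elements, the fixups for a[i], a[i+1], a[i+2], and a[i+3] can share one
  /// LSRUse with Offsets of 0, 4, 8, and 12.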
01109   int64_t Offset;
01110 
01111   bool isUseFullyOutsideLoop(const Loop *L) const;
01112 
01113   LSRFixup();
01114 
01115   void print(raw_ostream &OS) const;
01116   void dump() const;
01117 };
01118 
01119 }
01120 
01121 LSRFixup::LSRFixup()
01122   : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
01123     Offset(0) {}
01124 
01125 /// Test whether this fixup always uses its value outside of the given loop.
01126 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
01127   // PHI nodes use their value in their incoming blocks.
01128   if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
01129     for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
01130       if (PN->getIncomingValue(i) == OperandValToReplace &&
01131           L->contains(PN->getIncomingBlock(i)))
01132         return false;
01133     return true;
01134   }
01135 
01136   return !L->contains(UserInst);
01137 }
01138 
01139 void LSRFixup::print(raw_ostream &OS) const {
01140   OS << "UserInst=";
01141   // Store is common and interesting enough to be worth special-casing.
01142   if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
01143     OS << "store ";
01144     Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
01145   } else if (UserInst->getType()->isVoidTy())
01146     OS << UserInst->getOpcodeName();
01147   else
01148     UserInst->printAsOperand(OS, /*PrintType=*/false);
01149 
01150   OS << ", OperandValToReplace=";
01151   OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
01152 
01153   for (const Loop *PIL : PostIncLoops) {
01154     OS << ", PostIncLoop=";
01155     PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
01156   }
01157 
01158   if (LUIdx != ~size_t(0))
01159     OS << ", LUIdx=" << LUIdx;
01160 
01161   if (Offset != 0)
01162     OS << ", Offset=" << Offset;
01163 }
01164 
01165 LLVM_DUMP_METHOD
01166 void LSRFixup::dump() const {
01167   print(errs()); errs() << '\n';
01168 }
01169 
01170 namespace {
01171 
01172 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
01173 /// SmallVectors of const SCEV*.
01174 struct UniquifierDenseMapInfo {
01175   static SmallVector<const SCEV *, 4> getEmptyKey() {
01176     SmallVector<const SCEV *, 4>  V;
01177     V.push_back(reinterpret_cast<const SCEV *>(-1));
01178     return V;
01179   }
01180 
01181   static SmallVector<const SCEV *, 4> getTombstoneKey() {
01182     SmallVector<const SCEV *, 4> V;
01183     V.push_back(reinterpret_cast<const SCEV *>(-2));
01184     return V;
01185   }
01186 
01187   static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
01188     return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
01189   }
01190 
01191   static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
01192                       const SmallVector<const SCEV *, 4> &RHS) {
01193     return LHS == RHS;
01194   }
01195 };
01196 
01197 /// This class holds the state that LSR keeps for each use in IVUsers, as well
01198 /// as uses invented by LSR itself. It includes information about what kinds of
01199 /// things can be folded into the user, information about the user itself, and
01200 /// information about how the use may be satisfied.  TODO: Represent multiple
01201 /// users of the same expression in common?
01202 class LSRUse {
01203   DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
01204 
01205 public:
01206   /// An enum for a kind of use, indicating what types of scaled and immediate
01207   /// operands it might support.
01208   enum KindType {
01209     Basic,   ///< A normal use, with no folding.
01210     Special, ///< A special case of basic, allowing -1 scales.
01211     Address, ///< An address use; folding according to TargetTransformInfo.
01212     ICmpZero ///< An equality icmp with both operands folded into one.
01213     // TODO: Add a generic icmp too?
01214   };
01215 
01216   typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
01217 
01218   KindType Kind;
01219   MemAccessTy AccessTy;
01220 
01221   SmallVector<int64_t, 8> Offsets;
01222   int64_t MinOffset;
01223   int64_t MaxOffset;
01224 
01225   /// This records whether all of the fixups using this LSRUse are outside of
01226   /// the loop, in which case some special-case heuristics may be used.
01227   bool AllFixupsOutsideLoop;
01228 
01229   /// RigidFormula is set to true to guarantee that this use will be associated
01230   /// with a single formula--the one that initially matched. Some SCEV
01231   /// expressions cannot be expanded. This allows LSR to consider the registers
01232   /// used by those expressions without the need to expand them later after
01233   /// changing the formula.
01234   bool RigidFormula;
01235 
01236   /// This records the widest use type for any fixup using this
01237   /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
01238   /// fixup widths to be equivalent, because the narrower one may be relying on
01239   /// the implicit truncation to truncate away bogus bits.
01240   Type *WidestFixupType;
01241 
01242   /// A list of ways to build a value that can satisfy this user.  After the
01243   /// list is populated, one of these is selected heuristically and used to
01244   /// formulate a replacement for OperandValToReplace in UserInst.
01245   SmallVector<Formula, 12> Formulae;
01246 
01247   /// The set of register candidates used by all formulae in this LSRUse.
01248   SmallPtrSet<const SCEV *, 4> Regs;
01249 
01250   LSRUse(KindType K, MemAccessTy AT)
01251       : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
01252         AllFixupsOutsideLoop(true), RigidFormula(false),
01253         WidestFixupType(nullptr) {}
01254 
01255   bool HasFormulaWithSameRegs(const Formula &F) const;
01256   bool InsertFormula(const Formula &F);
01257   void DeleteFormula(Formula &F);
01258   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
01259 
01260   void print(raw_ostream &OS) const;
01261   void dump() const;
01262 };
01263 
01264 }
01265 
01266 /// Test whether this use has a formula with the same registers as the given
01267 /// formula.
01268 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
01269   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
01270   if (F.ScaledReg) Key.push_back(F.ScaledReg);
01271   // Unstable sort by host order ok, because this is only used for uniquifying.
01272   std::sort(Key.begin(), Key.end());
01273   return Uniquifier.count(Key);
01274 }
01275 
01276 /// If the given formula has not yet been inserted, add it to the list, and
01277 /// return true. Return false otherwise.  The formula must be in canonical form.
01278 bool LSRUse::InsertFormula(const Formula &F) {
01279   assert(F.isCanonical() && "Invalid canonical representation");
01280 
01281   if (!Formulae.empty() && RigidFormula)
01282     return false;
01283 
01284   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
01285   if (F.ScaledReg) Key.push_back(F.ScaledReg);
01286   // Unstable sort by host order ok, because this is only used for uniquifying.
01287   std::sort(Key.begin(), Key.end());
01288 
01289   if (!Uniquifier.insert(Key).second)
01290     return false;
01291 
01292   // Using a register to hold the value of 0 is not profitable.
01293   assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
01294          "Zero allocated in a scaled register!");
01295 #ifndef NDEBUG
01296   for (const SCEV *BaseReg : F.BaseRegs)
01297     assert(!BaseReg->isZero() && "Zero allocated in a base register!");
01298 #endif
01299 
01300   // Add the formula to the list.
01301   Formulae.push_back(F);
01302 
01303   // Record registers now being used by this use.
01304   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
01305   if (F.ScaledReg)
01306     Regs.insert(F.ScaledReg);
01307 
01308   return true;
01309 }
01310 
01311 /// Remove the given formula from this use's list.
01312 void LSRUse::DeleteFormula(Formula &F) {
01313   if (&F != &Formulae.back())
01314     std::swap(F, Formulae.back());
01315   Formulae.pop_back();
01316 }
01317 
01318 /// Recompute the Regs field, and update RegUses.
01319 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
01320   // Now that we've filtered out some formulae, recompute the Regs set.
01321   SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
01322   Regs.clear();
01323   for (const Formula &F : Formulae) {
01324     if (F.ScaledReg) Regs.insert(F.ScaledReg);
01325     Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
01326   }
01327 
01328   // Update the RegTracker.
01329   for (const SCEV *S : OldRegs)
01330     if (!Regs.count(S))
01331       RegUses.dropRegister(S, LUIdx);
01332 }
01333 
01334 void LSRUse::print(raw_ostream &OS) const {
01335   OS << "LSR Use: Kind=";
01336   switch (Kind) {
01337   case Basic:    OS << "Basic"; break;
01338   case Special:  OS << "Special"; break;
01339   case ICmpZero: OS << "ICmpZero"; break;
01340   case Address:
01341     OS << "Address of ";
01342     if (AccessTy.MemTy->isPointerTy())
01343       OS << "pointer"; // the full pointer type could be really verbose
01344     else {
01345       OS << *AccessTy.MemTy;
01346     }
01347 
01348     OS << " in addrspace(" << AccessTy.AddrSpace << ')';
01349   }
01350 
01351   OS << ", Offsets={";
01352   bool NeedComma = false;
01353   for (int64_t O : Offsets) {
01354     if (NeedComma) OS << ',';
01355     OS << O;
01356     NeedComma = true;
01357   }
01358   OS << '}';
01359 
01360   if (AllFixupsOutsideLoop)
01361     OS << ", all-fixups-outside-loop";
01362 
01363   if (WidestFixupType)
01364     OS << ", widest fixup type: " << *WidestFixupType;
01365 }
01366 
01367 LLVM_DUMP_METHOD
01368 void LSRUse::dump() const {
01369   print(errs()); errs() << '\n';
01370 }
01371 
01372 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01373                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
01374                                  GlobalValue *BaseGV, int64_t BaseOffset,
01375                                  bool HasBaseReg, int64_t Scale) {
01376   switch (Kind) {
01377   case LSRUse::Address:
01378     return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
01379                                      HasBaseReg, Scale, AccessTy.AddrSpace);
01380 
01381   case LSRUse::ICmpZero:
01382     // There's not even a target hook for querying whether it would be legal to
01383     // fold a GV into an ICmp.
01384     if (BaseGV)
01385       return false;
01386 
01387     // ICmp only has two operands; don't allow more than two non-trivial parts.
01388     if (Scale != 0 && HasBaseReg && BaseOffset != 0)
01389       return false;
01390 
01391     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
01392     // putting the scaled register in the other operand of the icmp.
01393     if (Scale != 0 && Scale != -1)
01394       return false;
01395 
01396     // If we have low-level target information, ask the target if it can fold an
01397     // integer immediate on an icmp.
01398     if (BaseOffset != 0) {
01399       // We have one of:
01400       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
01401       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
01402       // Offs is the ICmp immediate.
01403       if (Scale == 0)
01404         // The cast does the right thing with INT64_MIN.
01405         BaseOffset = -(uint64_t)BaseOffset;
01406       return TTI.isLegalICmpImmediate(BaseOffset);
01407     }
01408 
01409     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
01410     return true;
01411 
01412   case LSRUse::Basic:
01413     // Only handle single-register values.
01414     return !BaseGV && Scale == 0 && BaseOffset == 0;
01415 
01416   case LSRUse::Special:
01417     // Special case Basic to handle -1 scales.
01418     return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
01419   }
01420 
01421   llvm_unreachable("Invalid LSRUse Kind!");
01422 }
01423 
01424 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01425                                  int64_t MinOffset, int64_t MaxOffset,
01426                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
01427                                  GlobalValue *BaseGV, int64_t BaseOffset,
01428                                  bool HasBaseReg, int64_t Scale) {
01429   // Check for overflow.
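  // (The sums below are performed in unsigned arithmetic; if adding the
  // offset moved the result in a direction inconsistent with the offset's
  // sign, the signed value wrapped and the combination is rejected.)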
01430   if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
01431       (MinOffset > 0))
01432     return false;
01433   MinOffset = (uint64_t)BaseOffset + MinOffset;
01434   if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
01435       (MaxOffset > 0))
01436     return false;
01437   MaxOffset = (uint64_t)BaseOffset + MaxOffset;
01438 
01439   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
01440                               HasBaseReg, Scale) &&
01441          isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
01442                               HasBaseReg, Scale);
01443 }
01444 
01445 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01446                                  int64_t MinOffset, int64_t MaxOffset,
01447                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
01448                                  const Formula &F) {
01449   // For the purpose of isAMCompletelyFolded, either having a canonical formula
01450   // or a scale not equal to zero is correct.
01451   // Problems may arise from non-canonical formulae having a scale == 0.
01452   // Strictly speaking, it would be best to just rely on canonical formulae.
01453   // However, when we generate the scaled formulae, we first check that the
01454   // scaling factor is profitable before computing the actual ScaledReg, for
01455   // compile time's sake.
01456   assert((F.isCanonical() || F.Scale != 0));
01457   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
01458                               F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
01459 }
01460 
01461 /// Test whether we know how to expand the current formula.
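/// For example, a formula with Scale == 1 whose scaled register the target
/// cannot fold is still expandable: the scaled register can be treated as
/// one more base register and the sum formed with an ordinary add.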
01462 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
01463                        int64_t MaxOffset, LSRUse::KindType Kind,
01464                        MemAccessTy AccessTy, GlobalValue *BaseGV,
01465                        int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
01466   // We know how to expand completely foldable formulae.
01467   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
01468                               BaseOffset, HasBaseReg, Scale) ||
01469          // Or formulae that use a base register produced by a sum of base
01470          // registers.
01471          (Scale == 1 &&
01472           isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
01473                                BaseGV, BaseOffset, true, 0));
01474 }
01475 
01476 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
01477                        int64_t MaxOffset, LSRUse::KindType Kind,
01478                        MemAccessTy AccessTy, const Formula &F) {
01479   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
01480                     F.BaseOffset, F.HasBaseReg, F.Scale);
01481 }
01482 
01483 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
01484                                  const LSRUse &LU, const Formula &F) {
01485   return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
01486                               LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
01487                               F.Scale);
01488 }
01489 
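/// Return the cost of using F's scaling factor for LU. No scale is free; a
/// scale the target cannot fold costs one unit unless the scale is 1; for a
/// completely folded Address use, the target reports the cost over the use's
/// full offset range; other completely folded kinds are free.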
01490 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
01491                                      const LSRUse &LU, const Formula &F) {
01492   if (!F.Scale)
01493     return 0;
01494 
01495   // If the use is not completely folded in that instruction, we will have to
01496   // pay an extra cost only for scale != 1.
01497   if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
01498                             LU.AccessTy, F))
01499     return F.Scale != 1;
01500 
01501   switch (LU.Kind) {
01502   case LSRUse::Address: {
01503     // Check the scaling factor cost with both the min and max offsets.
01504     int ScaleCostMinOffset = TTI.getScalingFactorCost(
01505         LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
01506         F.Scale, LU.AccessTy.AddrSpace);
01507     int ScaleCostMaxOffset = TTI.getScalingFactorCost(
01508         LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
01509         F.Scale, LU.AccessTy.AddrSpace);
01510 
01511     assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
01512            "Legal addressing mode has an illegal cost!");
01513     return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
01514   }
01515   case LSRUse::ICmpZero:
01516   case LSRUse::Basic:
01517   case LSRUse::Special:
01518     // The use is completely folded, i.e., everything is folded into the
01519     // instruction.
01520     return 0;
01521   }
01522 
01523   llvm_unreachable("Invalid LSRUse Kind!");
01524 }
01525 
01526 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
01527                              LSRUse::KindType Kind, MemAccessTy AccessTy,
01528                              GlobalValue *BaseGV, int64_t BaseOffset,
01529                              bool HasBaseReg) {
01530   // Fast-path: zero is always foldable.
01531   if (BaseOffset == 0 && !BaseGV) return true;
01532 
01533   // Conservatively, create an address with an immediate, a base, and a
01534   // scale.
01535   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
01536 
01537   // Canonicalize a scale of 1 to a base register if the formula doesn't
01538   // already have a base register.
01539   if (!HasBaseReg && Scale == 1) {
01540     Scale = 0;
01541     HasBaseReg = true;
01542   }
01543 
01544   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
01545                               HasBaseReg, Scale);
01546 }
01547 
01548 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
01549                              ScalarEvolution &SE, int64_t MinOffset,
01550                              int64_t MaxOffset, LSRUse::KindType Kind,
01551                              MemAccessTy AccessTy, const SCEV *S,
01552                              bool HasBaseReg) {
01553   // Fast-path: zero is always foldable.
01554   if (S->isZero()) return true;
01555 
01556   // Conservatively, create an address with an immediate, a base, and a
01557   // scale.
01558   int64_t BaseOffset = ExtractImmediate(S, SE);
01559   GlobalValue *BaseGV = ExtractSymbol(S, SE);
01560 
01561   // If there's anything else involved, it's not foldable.
01562   if (!S->isZero()) return false;
01563 
01564   // Fast-path: zero is always foldable.
01565   if (BaseOffset == 0 && !BaseGV) return true;
01566 
01567   // Conservatively, create an address with an immediate, a base, and a
01568   // scale.
01569   int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
01570 
01571   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
01572                               BaseOffset, HasBaseReg, Scale);
01573 }
01574 
01575 namespace {
01576 
01577 /// An individual increment in a Chain of IV increments.  Relate an IV user to
01578 /// an expression that computes the IV it uses from the IV used by the previous
01579 /// link in the Chain.
01580 ///
01581 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
01582 /// original IVOperand. The head of the chain's IVOperand is only valid during
01583 /// chain collection, before LSR replaces IV users. During chain generation,
01584 /// IncExpr can be used to find the new IVOperand that computes the same
01585 /// expression.
01586 struct IVInc {
01587   Instruction *UserInst;
01588   Value *IVOperand;
01589   const SCEV *IncExpr;
01590 
01591   IVInc(Instruction *U, Value *O, const SCEV *E):
01592     UserInst(U), IVOperand(O), IncExpr(E) {}
01593 };
01594 
01595 // The list of IV increments in program order.  We typically add the head of a
01596 // chain without finding subsequent links.
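//
// For example (illustrative): if a loop loads from %p and from (%p + 16) on
// every iteration, the two loads can form a chain whose second link has
// IncExpr == 16, so the second address is rewritten as a fixed offset from
// the first rather than kept as an independent induction variable.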
01597 struct IVChain {
01598   SmallVector<IVInc,1> Incs;
01599   const SCEV *ExprBase;
01600 
01601   IVChain() : ExprBase(nullptr) {}
01602 
01603   IVChain(const IVInc &Head, const SCEV *Base)
01604     : Incs(1, Head), ExprBase(Base) {}
01605 
01606   typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
01607 
01608   // Return the first increment in the chain.
01609   const_iterator begin() const {
01610     assert(!Incs.empty());
01611     return std::next(Incs.begin());
01612   }
01613   const_iterator end() const {
01614     return Incs.end();
01615   }
01616 
01617   // Returns true if this chain contains any increments.
01618   bool hasIncs() const { return Incs.size() >= 2; }
01619 
01620   // Add an IVInc to the end of this chain.
01621   void add(const IVInc &X) { Incs.push_back(X); }
01622 
01623   // Returns the last UserInst in the chain.
01624   Instruction *tailUserInst() const { return Incs.back().UserInst; }
01625 
01626   // Returns true if IncExpr can be profitably added to this chain.
01627   bool isProfitableIncrement(const SCEV *OperExpr,
01628                              const SCEV *IncExpr,
01629                              ScalarEvolution&);
01630 };
01631 
01632 /// Helper for CollectChains to track multiple IV increment uses.  Distinguish
01633 /// between FarUsers that definitely cross IV increments and NearUsers that may
01634 /// be used between IV increments.
01635 struct ChainUsers {
01636   SmallPtrSet<Instruction*, 4> FarUsers;
01637   SmallPtrSet<Instruction*, 4> NearUsers;
01638 };
01639 
01640 /// This class holds state for the main loop strength reduction logic.
01641 class LSRInstance {
01642   IVUsers &IU;
01643   ScalarEvolution &SE;
01644   DominatorTree &DT;
01645   LoopInfo &LI;
01646   const TargetTransformInfo &TTI;
01647   Loop *const L;
01648   bool Changed;
01649 
01650   /// This is the insert position that the current loop's induction variable
01651   /// increment should be placed. In simple loops, this is the latch block's
01652   /// terminator. But in more complicated cases, this is a position which will
01653   /// dominate all the in-loop post-increment users.
01654   Instruction *IVIncInsertPos;
01655 
01656   /// Interesting factors between use strides.
01657   SmallSetVector<int64_t, 8> Factors;
01658 
01659   /// Interesting use types, to facilitate truncation reuse.
01660   SmallSetVector<Type *, 4> Types;
01661 
01662   /// The list of operands which are to be replaced.
01663   SmallVector<LSRFixup, 16> Fixups;
01664 
01665   /// The list of interesting uses.
01666   SmallVector<LSRUse, 16> Uses;
01667 
01668   /// Track which uses use which register candidates.
01669   RegUseTracker RegUses;
01670 
01671   // Limit the number of chains to avoid quadratic behavior. We don't expect to
01672   // have more than a few IV increment chains in a loop. Missing a Chain falls
01673   // back to normal LSR behavior for those uses.
01674   static const unsigned MaxChains = 8;
01675 
01676   /// IV users can form a chain of IV increments.
01677   SmallVector<IVChain, MaxChains> IVChainVec;
01678 
01679   /// IV users that belong to profitable IVChains.
01680   SmallPtrSet<Use*, MaxChains> IVIncSet;
01681 
01682   void OptimizeShadowIV();
01683   bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
01684   ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
01685   void OptimizeLoopTermCond();
01686 
01687   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
01688                         SmallVectorImpl<ChainUsers> &ChainUsersVec);
01689   void FinalizeChain(IVChain &Chain);
01690   void CollectChains();
01691   void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
01692                        SmallVectorImpl<WeakVH> &DeadInsts);
01693 
01694   void CollectInterestingTypesAndFactors();
01695   void CollectFixupsAndInitialFormulae();
01696 
01697   LSRFixup &getNewFixup() {
01698     Fixups.push_back(LSRFixup());
01699     return Fixups.back();
01700   }
01701 
01702   // Support for sharing of LSRUses between LSRFixups.
01703   typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
01704   UseMapTy UseMap;
01705 
01706   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
01707                           LSRUse::KindType Kind, MemAccessTy AccessTy);
01708 
01709   std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
01710                                     MemAccessTy AccessTy);
01711 
01712   void DeleteUse(LSRUse &LU, size_t LUIdx);
01713 
01714   LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
01715 
01716   void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
01717   void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
01718   void CountRegisters(const Formula &F, size_t LUIdx);
01719   bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
01720 
01721   void CollectLoopInvariantFixupsAndFormulae();
01722 
01723   void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
01724                               unsigned Depth = 0);
01725 
01726   void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
01727                                   const Formula &Base, unsigned Depth,
01728                                   size_t Idx, bool IsScaledReg = false);
01729   void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
01730   void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
01731                                    const Formula &Base, size_t Idx,
01732                                    bool IsScaledReg = false);
01733   void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
01734   void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
01735                                    const Formula &Base,
01736                                    const SmallVectorImpl<int64_t> &Worklist,
01737                                    size_t Idx, bool IsScaledReg = false);
01738   void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
01739   void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
01740   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
01741   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
01742   void GenerateCrossUseConstantOffsets();
01743   void GenerateAllReuseFormulae();
01744 
01745   void FilterOutUndesirableDedicatedRegisters();
01746 
01747   size_t EstimateSearchSpaceComplexity() const;
01748   void NarrowSearchSpaceByDetectingSupersets();
01749   void NarrowSearchSpaceByCollapsingUnrolledCode();
01750   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
01751   void NarrowSearchSpaceByPickingWinnerRegs();
01752   void NarrowSearchSpaceUsingHeuristics();
01753 
01754   void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
01755                     Cost &SolutionCost,
01756                     SmallVectorImpl<const Formula *> &Workspace,
01757                     const Cost &CurCost,
01758                     const SmallPtrSet<const SCEV *, 16> &CurRegs,
01759                     DenseSet<const SCEV *> &VisitedRegs) const;
01760   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
01761 
01762   BasicBlock::iterator
01763     HoistInsertPosition(BasicBlock::iterator IP,
01764                         const SmallVectorImpl<Instruction *> &Inputs) const;
01765   BasicBlock::iterator
01766     AdjustInsertPositionForExpand(BasicBlock::iterator IP,
01767                                   const LSRFixup &LF,
01768                                   const LSRUse &LU,
01769                                   SCEVExpander &Rewriter) const;
01770 
01771   Value *Expand(const LSRFixup &LF,
01772                 const Formula &F,
01773                 BasicBlock::iterator IP,
01774                 SCEVExpander &Rewriter,
01775                 SmallVectorImpl<WeakVH> &DeadInsts) const;
01776   void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
01777                      const Formula &F,
01778                      SCEVExpander &Rewriter,
01779                      SmallVectorImpl<WeakVH> &DeadInsts) const;
01780   void Rewrite(const LSRFixup &LF,
01781                const Formula &F,
01782                SCEVExpander &Rewriter,
01783                SmallVectorImpl<WeakVH> &DeadInsts) const;
01784   void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
01785 
01786 public:
01787   LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
01788               LoopInfo &LI, const TargetTransformInfo &TTI);
01789 
01790   bool getChanged() const { return Changed; }
01791 
01792   void print_factors_and_types(raw_ostream &OS) const;
01793   void print_fixups(raw_ostream &OS) const;
01794   void print_uses(raw_ostream &OS) const;
01795   void print(raw_ostream &OS) const;
01796   void dump() const;
01797 };
01798 
01799 }
01800 
01801 /// If IV is used in an int-to-float cast inside the loop then try to eliminate
01802 /// the cast operation.
01803 void LSRInstance::OptimizeShadowIV() {
01804   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
01805   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
01806     return;
01807 
01808   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
01809        UI != E; /* empty */) {
01810     IVUsers::const_iterator CandidateUI = UI;
01811     ++UI;
01812     Instruction *ShadowUse = CandidateUI->getUser();
01813     Type *DestTy = nullptr;
01814     bool IsSigned = false;
01815 
01816     /* If shadow use is an int->float cast then insert a second IV
01817        to eliminate this cast.
01818 
01819          for (unsigned i = 0; i < n; ++i)
01820            foo((double)i);
01821 
01822        is transformed into
01823 
01824          double d = 0.0;
01825          for (unsigned i = 0; i < n; ++i, ++d)
01826            foo(d);
01827     */
01828     if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
01829       IsSigned = false;
01830       DestTy = UCast->getDestTy();
01831     }
01832     else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
01833       IsSigned = true;
01834       DestTy = SCast->getDestTy();
01835     }
01836     if (!DestTy) continue;
01837 
01838     // If target does not support DestTy natively then do not apply
01839     // this transformation.
01840     if (!TTI.isTypeLegal(DestTy)) continue;
01841 
01842     PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
01843     if (!PH) continue;
01844     if (PH->getNumIncomingValues() != 2) continue;
01845 
01846     Type *SrcTy = PH->getType();
01847     int Mantissa = DestTy->getFPMantissaWidth();
01848     if (Mantissa == -1) continue;
01849     if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
01850       continue;
01851 
01852     unsigned Entry, Latch;
01853     if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
01854       Entry = 0;
01855       Latch = 1;
01856     } else {
01857       Entry = 1;
01858       Latch = 0;
01859     }
01860 
01861     ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
01862     if (!Init) continue;
01863     Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
01864                                         (double)Init->getSExtValue() :
01865                                         (double)Init->getZExtValue());
01866 
01867     BinaryOperator *Incr =
01868       dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
01869     if (!Incr) continue;
01870     if (Incr->getOpcode() != Instruction::Add
01871         && Incr->getOpcode() != Instruction::Sub)
01872       continue;
01873 
01874     /* Initialize new IV, double d = 0.0 in above example. */
01875     ConstantInt *C = nullptr;
01876     if (Incr->getOperand(0) == PH)
01877       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
01878     else if (Incr->getOperand(1) == PH)
01879       C = dyn_cast<ConstantInt>(Incr->getOperand(0));
01880     else
01881       continue;
01882 
01883     if (!C) continue;
01884 
01885     // Ignore negative constants, as the code below doesn't handle them
01886     // correctly. TODO: Remove this restriction.
01887     if (!C->getValue().isStrictlyPositive()) continue;
01888 
01889     /* Add new PHINode. */
01890     PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
01891 
01892     /* create new increment. '++d' in above example. */
01893     Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
01894     BinaryOperator *NewIncr =
01895       BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
01896                                Instruction::FAdd : Instruction::FSub,
01897                              NewPH, CFP, "IV.S.next.", Incr);
01898 
01899     NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
01900     NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
01901 
01902     /* Remove cast operation */
01903     ShadowUse->replaceAllUsesWith(NewPH);
01904     ShadowUse->eraseFromParent();
01905     Changed = true;
01906     break;
01907   }
01908 }
01909 
01910 /// If Cond has an operand that is an expression of an IV, set the IV user and
01911 /// stride information and return true, otherwise return false.
01912 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
01913   for (IVStrideUse &U : IU)
01914     if (U.getUser() == Cond) {
01915       // NOTE: we could handle setcc instructions with multiple uses here, but
01916       // InstCombine does it as well for simple uses, it's not clear that it
01917       // occurs enough in real life to handle.
01918       CondUse = &U;
01919       return true;
01920     }
01921   return false;
01922 }
01923 
01924 /// Rewrite the loop's terminating condition if it uses a max computation.
01925 ///
01926 /// This is a narrow solution to a specific, but acute, problem. For loops
01927 /// like this:
01928 ///
01929 ///   i = 0;
01930 ///   do {
01931 ///     p[i] = 0.0;
01932 ///   } while (++i < n);
01933 ///
01934 /// the trip count isn't just 'n', because 'n' might not be positive. And
01935 /// unfortunately this can come up even for loops where the user didn't use
01936 /// a C do-while loop. For example, seemingly well-behaved top-test loops
01937 /// will commonly be lowered like this:
01938 ///
01939 ///   if (n > 0) {
01940 ///     i = 0;
01941 ///     do {
01942 ///       p[i] = 0.0;
01943 ///     } while (++i < n);
01944 ///   }
01945 ///
01946 /// and then it's possible for subsequent optimization to obscure the if
01947 /// test in such a way that indvars can't find it.
01948 ///
01949 /// When indvars can't find the if test in loops like this, it creates a
01950 /// max expression, which allows it to give the loop a canonical
01951 /// induction variable:
01952 ///
01953 ///   i = 0;
01954 ///   max = n < 1 ? 1 : n;
01955 ///   do {
01956 ///     p[i] = 0.0;
01957 ///   } while (++i != max);
01958 ///
01959 /// Canonical induction variables are necessary because the loop passes
01960 /// are designed around them. The most obvious example of this is the
01961 /// LoopInfo analysis, which doesn't remember trip count values. It
01962 /// expects to be able to rediscover the trip count each time it is
01963 /// needed, and it does this using a simple analysis that only succeeds if
01964 /// the loop has a canonical induction variable.
01965 ///
01966 /// However, when it comes time to generate code, the maximum operation
01967 /// can be quite costly, especially if it's inside of an outer loop.
01968 ///
01969 /// This function solves this problem by detecting this type of loop and
01970 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
01971 /// the instructions for the maximum computation.
01972 ///
01973 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
01974   // Check that the loop matches the pattern we're looking for.
01975   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
01976       Cond->getPredicate() != CmpInst::ICMP_NE)
01977     return Cond;
01978 
01979   SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
01980   if (!Sel || !Sel->hasOneUse()) return Cond;
01981 
01982   const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
01983   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
01984     return Cond;
01985   const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
01986 
01987   // Add one to the backedge-taken count to get the trip count.
01988   const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
01989   if (IterationCount != SE.getSCEV(Sel)) return Cond;
01990 
01991   // Check for a max calculation that matches the pattern. There's no check
01992   // for ICMP_ULE here because the comparison would be with zero, which
01993   // isn't interesting.
01994   CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
01995   const SCEVNAryExpr *Max = nullptr;
01996   if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
01997     Pred = ICmpInst::ICMP_SLE;
01998     Max = S;
01999   } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
02000     Pred = ICmpInst::ICMP_SLT;
02001     Max = S;
02002   } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
02003     Pred = ICmpInst::ICMP_ULT;
02004     Max = U;
02005   } else {
02006     // No match; bail.
02007     return Cond;
02008   }
02009 
02010   // To handle a max with more than two operands, this optimization would
02011   // require additional checking and setup.
02012   if (Max->getNumOperands() != 2)
02013     return Cond;
02014 
02015   const SCEV *MaxLHS = Max->getOperand(0);
02016   const SCEV *MaxRHS = Max->getOperand(1);
02017 
02018   // ScalarEvolution canonicalizes constants to the left. For < and >, look
02019   // for a comparison with 1. For <= and >=, a comparison with zero.
02020   if (!MaxLHS ||
02021       (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
02022     return Cond;
02023 
02024   // Check the relevant induction variable for conformance to
02025   // the pattern.
02026   const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
02027   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
02028   if (!AR || !AR->isAffine() ||
02029       AR->getStart() != One ||
02030       AR->getStepRecurrence(SE) != One)
02031     return Cond;
02032 
02033   assert(AR->getLoop() == L &&
02034          "Loop condition operand is an addrec in a different loop!");
02035 
02036   // Check the right operand of the select, and remember it, as it will
02037   // be used in the new comparison instruction.
02038   Value *NewRHS = nullptr;
02039   if (ICmpInst::isTrueWhenEqual(Pred)) {
02040     // Look for n+1, and grab n.
02041     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
02042       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
02043          if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
02044            NewRHS = BO->getOperand(0);
02045     if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
02046       if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
02047         if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
02048           NewRHS = BO->getOperand(0);
02049     if (!NewRHS)
02050       return Cond;
02051   } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
02052     NewRHS = Sel->getOperand(1);
02053   else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
02054     NewRHS = Sel->getOperand(2);
02055   else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
02056     NewRHS = SU->getValue();
02057   else
02058     // Max doesn't match expected pattern.
02059     return Cond;
02060 
02061   // Determine the new comparison opcode. It may be signed or unsigned,
02062   // and the original comparison may be either equality or inequality.
02063   if (Cond->getPredicate() == CmpInst::ICMP_EQ)
02064     Pred = CmpInst::getInversePredicate(Pred);
02065 
02066   // Ok, everything looks ok to change the condition into an SLT or SGE and
02067   // delete the max calculation.
02068   ICmpInst *NewCond =
02069     new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
02070 
02071   // Delete the max calculation instructions.
02072   Cond->replaceAllUsesWith(NewCond);
02073   CondUse->setUser(NewCond);
02074   Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
02075   Cond->eraseFromParent();
02076   Sel->eraseFromParent();
02077   if (Cmp->use_empty())
02078     Cmp->eraseFromParent();
02079   return NewCond;
02080 }
02081 
02082 /// Change loop terminating condition to use the postinc iv when possible.
02083 void
02084 LSRInstance::OptimizeLoopTermCond() {
02085   SmallPtrSet<Instruction *, 4> PostIncs;
02086 
02087   BasicBlock *LatchBlock = L->getLoopLatch();
02088   SmallVector<BasicBlock*, 8> ExitingBlocks;
02089   L->getExitingBlocks(ExitingBlocks);
02090 
02091   for (BasicBlock *ExitingBlock : ExitingBlocks) {
02092 
02093     // Get the terminating condition for the loop if possible.  If we
02094     // can, we want to change it to use a post-incremented version of its
02095     // induction variable, to allow coalescing the live ranges for the IV into
02096     // one register value.
02097 
02098     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
02099     if (!TermBr)
02100       continue;
02101     // FIXME: Overly conservative, termination condition could be an 'or' etc..
02102     if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
02103       continue;
02104 
02105     // Search IVUsesByStride to find Cond's IVUse if there is one.
02106     IVStrideUse *CondUse = nullptr;
02107     ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
02108     if (!FindIVUserForCond(Cond, CondUse))
02109       continue;
02110 
02111     // If the trip count is computed in terms of a max (due to ScalarEvolution
02112     // being unable to find a sufficient guard, for example), change the loop
02113     // comparison to use SLT or ULT instead of NE.
02114     // One consequence of doing this now is that it disrupts the count-down
02115     // optimization. That's not always a bad thing though, because in such
02116     // cases it may still be worthwhile to avoid a max.
02117     Cond = OptimizeMax(Cond, CondUse);
02118 
02119     // If this exiting block dominates the latch block, it may also use
02120     // the post-inc value if it won't be shared with other uses.
02121     // Check for dominance.
02122     if (!DT.dominates(ExitingBlock, LatchBlock))
02123       continue;
02124 
02125     // Conservatively avoid trying to use the post-inc value in non-latch
02126     // exits if there may be pre-inc users in intervening blocks.
02127     if (LatchBlock != ExitingBlock)
02128       for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
02129         // Test if the use is reachable from the exiting block. This dominator
02130         // query is a conservative approximation of reachability.
02131         if (&*UI != CondUse &&
02132             !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
02133           // Conservatively assume there may be reuse if the quotient of their
02134           // strides could be a legal scale.
02135           const SCEV *A = IU.getStride(*CondUse, L);
02136           const SCEV *B = IU.getStride(*UI, L);
02137           if (!A || !B) continue;
02138           if (SE.getTypeSizeInBits(A->getType()) !=
02139               SE.getTypeSizeInBits(B->getType())) {
02140             if (SE.getTypeSizeInBits(A->getType()) >
02141                 SE.getTypeSizeInBits(B->getType()))
02142               B = SE.getSignExtendExpr(B, A->getType());
02143             else
02144               A = SE.getSignExtendExpr(A, B->getType());
02145           }
02146           if (const SCEVConstant *D =
02147                 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
02148             const ConstantInt *C = D->getValue();
02149             // Stride of one or negative one can have reuse with non-addresses.
02150             if (C->isOne() || C->isAllOnesValue())
02151               goto decline_post_inc;
02152             // Avoid weird situations.
02153             if (C->getValue().getMinSignedBits() >= 64 ||
02154                 C->getValue().isMinSignedValue())
02155               goto decline_post_inc;
02156             // Check for possible scaled-address reuse.
02157             MemAccessTy AccessTy = getAccessType(UI->getUser());
02158             int64_t Scale = C->getSExtValue();
02159             if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
02160                                           /*BaseOffset=*/0,
02161                                           /*HasBaseReg=*/false, Scale,
02162                                           AccessTy.AddrSpace))
02163               goto decline_post_inc;
02164             Scale = -Scale;
02165             if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
02166                                           /*BaseOffset=*/0,
02167                                           /*HasBaseReg=*/false, Scale,
02168                                           AccessTy.AddrSpace))
02169               goto decline_post_inc;
02170           }
02171         }
02172 
02173     DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
02174                  << *Cond << '\n');
02175 
02176     // It's possible for the setcc instruction to be anywhere in the loop, and
02177     // possible for it to have multiple users.  If it is not immediately before
02178     // the exiting block branch, move it.
02179     if (&*++BasicBlock::iterator(Cond) != TermBr) {
02180       if (Cond->hasOneUse()) {
02181         Cond->moveBefore(TermBr);
02182       } else {
02183         // Clone the terminating condition and insert it into the loop end.
02184         ICmpInst *OldCond = Cond;
02185         Cond = cast<ICmpInst>(Cond->clone());
02186         Cond->setName(L->getHeader()->getName() + ".termcond");
02187         ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
02188 
02189         // Clone the IVUse, as the old use still exists!
02190         CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
02191         TermBr->replaceUsesOfWith(OldCond, Cond);
02192       }
02193     }
02194 
02195     // If we get to here, we know that we can transform the setcc instruction to
02196     // use the post-incremented version of the IV, allowing us to coalesce the
02197     // live ranges for the IV correctly.
02198     CondUse->transformToPostInc(L);
02199     Changed = true;
02200 
02201     PostIncs.insert(Cond);
02202   decline_post_inc:;
02203   }
02204 
02205   // Determine an insertion point for the loop induction variable increment. It
02206   // must dominate all the post-inc comparisons we just set up, and it must
02207   // dominate the loop latch edge.
02208   IVIncInsertPos = L->getLoopLatch()->getTerminator();
02209   for (Instruction *Inst : PostIncs) {
02210     BasicBlock *BB =
02211       DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
02212                                     Inst->getParent());
02213     if (BB == Inst->getParent())
02214       IVIncInsertPos = Inst;
02215     else if (BB != IVIncInsertPos->getParent())
02216       IVIncInsertPos = BB->getTerminator();
02217   }
02218 }
02219 
02220 /// Determine if the given use can accommodate a fixup at the given offset and
02221 /// other details. If so, update the use and return true.
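/// For example (illustrative), if an Address use currently spans offsets
/// [0, 64] and a new fixup needs offset 72, the use is widened to cover 72
/// only if the target can still fold an offset of 72 - 0 into the addressing
/// mode; otherwise the caller falls back to creating a separate use.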
02222 bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
02223                                      bool HasBaseReg, LSRUse::KindType Kind,
02224                                      MemAccessTy AccessTy) {
02225   int64_t NewMinOffset = LU.MinOffset;
02226   int64_t NewMaxOffset = LU.MaxOffset;
02227   MemAccessTy NewAccessTy = AccessTy;
02228 
02229   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
02230   // something conservative, however this can pessimize in the case that one of
02231   // the uses will have all its uses outside the loop, for example.
02232   if (LU.Kind != Kind)
02233     return false;
02234 
02235   // Check for a mismatched access type, and fall back conservatively as needed.
02236   // TODO: Be less conservative when the type is similar and can use the same
02237   // addressing modes.
02238   if (Kind == LSRUse::Address) {
02239     if (AccessTy != LU.AccessTy)
02240       NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext());
02241   }
02242 
02243   // Conservatively assume HasBaseReg is true for now.
02244   if (NewOffset < LU.MinOffset) {
02245     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
02246                           LU.MaxOffset - NewOffset, HasBaseReg))
02247       return false;
02248     NewMinOffset = NewOffset;
02249   } else if (NewOffset > LU.MaxOffset) {
02250     if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
02251                           NewOffset - LU.MinOffset, HasBaseReg))
02252       return false;
02253     NewMaxOffset = NewOffset;
02254   }
02255 
02256   // Update the use.
02257   LU.MinOffset = NewMinOffset;
02258   LU.MaxOffset = NewMaxOffset;
02259   LU.AccessTy = NewAccessTy;
02260   if (NewOffset != LU.Offsets.back())
02261     LU.Offsets.push_back(NewOffset);
02262   return true;
02263 }
02264 
02265 /// Return an LSRUse index and an offset value for a fixup which needs the given
02266 /// expression, with the given kind and optional access type.  Either reuse an
02267 /// existing use or create a new one, as needed.
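/// For example (illustrative), fixups whose expressions are {%p,+,4} + 16 and
/// {%p,+,4} + 24 both have their constant offsets stripped and map to one
/// LSRUse keyed on {%p,+,4}, recording offsets 16 and 24, so formulae are
/// generated once for the shared base expression.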
02268 std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
02269                                                LSRUse::KindType Kind,
02270                                                MemAccessTy AccessTy) {
02271   const SCEV *Copy = Expr;
02272   int64_t Offset = ExtractImmediate(Expr, SE);
02273 
02274   // Basic uses can't accept any offset, for example.
02275   if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
02276                         Offset, /*HasBaseReg=*/ true)) {
02277     Expr = Copy;
02278     Offset = 0;
02279   }
02280 
02281   std::pair<UseMapTy::iterator, bool> P =
02282     UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
02283   if (!P.second) {
02284     // A use already existed with this base.
02285     size_t LUIdx = P.first->second;
02286     LSRUse &LU = Uses[LUIdx];
02287     if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
02288       // Reuse this use.
02289       return std::make_pair(LUIdx, Offset);
02290   }
02291 
02292   // Create a new use.
02293   size_t LUIdx = Uses.size();
02294   P.first->second = LUIdx;
02295   Uses.push_back(LSRUse(Kind, AccessTy));
02296   LSRUse &LU = Uses[LUIdx];
02297 
02298   // We don't need to track redundant offsets, but we don't need to go out
02299   // of our way here to avoid them.
02300   if (LU.Offsets.empty() || Offset != LU.Offsets.back())
02301     LU.Offsets.push_back(Offset);
02302 
02303   LU.MinOffset = Offset;
02304   LU.MaxOffset = Offset;
02305   return std::make_pair(LUIdx, Offset);
02306 }
02307 
02308 /// Delete the given use from the Uses list.
02309 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
02310   if (&LU != &Uses.back())
02311     std::swap(LU, Uses.back());
02312   Uses.pop_back();
02313 
02314   // Update RegUses.
02315   RegUses.swapAndDropUse(LUIdx, Uses.size());
02316 }
02317 
02318 /// Look for a use distinct from OrigLU which has a formula with the same
02319 /// registers as the given formula.
02320 LSRUse *
02321 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
02322                                        const LSRUse &OrigLU) {
02323   // Search all uses for the formula. This could be more clever.
02324   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
02325     LSRUse &LU = Uses[LUIdx];
02326     // Check whether this use is close enough to OrigLU, to see whether it's
02327     // worthwhile looking through its formulae.
02328     // Ignore ICmpZero uses because they may contain formulae generated by
02329     // GenerateICmpZeroScales, in which case adding fixup offsets may
02330     // be invalid.
02331     if (&LU != &OrigLU &&
02332         LU.Kind != LSRUse::ICmpZero &&
02333         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
02334         LU.WidestFixupType == OrigLU.WidestFixupType &&
02335         LU.HasFormulaWithSameRegs(OrigF)) {
02336       // Scan through this use's formulae.
02337       for (const Formula &F : LU.Formulae) {
02338         // Check to see if this formula has the same registers and symbols
02339         // as OrigF.
02340         if (F.BaseRegs == OrigF.BaseRegs &&
02341             F.ScaledReg == OrigF.ScaledReg &&
02342             F.BaseGV == OrigF.BaseGV &&
02343             F.Scale == OrigF.Scale &&
02344             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
02345           if (F.BaseOffset == 0)
02346             return &LU;
02347           // This is the formula where all the registers and symbols matched;
02348           // there aren't going to be any others. Since we declined it, we
02349           // can skip the rest of the formulae and proceed to the next LSRUse.
02350           break;
02351         }
02352       }
02353     }
02354   }
02355 
02356   // Nothing looked good.
02357   return nullptr;
02358 }
02359 
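// Factors are the exact integer quotients between pairs of interesting
// strides. For example (illustrative), strides of 4 and 8 in the same loop
// contribute a factor of 2, which later lets the formula generators consider
// reusing one induction variable, scaled by 2, to serve both strides.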
02360 void LSRInstance::CollectInterestingTypesAndFactors() {
02361   SmallSetVector<const SCEV *, 4> Strides;
02362 
02363   // Collect interesting types and strides.
02364   SmallVector<const SCEV *, 4> Worklist;
02365   for (const IVStrideUse &U : IU) {
02366     const SCEV *Expr = IU.getExpr(U);
02367 
02368     // Collect interesting types.
02369     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
02370 
02371     // Add strides for mentioned loops.
02372     Worklist.push_back(Expr);
02373     do {
02374       const SCEV *S = Worklist.pop_back_val();
02375       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
02376         if (AR->getLoop() == L)
02377           Strides.insert(AR->getStepRecurrence(SE));
02378         Worklist.push_back(AR->getStart());
02379       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
02380         Worklist.append(Add->op_begin(), Add->op_end());
02381       }
02382     } while (!Worklist.empty());
02383   }
02384 
02385   // Compute interesting factors from the set of interesting strides.
02386   for (SmallSetVector<const SCEV *, 4>::const_iterator
02387        I = Strides.begin(), E = Strides.end(); I != E; ++I)
02388     for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
02389          std::next(I); NewStrideIter != E; ++NewStrideIter) {
02390       const SCEV *OldStride = *I;
02391       const SCEV *NewStride = *NewStrideIter;
02392 
02393       if (SE.getTypeSizeInBits(OldStride->getType()) !=
02394           SE.getTypeSizeInBits(NewStride->getType())) {
02395         if (SE.getTypeSizeInBits(OldStride->getType()) >
02396             SE.getTypeSizeInBits(NewStride->getType()))
02397           NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
02398         else
02399           OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
02400       }
02401       if (const SCEVConstant *Factor =
02402             dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
02403                                                         SE, true))) {
02404         if (Factor->getAPInt().getMinSignedBits() <= 64)
02405           Factors.insert(Factor->getAPInt().getSExtValue());
02406       } else if (const SCEVConstant *Factor =
02407                    dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
02408                                                                NewStride,
02409                                                                SE, true))) {
02410         if (Factor->getAPInt().getMinSignedBits() <= 64)
02411           Factors.insert(Factor->getAPInt().getSExtValue());
02412       }
02413     }
02414 
02415   // If all uses use the same type, don't bother looking for truncation-based
02416   // reuse.
02417   if (Types.size() == 1)
02418     Types.clear();
02419 
02420   DEBUG(print_factors_and_types(dbgs()));
02421 }
02422 
02423 /// Helper for CollectChains that finds an IV operand (computed by an AddRec in
02424 /// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
02425 /// IVStrideUses, we could partially skip this.
02426 static User::op_iterator
02427 findIVOperand(User::op_iterator OI, User::op_iterator OE,
02428               Loop *L, ScalarEvolution &SE) {
02429   for (; OI != OE; ++OI) {
02430     if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
02431       if (!SE.isSCEVable(Oper->getType()))
02432         continue;
02433 
02434       if (const SCEVAddRecExpr *AR =
02435           dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
02436         if (AR->getLoop() == L)
02437           break;
02438       }
02439     }
02440   }
02441   return OI;
02442 }
02443 
02444 /// IVChain logic must consistently peek base TruncInst operands, so wrap it in
02445 /// a convenient helper.
02446 static Value *getWideOperand(Value *Oper) {
02447   if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
02448     return Trunc->getOperand(0);
02449   return Oper;
02450 }
02451 
02452 /// Return true if we allow an IV chain to include both types.
02453 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
02454   Type *LType = LVal->getType();
02455   Type *RType = RVal->getType();
02456   return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
02457 }
02458 
02459 /// Return an approximation of this SCEV expression's "base", or NULL for any
02460 /// constant. Returning the expression itself is conservative. Returning a
02461 /// deeper subexpression is more precise and valid as long as it isn't less
02462 /// complex than another subexpression. For expressions involving multiple
02463 /// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
02464 /// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
02465 /// IVInc==b-a.
02466 ///
02467 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
02468 /// SCEVUnknown, we simply return the rightmost SCEV operand.
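/// For example (illustrative), getExprBase((4 * %i) + %object) is %object and
/// getExprBase({(8 + %p),+,4}) is %p: constants and scaled subexpressions are
/// skipped, leaving the unscaled value that identifies the underlying object.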
02469 static const SCEV *getExprBase(const SCEV *S) {
02470   switch (S->getSCEVType()) {
02471   default: // including scUnknown.
02472     return S;
02473   case scConstant:
02474     return nullptr;
02475   case scTruncate:
02476     return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
02477   case scZeroExtend:
02478     return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
02479   case scSignExtend:
02480     return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
02481   case scAddExpr: {
02482     // Skip over scaled operands (scMulExpr) to follow add operands as long as
02483     // there's nothing more complex.
02484     // FIXME: not sure if we want to recognize negation.
02485     const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
02486     for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
02487            E(Add->op_begin()); I != E; ++I) {
02488       const SCEV *SubExpr = *I;
02489       if (SubExpr->getSCEVType() == scAddExpr)
02490         return getExprBase(SubExpr);
02491 
02492       if (SubExpr->getSCEVType() != scMulExpr)
02493         return SubExpr;
02494     }
02495     return S; // all operands are scaled, be conservative.
02496   }
02497   case scAddRecExpr:
02498     return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
02499   }
02500 }
02501 
02502 /// Return true if the chain increment is profitable to expand into a loop
02503 /// invariant value, which may require its own register. A profitable chain
02504 /// increment will be an offset relative to the same base. We allow such offsets
02505 /// to potentially be used as the chain increment as long as they are not
02506 /// obviously expensive to expand using real instructions.
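/// For example (illustrative), stepping to the next element of the same array
/// needs only a small constant IncExpr and is accepted, while an increment
/// whose loop-invariant expression would be expensive to materialize as real
/// instructions is rejected by the high-cost-expansion check.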
02507 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
02508                                     const SCEV *IncExpr,
02509                                     ScalarEvolution &SE) {
02510   // Aggressively form chains when -stress-ivchain.
02511   if (StressIVChain)
02512     return true;
02513 
02514   // Do not replace a constant offset from IV head with a nonconstant IV
02515   // increment.
02516   if (!isa<SCEVConstant>(IncExpr)) {
02517     const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
02518     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
02519       return false;
02520   }
02521 
02522   SmallPtrSet<const SCEV*, 8> Processed;
02523   return !isHighCostExpansion(IncExpr, Processed, SE);
02524 }
02525 
02526 /// Return true if the number of registers needed for the chain is estimated to
02527 /// be less than the number required for the individual IV users. First prohibit
02528 /// any IV users that keep the IV live across increments (the Users set should
02529 /// be empty). Next count the number and type of increments in the chain.
02530 ///
02531 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
02532 /// effectively use postinc addressing modes. Only consider it profitable it the
02533 /// effectively use postinc addressing modes. Only consider it profitable if the
02534 ///
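///
/// Illustrative accounting: the chain starts at a cost of 1 because it may
/// need a register of its own; ending in an existing header phi subtracts 1,
/// as does having more than one constant increment; each variable increment
/// adds 1 unless it repeats the previous one, in which case it subtracts 1 as
/// a reuse. For example, a chain ending in the header phi whose links all
/// step by constants scores 1 - 1 - 1 = -1 and is kept, since only a negative
/// cost is considered profitable.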
02535 /// TODO: Consider IVInc free if it's already used in another chains.
02536 /// TODO: Consider IVInc free if it's already used in other chains.
02537 isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
02538                   ScalarEvolution &SE, const TargetTransformInfo &TTI) {
02539   if (StressIVChain)
02540     return true;
02541 
02542   if (!Chain.hasIncs())
02543     return false;
02544 
02545   if (!Users.empty()) {
02546     DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
02547           for (Instruction *Inst : Users) {
02548             dbgs() << "  " << *Inst << "\n";
02549           });
02550     return false;
02551   }
02552   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
02553 
02554   // The chain itself may require a register, so initialize cost to 1.
02555   int cost = 1;
02556 
02557   // A complete chain likely eliminates the need for keeping the original IV in
02558   // a register. LSR does not currently know how to form a complete chain unless
02559   // the header phi already exists.
02560   if (isa<PHINode>(Chain.tailUserInst())
02561       && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
02562     --cost;
02563   }
02564   const SCEV *LastIncExpr = nullptr;
02565   unsigned NumConstIncrements = 0;
02566   unsigned NumVarIncrements = 0;
02567   unsigned NumReusedIncrements = 0;
02568   for (const IVInc &Inc : Chain) {
02569     if (Inc.IncExpr->isZero())
02570       continue;
02571 
02572     // Incrementing by zero or some constant is neutral. We assume constants can
02573     // be folded into an addressing mode or an add's immediate operand.
02574     if (isa<SCEVConstant>(Inc.IncExpr)) {
02575       ++NumConstIncrements;
02576       continue;
02577     }
02578 
02579     if (Inc.IncExpr == LastIncExpr)
02580       ++NumReusedIncrements;
02581     else
02582       ++NumVarIncrements;
02583 
02584     LastIncExpr = Inc.IncExpr;
02585   }
02586   // An IV chain with a single increment is handled by LSR's postinc
02587   // uses. However, a chain with multiple increments requires keeping the IV's
02588   // value live longer than it needs to be if chained.
02589   if (NumConstIncrements > 1)
02590     --cost;
02591 
02592   // Materializing increment expressions in the preheader that didn't exist in
02593   // the original code may cost a register. For example, sign-extended array
02594   // indices can produce ridiculous increments like this:
02595   // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
02596   cost += NumVarIncrements;
02597 
02598   // Reusing variable increments likely saves a register to hold the multiple of
02599   // the stride.
02600   cost -= NumReusedIncrements;
02601 
02602   DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
02603                << "\n");
02604 
02605   return cost < 0;
02606 }
02607 
02608 /// Add this IV user to an existing chain or make it the head of a new chain.
02609 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
02610                                    SmallVectorImpl<ChainUsers> &ChainUsersVec) {
02611   // When IVs are used as types of varying widths, they are generally converted
02612   // to a wider type with some uses remaining narrow under a (free) trunc.
02613   Value *const NextIV = getWideOperand(IVOper);
02614   const SCEV *const OperExpr = SE.getSCEV(NextIV);
02615   const SCEV *const OperExprBase = getExprBase(OperExpr);
02616 
02617   // Visit all existing chains. Check if its IVOper can be computed as a
02618   // profitable loop invariant increment from the last link in the Chain.
02619   unsigned ChainIdx = 0, NChains = IVChainVec.size();
02620   const SCEV *LastIncExpr = nullptr;
02621   for (; ChainIdx < NChains; ++ChainIdx) {
02622     IVChain &Chain = IVChainVec[ChainIdx];
02623 
02624     // Prune the solution space aggressively by checking that both IV operands
02625     // are expressions that operate on the same unscaled SCEVUnknown. This
02626     // "base" will be canceled by the subsequent getMinusSCEV call. Checking
02627     // first avoids creating extra SCEV expressions.
02628     if (!StressIVChain && Chain.ExprBase != OperExprBase)
02629       continue;
02630 
02631     Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
02632     if (!isCompatibleIVType(PrevIV, NextIV))
02633       continue;
02634 
02635     // A phi node terminates a chain.
02636     if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
02637       continue;
02638 
02639     // The increment must be loop-invariant so it can be kept in a register.
02640     const SCEV *PrevExpr = SE.getSCEV(PrevIV);
02641     const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
02642     if (!SE.isLoopInvariant(IncExpr, L))
02643       continue;
02644 
02645     if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
02646       LastIncExpr = IncExpr;
02647       break;
02648     }
02649   }
02650   // If we haven't found a chain, create a new one, unless we hit the max. Don't
02651   // bother for phi nodes, because they must be last in the chain.
02652   if (ChainIdx == NChains) {
02653     if (isa<PHINode>(UserInst))
02654       return;
02655     if (NChains >= MaxChains && !StressIVChain) {
02656       DEBUG(dbgs() << "IV Chain Limit\n");
02657       return;
02658     }
02659     LastIncExpr = OperExpr;
02660     // IVUsers may have skipped over sign/zero extensions. We don't currently
02661     // attempt to form chains involving extensions unless they can be hoisted
02662     // into this loop's AddRec.
02663     if (!isa<SCEVAddRecExpr>(LastIncExpr))
02664       return;
02665     ++NChains;
02666     IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
02667                                  OperExprBase));
02668     ChainUsersVec.resize(NChains);
02669     DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
02670                  << ") IV=" << *LastIncExpr << "\n");
02671   } else {
02672     DEBUG(dbgs() << "IV Chain#" << ChainIdx << "  Inc: (" << *UserInst
02673                  << ") IV+" << *LastIncExpr << "\n");
02674     // Add this IV user to the end of the chain.
02675     IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
02676   }
02677   IVChain &Chain = IVChainVec[ChainIdx];
02678 
02679   SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
02680   // This chain's NearUsers become FarUsers.
02681   if (!LastIncExpr->isZero()) {
02682     ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
02683                                             NearUsers.end());
02684     NearUsers.clear();
02685   }
02686 
02687   // All other uses of IVOperand become near uses of the chain.
02688   // We currently ignore intermediate values within SCEV expressions, assuming
02689   // they will eventually be used by the current chain, or can be computed
02690   // from one of the chain increments. To be more precise we could
02691   // transitively follow its users and only add leaf IV users to the set.
02692   for (User *U : IVOper->users()) {
02693     Instruction *OtherUse = dyn_cast<Instruction>(U);
02694     if (!OtherUse)
02695       continue;
02696     // Uses in the chain will no longer be uses if the chain is formed.
02697     // Include the head of the chain in this iteration (not Chain.begin()).
02698     IVChain::const_iterator IncIter = Chain.Incs.begin();
02699     IVChain::const_iterator IncEnd = Chain.Incs.end();
02700     for (; IncIter != IncEnd; ++IncIter) {
02701       if (IncIter->UserInst == OtherUse)
02702         break;
02703     }
02704     if (IncIter != IncEnd)
02705       continue;
02706 
02707     if (SE.isSCEVable(OtherUse->getType())
02708         && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
02709         && IU.isIVUserOrOperand(OtherUse)) {
02710       continue;
02711     }
02712     NearUsers.insert(OtherUse);
02713   }
02714 
02715   // Since this user is part of the chain, it's no longer considered a use
02716   // of the chain.
02717   ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
02718 }
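// Roughly, each IVChain records a head user plus a list of
// (UserInst, IVOperand, IncExpr) links, where every IncExpr is the
// loop-invariant distance from the previous link, e.g. (hypothetical)
//   Head: load via {%base,+,8}<%loop>;  Inc: load 16 bytes further (IncExpr=16)
// NearUsers collects other users of the same IV operands that would keep the
// intermediate values alive; once a link with a non-zero increment is added,
// those become FarUsers for the later profitability check.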
02719 
02720 /// Populate the vector of Chains.
02721 ///
02722 /// This decreases ILP at the architecture level. Targets with ample registers,
02723 /// multiple memory ports, and no register renaming probably don't want
02724 /// this. However, such targets should probably disable LSR altogether.
02725 ///
02726 /// The job of LSR is to make a reasonable choice of induction variables across
02727 /// the loop. Subsequent passes can easily "unchain" computation exposing more
02728 /// ILP *within the loop* if the target wants it.
02729 ///
02730 /// Finding the best IV chain is potentially a scheduling problem. Because LSR
02731 /// will not reorder memory operations, it will recognize the accesses below as
02732 /// a chain, but will generate redundant IV increments. Ideally this would be
02733 /// corrected later by a smart scheduler:
02734 ///        = A[i]
02735 ///        = A[i+x]
02736 /// A[i]   =
02737 /// A[i+x] =
02738 ///
02739 /// TODO: Walk the entire domtree within this loop, not just the path to the
02740 /// loop latch. This will discover chains on side paths, but requires
02741 /// maintaining multiple copies of the Chains state.
02742 void LSRInstance::CollectChains() {
02743   DEBUG(dbgs() << "Collecting IV Chains.\n");
02744   SmallVector<ChainUsers, 8> ChainUsersVec;
02745 
02746   SmallVector<BasicBlock *,8> LatchPath;
02747   BasicBlock *LoopHeader = L->getHeader();
02748   for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
02749        Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
02750     LatchPath.push_back(Rung->getBlock());
02751   }
02752   LatchPath.push_back(LoopHeader);
02753 
02754   // Walk the instruction stream from the loop header to the loop latch.
02755   for (SmallVectorImpl<BasicBlock *>::reverse_iterator
02756          BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend();
02757        BBIter != BBEnd; ++BBIter) {
02758     for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end();
02759          I != E; ++I) {
02760       // Skip instructions that weren't seen by IVUsers analysis.
02761       if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&*I))
02762         continue;
02763 
02764       // Ignore users that are part of a SCEV expression. This way we only
02765       // consider leaf IV Users. This effectively rediscovers a portion of
02766       // IVUsers analysis but in program order this time.
02767       if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(&*I)))
02768         continue;
02769 
02770       // Remove this instruction from any NearUsers set it may be in.
02771       for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
02772            ChainIdx < NChains; ++ChainIdx) {
02773         ChainUsersVec[ChainIdx].NearUsers.erase(&*I);
02774       }
02775       // Search for operands that can be chained.
02776       SmallPtrSet<Instruction*, 4> UniqueOperands;
02777       User::op_iterator IVOpEnd = I->op_end();
02778       User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE);
02779       while (IVOpIter != IVOpEnd) {
02780         Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
02781         if (UniqueOperands.insert(IVOpInst).second)
02782           ChainInstruction(&*I, IVOpInst, ChainUsersVec);
02783         IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
02784       }
02785     } // Continue walking down the instructions.
02786   } // Continue walking down the domtree.
02787   // Visit phi backedges to determine if the chain can generate the IV postinc.
02788   for (BasicBlock::iterator I = L->getHeader()->begin();
02789        PHINode *PN = dyn_cast<PHINode>(I); ++I) {
02790     if (!SE.isSCEVable(PN->getType()))
02791       continue;
02792 
02793     Instruction *IncV =
02794       dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
02795     if (IncV)
02796       ChainInstruction(PN, IncV, ChainUsersVec);
02797   }
02798   // Remove any unprofitable chains.
02799   unsigned ChainIdx = 0;
02800   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
02801        UsersIdx < NChains; ++UsersIdx) {
02802     if (!isProfitableChain(IVChainVec[UsersIdx],
02803                            ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
02804       continue;
02805     // Preserve the chain at UsersIdx.
02806     if (ChainIdx != UsersIdx)
02807       IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
02808     FinalizeChain(IVChainVec[ChainIdx]);
02809     ++ChainIdx;
02810   }
02811   IVChainVec.resize(ChainIdx);
02812 }
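// A minimal sketch, continuing the A[i]/A[i+x] illustration above with
// hypothetical IR: once chained, the accesses can share one IV register and
// carry the distances as offsets from it, e.g.
//   %pi  = getelementptr i32, i32* %A, i64 %i   ; chain head
//   %pix = getelementptr i32, i32* %pi, i64 %x  ; IncExpr = %x, loop-invariant
// instead of each access keeping its own %i-derived address computation and a
// separate increment alive.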
02813 
02814 void LSRInstance::FinalizeChain(IVChain &Chain) {
02815   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
02816   DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
02817 
02818   for (const IVInc &Inc : Chain) {
02819     DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
02820     auto UseI = std::find(Inc.UserInst->op_begin(), Inc.UserInst->op_end(),
02821                           Inc.IVOperand);
02822     assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
02823     IVIncSet.insert(UseI);
02824   }
02825 }
02826 
02827 /// Return true if the IVInc can be folded into an addressing mode.
02828 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
02829                              Value *Operand, const TargetTransformInfo &TTI) {
02830   const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
02831   if (!IncConst || !isAddressUse(UserInst, Operand))
02832     return false;
02833 
02834   if (IncConst->getAPInt().getMinSignedBits() > 64)
02835     return false;
02836 
02837   MemAccessTy AccessTy = getAccessType(UserInst);
02838   int64_t IncOffset = IncConst->getValue()->getSExtValue();
02839   if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
02840                         IncOffset, /*HasBaseReg=*/false))
02841     return false;
02842 
02843   return true;
02844 }
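// A rough sketch (hypothetical x86-ish assembly): a small constant IncExpr
// between two address users typically disappears into the memory operand, e.g.
//   mov eax, [rdi]        ; chain head, IVSrc in rdi
//   mov ebx, [rdi + 16]   ; the +16 is absorbed by the addressing mode
// so no separate add or register is needed; a non-constant increment, or a
// constant the target cannot encode, instead forces the chain to advance IVSrc
// with an explicit add (see GenerateIVChain below).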
02845 
02846 /// Generate an add or subtract for each IVInc in a chain to materialize the IV
02847 /// user's operand from the previous IV user's operand.
02848 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
02849                                   SmallVectorImpl<WeakVH> &DeadInsts) {
02850   // Find the new IVOperand for the head of the chain. It may have been replaced
02851   // by LSR.
02852   const IVInc &Head = Chain.Incs[0];
02853   User::op_iterator IVOpEnd = Head.UserInst->op_end();
02854   // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
02855   User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
02856                                              IVOpEnd, L, SE);
02857   Value *IVSrc = nullptr;
02858   while (IVOpIter != IVOpEnd) {
02859     IVSrc = getWideOperand(*IVOpIter);
02860 
02861     // If this operand computes the expression that the chain needs, we may use
02862     // it. (Check this after setting IVSrc which is used below.)
02863     //
02864     // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
02865     // narrow for the chain, so we can no longer use it. We do allow using a
02866     // wider phi, assuming LSR checked for free truncation. In that case we
02867     // should already have a truncate on this operand such that
02868     // getSCEV(IVSrc) == IncExpr.
02869     if (SE.getSCEV(*IVOpIter) == Head.IncExpr
02870         || SE.getSCEV(IVSrc) == Head.IncExpr) {
02871       break;
02872     }
02873     IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
02874   }
02875   if (IVOpIter == IVOpEnd) {
02876     // Gracefully give up on this chain.
02877     DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
02878     return;
02879   }
02880 
02881   DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
02882   Type *IVTy = IVSrc->getType();
02883   Type *IntTy = SE.getEffectiveSCEVType(IVTy);
02884   const SCEV *LeftOverExpr = nullptr;
02885   for (const IVInc &Inc : Chain) {
02886     Instruction *InsertPt = Inc.UserInst;
02887     if (isa<PHINode>(InsertPt))
02888       InsertPt = L->getLoopLatch()->getTerminator();
02889 
02890     // IVOper will replace the current IV User's operand. IVSrc is the IV
02891     // value currently held in a register.
02892     Value *IVOper = IVSrc;
02893     if (!Inc.IncExpr->isZero()) {
02894       // IncExpr was the result of subtraction of two narrow values, so must
02895       // be signed.
02896       const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
02897       LeftOverExpr = LeftOverExpr ?
02898         SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
02899     }
02900     if (LeftOverExpr && !LeftOverExpr->isZero()) {
02901       // Expand the IV increment.
02902       Rewriter.clearPostInc();
02903       Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
02904       const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
02905                                              SE.getUnknown(IncV));
02906       IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
02907 
02908       // If an IV increment can't be folded, use it as the next IV value.
02909       if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
02910         assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
02911         IVSrc = IVOper;
02912         LeftOverExpr = nullptr;
02913       }
02914     }
02915     Type *OperTy = Inc.IVOperand->getType();
02916     if (IVTy != OperTy) {
02917       assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
02918              "cannot extend a chained IV");
02919       IRBuilder<> Builder(InsertPt);
02920       IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
02921     }
02922     Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
02923     DeadInsts.emplace_back(Inc.IVOperand);
02924   }
02925   // If LSR created a new, wider phi, we may also replace its postinc. We only
02926   // do this if we also found a wide value for the head of the chain.
02927   if (isa<PHINode>(Chain.tailUserInst())) {
02928     for (BasicBlock::iterator I = L->getHeader()->begin();
02929          PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
02930       if (!isCompatibleIVType(Phi, IVSrc))
02931         continue;
02932       Instruction *PostIncV = dyn_cast<Instruction>(
02933         Phi->getIncomingValueForBlock(L->getLoopLatch()));
02934       if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
02935         continue;
02936       Value *IVOper = IVSrc;
02937       Type *PostIncTy = PostIncV->getType();
02938       if (IVTy != PostIncTy) {
02939         assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
02940         IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
02941         Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
02942         IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
02943       }
02944       Phi->replaceUsesOfWith(PostIncV, IVOper);
02945       DeadInsts.emplace_back(PostIncV);
02946     }
02947   }
02948 }
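// A rough sketch with hypothetical names (%lsr.iv, %lsr.inc): when
// LeftOverExpr cannot be folded into the user's addressing mode, the loop
// above emits roughly
//   %lsr.inc     = ...expansion of LeftOverExpr...
//   %lsr.iv.next = add i64 %lsr.iv, %lsr.inc
// rewrites the user to consume %lsr.iv.next, and makes it the new IVSrc for
// the remaining links; foldable constant increments simply accumulate in
// LeftOverExpr without emitting adds.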
02949 
02950 void LSRInstance::CollectFixupsAndInitialFormulae() {
02951   for (const IVStrideUse &U : IU) {
02952     Instruction *UserInst = U.getUser();
02953     // Skip IV users that are part of profitable IV Chains.
02954     User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(),
02955                                        U.getOperandValToReplace());
02956     assert(UseI != UserInst->op_end() && "cannot find IV operand");
02957     if (IVIncSet.count(UseI))
02958       continue;
02959 
02960     // Record the uses.
02961     LSRFixup &LF = getNewFixup();
02962     LF.UserInst = UserInst;
02963     LF.OperandValToReplace = U.getOperandValToReplace();
02964     LF.PostIncLoops = U.getPostIncLoops();
02965 
02966     LSRUse::KindType Kind = LSRUse::Basic;
02967     MemAccessTy AccessTy;
02968     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
02969       Kind = LSRUse::Address;
02970       AccessTy = getAccessType(LF.UserInst);
02971     }
02972 
02973     const SCEV *S = IU.getExpr(U);
02974 
02975     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
02976     // (N - i == 0), and this allows (N - i) to be the expression that we work
02977     // with rather than just N or i, so we can consider the register
02978     // requirements for both N and i at the same time. Limiting this code to
02979     // equality icmps is not a problem because all interesting loops use
02980     // equality icmps, thanks to IndVarSimplify.
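    // A minimal sketch with a hypothetical compare: for a latch test such as
    //   %done = icmp eq i64 %j.next, %m
    // where %j.next has SCEV {1,+,1}<%loop>, the block below switches the use
    // to ICmpZero and replaces S with (%m - {1,+,1}<%loop>), so the solver can
    // weigh the register requirements of %m and the IV together.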
02981     if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
02982       if (CI->isEquality()) {
02983         // Swap the operands if needed to put the OperandValToReplace on the
02984         // left, for consistency.
02985         Value *NV = CI->getOperand(1);
02986         if (NV == LF.OperandValToReplace) {
02987           CI->setOperand(1, CI->getOperand(0));
02988           CI->setOperand(0, NV);
02989           NV = CI->getOperand(1);
02990           Changed = true;
02991         }
02992 
02993         // x == y  -->  x - y == 0
02994         const SCEV *N = SE.getSCEV(NV);
02995         if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
02996           // S is normalized, so normalize N before folding it into S
02997           // to keep the result normalized.
02998           N = TransformForPostIncUse(Normalize, N, CI, nullptr,
02999                                      LF.PostIncLoops, SE, DT);
03000           Kind = LSRUse::ICmpZero;
03001           S = SE.getMinusSCEV(N, S);
03002         }
03003 
03004         // -1 and the negations of all interesting strides (except the negation
03005         // of -1) are now also interesting.
03006         for (size_t i = 0, e = Factors.size(); i != e; ++i)
03007           if (Factors[i] != -1)
03008             Factors.insert(-(uint64_t)Factors[i]);
03009         Factors.insert(-1);
03010       }
03011 
03012     // Set up the initial formula for this use.
03013     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
03014     LF.LUIdx = P.first;
03015     LF.Offset = P.second;
03016     LSRUse &LU = Uses[LF.LUIdx];
03017     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
03018     if (!LU.WidestFixupType ||
03019         SE.getTypeSizeInBits(LU.WidestFixupType) <
03020         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
03021       LU.WidestFixupType = LF.OperandValToReplace->getType();
03022 
03023     // If this is the first use of this LSRUse, give it a formula.
03024     if (LU.Formulae.empty()) {
03025       InsertInitialFormula(S, LU, LF.LUIdx);
03026       CountRegisters(LU.Formulae.back(), LF.LUIdx);
03027     }
03028   }
03029 
03030   DEBUG(print_fixups(dbgs()));
03031 }
03032 
03033 /// Insert a formula for the given expression into the given use, separating out
03034 /// loop-variant portions from loop-invariant and loop-computable portions.
03035 void
03036 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
03037   // Mark uses whose expressions cannot be expanded.
03038   if (!isSafeToExpand(S, SE))
03039     LU.RigidFormula = true;
03040 
03041   Formula F;
03042   F.initialMatch(S, L, SE);
03043   bool Inserted = InsertFormula(LU, LUIdx, F);
03044   assert(Inserted && "Initial formula already exists!"); (void)Inserted;
03045 }
03046 
03047 /// Insert a simple single-register formula for the given expression into the
03048 /// given use.
03049 void
03050 LSRInstance::InsertSupplementalFormula(const SCEV *S,
03051                                        LSRUse &LU, size_t LUIdx) {
03052   Formula F;
03053   F.BaseRegs.push_back(S);
03054   F.HasBaseReg = true;
03055   bool Inserted = InsertFormula(LU, LUIdx, F);
03056   assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
03057 }
03058 
03059 /// Note which registers are used by the given formula, updating RegUses.
03060 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
03061   if (F.ScaledReg)
03062     RegUses.countRegister(F.ScaledReg, LUIdx);
03063   for (const SCEV *BaseReg : F.BaseRegs)
03064     RegUses.countRegister(BaseReg, LUIdx);
03065 }
03066 
03067 /// If the given formula has not yet been inserted, add it to the list, and
03068 /// return true. Return false otherwise.
03069 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
03070   // Do not insert formula that we will not be able to expand.
03071   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
03072          "Formula is illegal");
03073   if (!LU.InsertFormula(F))
03074     return false;
03075 
03076   CountRegisters(F, LUIdx);
03077   return true;
03078 }
03079 
03080 /// Check for other uses of loop-invariant values which we're tracking. These
03081 /// other uses will pin these values in registers, making them less profitable
03082 /// for elimination.
03083 /// TODO: This currently misses non-constant addrec step registers.
03084 /// TODO: Should this give more weight to users inside the loop?
03085 void
03086 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
03087   SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
03088   SmallPtrSet<const SCEV *, 32> Visited;
03089 
03090   while (!Worklist.empty()) {
03091     const SCEV *S = Worklist.pop_back_val();
03092 
03093     // Don't process the same SCEV twice
03094     if (!Visited.insert(S).second)
03095       continue;
03096 
03097     if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
03098       Worklist.append(N->op_begin(), N->op_end());
03099     else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
03100       Worklist.push_back(C->getOperand());
03101     else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
03102       Worklist.push_back(D->getLHS());
03103       Worklist.push_back(D->getRHS());
03104     } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
03105       const Value *V = US->getValue();
03106       if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
03107         // Look for instructions defined outside the loop.
03108         if (L->contains(Inst)) continue;
03109       } else if (isa<UndefValue>(V))
03110         // Undef doesn't have a live range, so it doesn't matter.
03111         continue;
03112       for (const Use &U : V->uses()) {
03113         const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
03114         // Ignore non-instructions.
03115         if (!UserInst)
03116           continue;
03117         // Ignore instructions in other functions (as can happen with
03118         // Constants).
03119         if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
03120           continue;
03121         // Ignore instructions not dominated by the loop.
03122         const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
03123           UserInst->getParent() :
03124           cast<PHINode>(UserInst)->getIncomingBlock(
03125             PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
03126         if (!DT.dominates(L->getHeader(), UseBB))
03127           continue;
03128         // Don't bother if the instruction is in a BB which ends in an EHPad.
03129         if (UseBB->getTerminator()->isEHPad())
03130           continue;
03131         // Ignore uses which are part of other SCEV expressions, to avoid
03132         // analyzing them multiple times.
03133         if (SE.isSCEVable(UserInst->getType())) {
03134           const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
03135           // If the user is a no-op, look through to its uses.
03136           if (!isa<SCEVUnknown>(UserS))
03137             continue;
03138           if (UserS == US) {
03139             Worklist.push_back(
03140               SE.getUnknown(const_cast<Instruction *>(UserInst)));
03141             continue;
03142           }
03143         }
03144         // Ignore icmp instructions which are already being analyzed.
03145         if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
03146           unsigned OtherIdx = !U.getOperandNo();
03147           Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
03148           if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
03149             continue;
03150         }
03151 
03152         LSRFixup &LF = getNewFixup();
03153         LF.UserInst = const_cast<Instruction *>(UserInst);
03154         LF.OperandValToReplace = U;
03155         std::pair<size_t, int64_t> P = getUse(
03156             S, LSRUse::Basic, MemAccessTy());
03157         LF.LUIdx = P.first;
03158         LF.Offset = P.second;
03159         LSRUse &LU = Uses[LF.LUIdx];
03160         LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
03161         if (!LU.WidestFixupType ||
03162             SE.getTypeSizeInBits(LU.WidestFixupType) <
03163             SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
03164           LU.WidestFixupType = LF.OperandValToReplace->getType();
03165         InsertSupplementalFormula(US, LU, LF.LUIdx);
03166         CountRegisters(LU.Formulae.back(), Uses.size() - 1);
03167         break;
03168       }
03169     }
03170   }
03171 }
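// A rough sketch with a hypothetical loop-invariant %stride: if %stride is
// also consumed after the loop (say by a store in the exit block), the walk
// above records that outside use as a Basic LSRUse with the single-register
// supplemental formula reg(%stride). That keeps %stride's live range visible
// to the cost model, so the solver does not assume the register is free for
// other purposes.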
03172 
03173 /// Split S into subexpressions which can be pulled out into separate
03174 /// registers. If C is non-null, multiply each subexpression by C.
03175 ///
03176 /// Return remainder expression after factoring the subexpressions captured by
03177 /// Ops. If Ops is complete, return NULL.
03178 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
03179                                    SmallVectorImpl<const SCEV *> &Ops,
03180                                    const Loop *L,
03181                                    ScalarEvolution &SE,
03182                                    unsigned Depth = 0) {
03183   // Arbitrarily cap recursion to protect compile time.
03184   if (Depth >= 3)
03185     return S;
03186 
03187   if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
03188     // Break out add operands.
03189     for (const SCEV *S : Add->operands()) {
03190       const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
03191       if (Remainder)
03192         Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
03193     }
03194     return nullptr;
03195   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
03196     // Split a non-zero base out of an addrec.
03197     if (AR->getStart()->isZero())
03198       return S;
03199 
03200     const SCEV *Remainder = CollectSubexprs(AR->getStart(),
03201                                             C, Ops, L, SE, Depth+1);
03202     // Split the non-zero AddRec unless it is part of a nested recurrence that
03203     // does not pertain to this loop.
03204     if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
03205       Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
03206       Remainder = nullptr;
03207     }
03208     if (Remainder != AR->getStart()) {
03209       if (!Remainder)
03210         Remainder = SE.getConstant(AR->getType(), 0);
03211       return SE.getAddRecExpr(Remainder,
03212                               AR->getStepRecurrence(SE),
03213                               AR->getLoop(),
03214                               //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
03215                               SCEV::FlagAnyWrap);
03216     }
03217   } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
03218     // Break (C * (a + b + c)) into C*a + C*b + C*c.
03219     if (Mul->getNumOperands() != 2)
03220       return S;
03221     if (const SCEVConstant *Op0 =
03222         dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
03223       C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
03224       const SCEV *Remainder =
03225         CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
03226       if (Remainder)
03227         Ops.push_back(SE.getMulExpr(C, Remainder));
03228       return nullptr;
03229     }
03230   }
03231   return S;
03232 }
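// A minimal sketch over hypothetical SCEVs: on the expression
//   ((4 * %n) + {0,+,8}<%loop> + %base)
// the routine returns no remainder and fills Ops with the three pieces
//   (4 * %n), {0,+,8}<%loop>, %base
// while an addrec with a non-zero start, e.g. {%base,+,8}<%loop>, has %base
// pushed into Ops and the zero-start recurrence {0,+,8}<%loop> returned as the
// remainder. GenerateReassociations below then treats each piece as a
// candidate base register of its own.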
03233 
03234 /// \brief Helper function for LSRInstance::GenerateReassociations.
03235 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
03236                                              const Formula &Base,
03237                                              unsigned Depth, size_t Idx,
03238                                              bool IsScaledReg) {
03239   const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
03240   SmallVector<const SCEV *, 8> AddOps;
03241   const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
03242   if (Remainder)
03243     AddOps.push_back(Remainder);
03244 
03245   if (AddOps.size() == 1)
03246     return;
03247 
03248   for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
03249                                                      JE = AddOps.end();
03250        J != JE; ++J) {
03251 
03252     // Loop-variant "unknown" values are uninteresting; we won't be able to
03253     // do anything meaningful with them.
03254     if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
03255       continue;
03256 
03257     // Don't pull a constant into a register if the constant could be folded
03258     // into an immediate field.
03259     if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
03260                          LU.AccessTy, *J, Base.getNumRegs() > 1))
03261       continue;
03262 
03263     // Collect all operands except *J.
03264     SmallVector<const SCEV *, 8> InnerAddOps(
03265         ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
03266     InnerAddOps.append(std::next(J),
03267                        ((const SmallVector<const SCEV *, 8> &)AddOps).end());
03268 
03269     // Don't leave just a constant behind in a register if the constant could
03270     // be folded into an immediate field.
03271     if (InnerAddOps.size() == 1 &&
03272         isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
03273                          LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
03274       continue;
03275 
03276     const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
03277     if (InnerSum->isZero())
03278       continue;
03279     Formula F = Base;
03280 
03281     // Add the remaining pieces of the add back into the new formula.
03282     const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
03283     if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
03284         TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
03285                                 InnerSumSC->getValue()->getZExtValue())) {
03286       F.UnfoldedOffset =
03287           (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
03288       if (IsScaledReg)
03289         F.ScaledReg = nullptr;
03290       else
03291         F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
03292     } else if (IsScaledReg)
03293       F.ScaledReg = InnerSum;
03294     else
03295       F.BaseRegs[Idx] = InnerSum;
03296 
03297     // Add J as its own register, or an unfolded immediate.
03298     const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
03299     if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
03300         TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
03301                                 SC->getValue()->getZExtValue()))
03302       F.UnfoldedOffset =
03303           (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
03304     else
03305       F.BaseRegs.push_back(*J);
03306     // We may have changed the number of registers in the base regs; adjust the
03307     // formula accordingly.
03308     F.canonicalize();
03309 
03310     if (InsertFormula(LU, LUIdx, F))
03311       // If that formula hadn't been seen before, recurse to find more like
03312       // it.
03313       GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
03314   }
03315 }
03316 
03317 /// Split out subexpressions from adds and the bases of addrecs.
03318 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
03319                                          Formula Base, unsigned Depth) {
03320   assert(Base.isCanonical() && "Input must be in the canonical form");
03321   // Arbitrarily cap recursion to protect compile time.
03322   if (Depth >= 3)
03323     return;
03324 
03325   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03326     GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
03327 
03328   if (Base.Scale == 1)
03329     GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
03330                                /* Idx */ -1, /* IsScaledReg */ true);
03331 }
03332 
03333 /// Generate a formula consisting of all of the loop-dominating registers added
03334 /// into a single register.
03335 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
03336                                        Formula Base) {
03337   // This method is only interesting on a plurality of registers.
03338   if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
03339     return;
03340 
03341   // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
03342   // processing the formula.
03343   Base.unscale();
03344   Formula F = Base;
03345   F.BaseRegs.clear();
03346   SmallVector<const SCEV *, 4> Ops;
03347   for (const SCEV *BaseReg : Base.BaseRegs) {
03348     if (SE.properlyDominates(BaseReg, L->getHeader()) &&
03349         !SE.hasComputableLoopEvolution(BaseReg, L))
03350       Ops.push_back(BaseReg);
03351     else
03352       F.BaseRegs.push_back(BaseReg);
03353   }
03354   if (Ops.size() > 1) {
03355     const SCEV *Sum = SE.getAddExpr(Ops);
03356     // TODO: If Sum is zero, it probably means ScalarEvolution missed an
03357     // opportunity to fold something. For now, just ignore such cases
03358     // rather than proceed with zero in a register.
03359     if (!Sum->isZero()) {
03360       F.BaseRegs.push_back(Sum);
03361       F.canonicalize();
03362       (void)InsertFormula(LU, LUIdx, F);
03363     }
03364   }
03365 }
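// A minimal sketch over hypothetical registers %a and %b: given a formula
//   reg(%a) + reg(%b) + reg({0,+,4}<%loop>)
// where %a and %b are defined above the loop, the code above also proposes
//   reg(%a + %b) + reg({0,+,4}<%loop>)
// trading two live registers for one at the cost of a single add computed
// outside the loop.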
03366 
03367 /// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
03368 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
03369                                               const Formula &Base, size_t Idx,
03370                                               bool IsScaledReg) {
03371   const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
03372   GlobalValue *GV = ExtractSymbol(G, SE);
03373   if (G->isZero() || !GV)
03374     return;
03375   Formula F = Base;
03376   F.BaseGV = GV;
03377   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
03378     return;
03379   if (IsScaledReg)
03380     F.ScaledReg = G;
03381   else
03382     F.BaseRegs[Idx] = G;
03383   (void)InsertFormula(LU, LUIdx, F);
03384 }
03385 
03386 /// Generate reuse formulae using symbolic offsets.
03387 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
03388                                           Formula Base) {
03389   // We can't add a symbolic offset if the address already contains one.
03390   if (Base.BaseGV) return;
03391 
03392   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03393     GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
03394   if (Base.Scale == 1)
03395     GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
03396                                 /* IsScaledReg */ true);
03397 }
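// A minimal sketch with a hypothetical global @table: for a base register
// ({0,+,4}<%loop> + @table), ExtractSymbol peels off @table so the formula
// becomes BaseGV = @table plus reg({0,+,4}<%loop>), which targets that can
// encode a symbol directly in the address (e.g. [@table + reg] on x86) may
// fold for free.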
03398 
03399 /// \brief Helper function for LSRInstance::GenerateConstantOffsets.
03400 void LSRInstance::GenerateConstantOffsetsImpl(
03401     LSRUse &LU, unsigned LUIdx, const Formula &Base,
03402     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
03403   const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
03404   for (int64_t Offset : Worklist) {
03405     Formula F = Base;
03406     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
03407     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
03408                    LU.AccessTy, F)) {
03409       // Add the offset to the base register.
03410       const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
03411       // If it cancelled out, drop the base register, otherwise update it.
03412       if (NewG->isZero()) {
03413         if (IsScaledReg) {
03414           F.Scale = 0;
03415           F.ScaledReg = nullptr;
03416         } else
03417           F.deleteBaseReg(F.BaseRegs[Idx]);
03418         F.canonicalize();
03419       } else if (IsScaledReg)
03420         F.ScaledReg = NewG;
03421       else
03422         F.BaseRegs[Idx] = NewG;
03423 
03424       (void)InsertFormula(LU, LUIdx, F);
03425     }
03426   }
03427 
03428   int64_t Imm = ExtractImmediate(G, SE);
03429   if (G->isZero() || Imm == 0)
03430     return;
03431   Formula F = Base;
03432   F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
03433   if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
03434     return;
03435   if (IsScaledReg)
03436     F.ScaledReg = G;
03437   else
03438     F.BaseRegs[Idx] = G;
03439   (void)InsertFormula(LU, LUIdx, F);
03440 }
03441 
03442 /// Generate reuse formulae using constant offsets.
03443 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
03444                                           Formula Base) {
03445   // TODO: For now, just add the min and max offset, because it usually isn't
03446   // worthwhile looking at everything in between.
03447   SmallVector<int64_t, 2> Worklist;
03448   Worklist.push_back(LU.MinOffset);
03449   if (LU.MaxOffset != LU.MinOffset)
03450     Worklist.push_back(LU.MaxOffset);
03451 
03452   for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03453     GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
03454   if (Base.Scale == 1)
03455     GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
03456                                 /* IsScaledReg */ true);
03457 }
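// A minimal sketch, assuming LU.MinOffset = 4 and a hypothetical base register
// G = {0,+,8}<%loop>: the Impl routine above forms BaseOffset -= 4 together
// with the shifted register {4,+,8}<%loop>; the net value is unchanged, but
// the shifted register may already exist for another use and can then be
// shared. Its second half does the inverse, pulling a constant start out of G
// into BaseOffset.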
03458 
03459 /// For ICmpZero, check to see if we can scale up the comparison. For example, x
03460 /// == y -> x*c == y*c.
03461 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
03462                                          Formula Base) {
03463   if (LU.Kind != LSRUse::ICmpZero) return;
03464 
03465   // Determine the integer type for the base formula.
03466   Type *IntTy = Base.getType();
03467   if (!IntTy) return;
03468   if (SE.getTypeSizeInBits(IntTy) > 64) return;
03469 
03470   // Don't do this if there is more than one offset.
03471   if (LU.MinOffset != LU.MaxOffset) return;
03472 
03473   assert(!Base.BaseGV && "ICmpZero use is not legal!");
03474 
03475   // Check each interesting stride.
03476   for (int64_t Factor : Factors) {
03477     // Check that the multiplication doesn't overflow.
03478     if (Base.BaseOffset == INT64_MIN && Factor == -1)
03479       continue;
03480     int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
03481     if (NewBaseOffset / Factor != Base.BaseOffset)
03482       continue;
03483     // If the offset will be truncated at this use, check that it is in bounds.
03484     if (!IntTy->isPointerTy() &&
03485         !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
03486       continue;
03487 
03488     // Check that multiplying with the use offset doesn't overflow.
03489     int64_t Offset = LU.MinOffset;
03490     if (Offset == INT64_MIN && Factor == -1)
03491       continue;
03492     Offset = (uint64_t)Offset * Factor;
03493     if (Offset / Factor != LU.MinOffset)
03494       continue;
03495     // If the offset will be truncated at this use, check that it is in bounds.
03496     if (!IntTy->isPointerTy() &&
03497         !ConstantInt::isValueValidForType(IntTy, Offset))
03498       continue;
03499 
03500     Formula F = Base;
03501     F.BaseOffset = NewBaseOffset;
03502 
03503     // Check that this scale is legal.
03504     if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
03505       continue;
03506 
03507     // Compensate for the use having MinOffset built into it.
03508     F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
03509 
03510     const SCEV *FactorS = SE.getConstant(IntTy, Factor);
03511 
03512     // Check that multiplying with each base register doesn't overflow.
03513     for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
03514       F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
03515       if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
03516         goto next;
03517     }
03518 
03519     // Check that multiplying with the scaled register doesn't overflow.
03520     if (F.ScaledReg) {
03521       F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
03522       if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
03523         continue;
03524     }
03525 
03526     // Check that multiplying with the unfolded offset doesn't overflow.
03527     if (F.UnfoldedOffset != 0) {
03528       if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
03529         continue;
03530       F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
03531       if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
03532         continue;
03533       // If the offset will be truncated, check that it is in bounds.
03534       if (!IntTy->isPointerTy() &&
03535           !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
03536         continue;
03537     }
03538 
03539     // If we make it here and it's legal, add it.
03540     (void)InsertFormula(LU, LUIdx, F);
03541   next:;
03542   }
03543 }
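// A minimal sketch over hypothetical expressions: if an ICmpZero use compares
// (%n + -1 * {0,+,1}<%loop>) against zero and 4 is an interesting factor,
// scaling the whole formula by 4 yields
//   ((4 * %n) + -1 * {0,+,4}<%loop>) == 0
// which can reuse an existing stride-4 induction variable instead of keeping a
// stride-1 IV alive solely for the exit test.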
03544 
03545 /// Generate stride factor reuse formulae by making use of scaled-offset address
03546 /// modes, for example.
03547 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
03548   // Determine the integer type for the base formula.
03549   Type *IntTy = Base.getType();
03550   if (!IntTy) return;
03551 
03552   // If this Formula already has a scaled register, we can't add another one.
03553   // Try to unscale the formula to generate a better scale.
03554   if (Base.Scale != 0 && !Base.unscale())
03555     return;
03556 
03557   assert(Base.Scale == 0 && "unscale did not do its job!");
03558 
03559   // Check each interesting stride.
03560   for (int64_t Factor : Factors) {
03561     Base.Scale = Factor;
03562     Base.HasBaseReg = Base.BaseRegs.size() > 1;
03563     // Check whether this scale is going to be legal.
03564     if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
03565                     Base)) {
03566       // As a special case, handle out-of-loop Basic users by converting them to
03567       // Special uses. TODO: Reconsider this special case.
03568       if (LU.Kind == LSRUse::Basic &&
03569           isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
03570                      LU.AccessTy, Base) &&
03571           LU.AllFixupsOutsideLoop)
03572         LU.Kind = LSRUse::Special;
03573       else
03574         continue;
03575     }
03576     // For an ICmpZero, negating a solitary base register won't lead to
03577     // new solutions.
03578     if (LU.Kind == LSRUse::ICmpZero &&
03579         !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
03580       continue;
03581     // For each addrec base reg, apply the scale, if possible.
03582     for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
03583       if (const SCEVAddRecExpr *AR =
03584             dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
03585         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
03586         if (FactorS->isZero())
03587           continue;
03588         // Divide out the factor, ignoring high bits, since we'll be
03589         // scaling the value back up in the end.
03590         if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
03591           // TODO: This could be optimized to avoid all the copying.
03592           Formula F = Base;
03593           F.ScaledReg = Quotient;
03594           F.deleteBaseReg(F.BaseRegs[i]);
03595           // The canonical representation of 1*reg is reg, which is already in
03596           // Base. In that case, do not try to insert the formula; it will be
03597           // rejected anyway.
03598           if (F.Scale == 1 && F.BaseRegs.empty())
03599             continue;
03600           (void)InsertFormula(LU, LUIdx, F);
03601         }
03602       }
03603   }
03604 }
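// A minimal sketch: a hypothetical formula with base register {0,+,4}<%loop>
// and an interesting factor of 4 is re-expressed as ScaledReg = {0,+,1}<%loop>
// with Scale = 4, which targets with scaled-index addressing
// (e.g. [base + 4*index] on x86) can fold into the memory operand for free.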
03605 
03606 /// Generate reuse formulae from different IV types.
03607 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
03608   // Don't bother truncating symbolic values.
03609   if (Base.BaseGV) return;
03610 
03611   // Determine the integer type for the base formula.
03612   Type *DstTy = Base.getType();
03613   if (!DstTy) return;
03614   DstTy = SE.getEffectiveSCEVType(DstTy);
03615 
03616   for (Type *SrcTy : Types) {
03617     if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
03618       Formula F = Base;
03619 
03620       if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
03621       for (const SCEV *&BaseReg : F.BaseRegs)
03622         BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
03623 
03624       // TODO: This assumes we've done basic processing on all uses and
03625       // have an idea what the register usage is.
03626       if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
03627         continue;
03628 
03629       (void)InsertFormula(LU, LUIdx, F);
03630     }
03631   }
03632 }
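// A rough sketch, assuming free i64-to-i32 truncation: if a 64-bit IV already
// exists for other uses, a formula for an i32 use can have its registers
// any-extended to i64 here, so the single wide IV serves both the 64-bit
// addressing users and the 32-bit arithmetic users, with the narrow values
// recovered by free truncates.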
03633 
03634 namespace {
03635 
03636 /// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
03637 /// modifications so that the search phase doesn't have to worry about the data
03638 /// structures moving underneath it.
03639 struct WorkItem {
03640   size_t LUIdx;
03641   int64_t Imm;
03642   const SCEV *OrigReg;
03643 
03644   WorkItem(size_t LI, int64_t I, const SCEV *R)
03645     : LUIdx(LI), Imm(I), OrigReg(R) {}
03646 
03647   void print(raw_ostream &OS) const;
03648   void dump() const;
03649 };
03650 
03651 }
03652 
03653 void WorkItem::print(raw_ostream &OS) const {
03654   OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
03655      << " , add offset " << Imm;
03656 }
03657 
03658 LLVM_DUMP_METHOD
03659 void WorkItem::dump() const {
03660   print(errs()); errs() << '\n';
03661 }
03662 
03663 /// Look for registers which are a constant distance apart and try to form reuse
03664 /// opportunities between them.
03665 void LSRInstance::GenerateCrossUseConstantOffsets() {
03666   // Group the registers by their value without any added constant offset.
03667   typedef std::map<int64_t, const SCEV *> ImmMapTy;
03668   DenseMap<const SCEV *, ImmMapTy> Map;
03669   DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
03670   SmallVector<const SCEV *, 8> Sequence;
03671   for (const SCEV *Use : RegUses) {
03672     const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
03673     int64_t Imm = ExtractImmediate(Reg, SE);
03674     auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
03675     if (Pair.second)
03676       Sequence.push_back(Reg);
03677     Pair.first->second.insert(std::make_pair(Imm, Use));
03678     UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
03679   }
03680 
03681   // Now examine each set of registers with the same base value. Build up
03682   // a list of work to do and do the work in a separate step so that we're
03683   // not adding formulae and register counts while we're searching.
03684   SmallVector<WorkItem, 32> WorkItems;
03685   SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
03686   for (const SCEV *Reg : Sequence) {
03687     const ImmMapTy &Imms = Map.find(Reg)->second;
03688 
03689     // It's not worthwhile looking for reuse if there's only one offset.
03690     if (Imms.size() == 1)
03691       continue;
03692 
03693     DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
03694           for (const auto &Entry : Imms)
03695             dbgs() << ' ' << Entry.first;
03696           dbgs() << '\n');
03697 
03698     // Examine each offset.
03699     for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
03700          J != JE; ++J) {
03701       const SCEV *OrigReg = J->second;
03702 
03703       int64_t JImm = J->first;
03704       const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
03705 
03706       if (!isa<SCEVConstant>(OrigReg) &&
03707           UsedByIndicesMap[Reg].count() == 1) {
03708         DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
03709         continue;
03710       }
03711 
03712       // Conservatively examine offsets between this orig reg and a few selected
03713       // other orig regs.
03714       ImmMapTy::const_iterator OtherImms[] = {
03715         Imms.begin(), std::prev(Imms.end()),
03716         Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
03717                          2)
03718       };
03719       for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
03720         ImmMapTy::const_iterator M = OtherImms[i];
03721         if (M == J || M == JE) continue;
03722 
03723         // Compute the difference between the two.
03724         int64_t Imm = (uint64_t)JImm - M->first;
03725         for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
03726              LUIdx = UsedByIndices.find_next(LUIdx))
03727           // Make a memo of this use, offset, and register tuple.
03728           if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
03729             WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
03730       }
03731     }
03732   }
03733 
03734   Map.clear();
03735   Sequence.clear();
03736   UsedByIndicesMap.clear();
03737   UniqueItems.clear();
03738 
03739   // Now iterate through the worklist and add new formulae.
03740   for (const WorkItem &WI : WorkItems) {
03741     size_t LUIdx = WI.LUIdx;
03742     LSRUse &LU = Uses[LUIdx];
03743     int64_t Imm = WI.Imm;
03744     const SCEV *OrigReg = WI.OrigReg;
03745 
03746     Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
03747     const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
03748     unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
03749 
03750     // TODO: Use a more targeted data structure.
03751     for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
03752       Formula F = LU.Formulae[L];
03753       // FIXME: The code for the scaled and unscaled registers looks
03754       // very similar but slightly different. Investigate if they
03755       // could be merged. That way, we would not have to unscale the
03756       // Formula.
03757       F.unscale();
03758       // Use the immediate in the scaled register.
03759       if (F.ScaledReg == OrigReg) {
03760         int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
03761         // Don't create 50 + reg(-50).
03762         if (F.referencesReg(SE.getSCEV(
03763                    ConstantInt::get(IntTy, -(uint64_t)Offset))))
03764           continue;
03765         Formula NewF = F;
03766         NewF.BaseOffset = Offset;
03767         if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
03768                         NewF))
03769           continue;
03770         NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
03771 
03772         // If the new scale is a constant in a register, and adding the constant
03773         // value to the immediate would produce a value closer to zero than the
03774         // immediate itself, then the formula isn't worthwhile.
03775         if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
03776           if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
03777               (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
03778                   .ule(std::abs(NewF.BaseOffset)))
03779             continue;
03780 
03781         // OK, looks good.
03782         NewF.canonicalize();
03783         (void)InsertFormula(LU, LUIdx, NewF);
03784       } else {
03785         // Use the immediate in a base register.
03786         for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
03787           const SCEV *BaseReg = F.BaseRegs[N];
03788           if (BaseReg != OrigReg)
03789             continue;
03790           Formula NewF = F;
03791           NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
03792           if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
03793                           LU.Kind, LU.AccessTy, NewF)) {
03794             if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
03795               continue;
03796             NewF = F;
03797             NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
03798           }
03799           NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
03800 
03801           // If the new formula has a constant in a register, and adding the
03802           // constant value to the immediate would produce a value closer to
03803           // zero than the immediate itself, then the formula isn't worthwhile.
03804           for (const SCEV *NewReg : NewF.BaseRegs)
03805             if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
03806               if ((C->getAPInt() + NewF.BaseOffset)
03807                       .abs()
03808                       .slt(std::abs(NewF.BaseOffset)) &&
03809                   (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
03810                       countTrailingZeros<uint64_t>(NewF.BaseOffset))
03811                 goto skip_formula;
03812 
03813           // Ok, looks good.
03814           NewF.canonicalize();
03815           (void)InsertFormula(LU, LUIdx, NewF);
03816           break;
03817         skip_formula:;
03818         }
03819       }
03820     }
03821   }
03822 }
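// A minimal sketch over hypothetical registers: suppose two uses reference
// {0,+,8}<%loop> and {16,+,8}<%loop>. Both strip to the same base
// {0,+,8}<%loop> with immediates 0 and 16, so a WorkItem with Imm = 16 is
// queued and the second use gains a formula over reg({0,+,8}<%loop>) with
// BaseOffset 16, letting both uses share one register and express the distance
// as an addressing-mode offset.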
03823 
03824 /// Generate formulae for each use.
03825 void
03826 LSRInstance::GenerateAllReuseFormulae() {
03827   // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
03828   // queries are more precise.
03829   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03830     LSRUse &LU = Uses[LUIdx];
03831     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03832       GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
03833     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03834       GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
03835   }
03836   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03837     LSRUse &LU = Uses[LUIdx];
03838     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03839       GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
03840     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03841       GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
03842     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03843       GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
03844     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03845       GenerateScales(LU, LUIdx, LU.Formulae[i]);
03846   }
03847   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03848     LSRUse &LU = Uses[LUIdx];
03849     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
03850       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
03851   }
03852 
03853   GenerateCrossUseConstantOffsets();
03854 
03855   DEBUG(dbgs() << "\n"
03856                   "After generating reuse formulae:\n";
03857         print_uses(dbgs()));
03858 }
03859 
03860 /// If there are multiple formulae with the same set of registers used
03861 /// by other uses, pick the best one and delete the others.
03862 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
03863   DenseSet<const SCEV *> VisitedRegs;
03864   SmallPtrSet<const SCEV *, 16> Regs;
03865   SmallPtrSet<const SCEV *, 16> LoserRegs;
03866 #ifndef NDEBUG
03867   bool ChangedFormulae = false;
03868 #endif
03869 
03870   // Collect the best formula for each unique set of shared registers. This
03871   // is reset for each use.
03872   typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
03873     BestFormulaeTy;
03874   BestFormulaeTy BestFormulae;
03875 
03876   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03877     LSRUse &LU = Uses[LUIdx];
03878     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
03879 
03880     bool Any = false;
03881     for (size_t FIdx = 0, NumForms = LU.Formulae.size();
03882          FIdx != NumForms; ++FIdx) {
03883       Formula &F = LU.Formulae[FIdx];
03884 
03885       // Some formulas are instant losers. For example, they may depend on
03886       // nonexistent AddRecs from other loops. These need to be filtered
03887       // immediately, otherwise heuristics could choose them over others leading
03888       // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
03889       // avoids the need to recompute this information across formulae using the
03890       // same bad AddRec. Passing LoserRegs is also essential unless we remove
03891       // the corresponding bad register from the Regs set.
03892       Cost CostF;
03893       Regs.clear();
03894       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
03895                         &LoserRegs);
03896       if (CostF.isLoser()) {
03897         // During initial formula generation, undesirable formulae are generated
03898         // by uses within other loops that have some non-trivial address mode or
03899         // use the postinc form of the IV. LSR needs to provide these formulae
03900         // as the basis of rediscovering the desired formula that uses an AddRec
03901         // corresponding to the existing phi. Once all formulae have been
03902         // generated, these initial losers may be pruned.
03903         DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
03904               dbgs() << "\n");
03905       }
03906       else {
03907         SmallVector<const SCEV *, 4> Key;
03908         for (const SCEV *Reg : F.BaseRegs) {
03909           if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
03910             Key.push_back(Reg);
03911         }
03912         if (F.ScaledReg &&
03913             RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
03914           Key.push_back(F.ScaledReg);
03915         // Unstable sort by host order ok, because this is only used for
03916         // uniquifying.
03917         std::sort(Key.begin(), Key.end());
03918 
03919         std::pair<BestFormulaeTy::const_iterator, bool> P =
03920           BestFormulae.insert(std::make_pair(Key, FIdx));
03921         if (P.second)
03922           continue;
03923 
03924         Formula &Best = LU.Formulae[P.first->second];
03925 
03926         Cost CostBest;
03927         Regs.clear();
03928         CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
03929                              DT, LU);
03930         if (CostF < CostBest)
03931           std::swap(F, Best);
03932         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
03933               dbgs() << "\n"
03934                         "    in favor of formula "; Best.print(dbgs());
03935               dbgs() << '\n');
03936       }
03937 #ifndef NDEBUG
03938       ChangedFormulae = true;
03939 #endif
03940       LU.DeleteFormula(F);
03941       --FIdx;
03942       --NumForms;
03943       Any = true;
03944     }
03945 
03946     // Now that we've filtered out some formulae, recompute the Regs set.
03947     if (Any)
03948       LU.RecomputeRegs(LUIdx, RegUses);
03949 
03950     // Reset this to prepare for the next use.
03951     BestFormulae.clear();
03952   }
03953 
03954   DEBUG(if (ChangedFormulae) {
03955           dbgs() << "\n"
03956                     "After filtering out undesirable candidates:\n";
03957           print_uses(dbgs());
03958         });
03959 }
03960 
03961 // This is a rough guess that seems to work fairly well.
03962 static const size_t ComplexityLimit = UINT16_MAX;
03963 
03964 /// Estimate the worst-case number of solutions the solver might have to
03965 /// consider. It almost never considers this many solutions because it prunes
03966 /// the search space, but the pruning isn't always sufficient.
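      /// For example, with three uses that have 4, 6, and 8 candidate formulae,
      /// the worst case is 4 * 6 * 8 = 192 candidate solutions; the estimate
      /// below stops growing once it reaches ComplexityLimit.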
03967 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
03968   size_t Power = 1;
03969   for (const LSRUse &LU : Uses) {
03970     size_t FSize = LU.Formulae.size();
03971     if (FSize >= ComplexityLimit) {
03972       Power = ComplexityLimit;
03973       break;
03974     }
03975     Power *= FSize;
03976     if (Power >= ComplexityLimit)
03977       break;
03978   }
03979   return Power;
03980 }
03981 
03982 /// When one formula uses a superset of the registers of another formula, it
03983 /// won't help reduce register pressure (though it may not necessarily hurt
03984 /// register pressure); remove it to simplify the system.
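      /// For example (illustrative only), a formula that keeps a constant 4 in its
      /// own base register is redundant next to one that folds the 4 into its
      /// immediate offset and otherwise uses the same registers.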
03985 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
03986   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
03987     DEBUG(dbgs() << "The search space is too complex.\n");
03988 
03989     DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
03990                     "which use a superset of registers used by other "
03991                     "formulae.\n");
03992 
03993     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
03994       LSRUse &LU = Uses[LUIdx];
03995       bool Any = false;
03996       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
03997         Formula &F = LU.Formulae[i];
03998         // Look for a formula with a constant or GV in a register. If the use
03999         // also has a formula with that same value in an immediate field,
04000         // delete the one that uses a register.
04001         for (SmallVectorImpl<const SCEV *>::const_iterator
04002              I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
04003           if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
04004             Formula NewF = F;
04005             NewF.BaseOffset += C->getValue()->getSExtValue();
04006             NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
04007                                 (I - F.BaseRegs.begin()));
04008             if (LU.HasFormulaWithSameRegs(NewF)) {
04009               DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
04010               LU.DeleteFormula(F);
04011               --i;
04012               --e;
04013               Any = true;
04014               break;
04015             }
04016           } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
04017             if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
04018               if (!F.BaseGV) {
04019                 Formula NewF = F;
04020                 NewF.BaseGV = GV;
04021                 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
04022                                     (I - F.BaseRegs.begin()));
04023                 if (LU.HasFormulaWithSameRegs(NewF)) {
04024                   DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
04025                         dbgs() << '\n');
04026                   LU.DeleteFormula(F);
04027                   --i;
04028                   --e;
04029                   Any = true;
04030                   break;
04031                 }
04032               }
04033           }
04034         }
04035       }
04036       if (Any)
04037         LU.RecomputeRegs(LUIdx, RegUses);
04038     }
04039 
04040     DEBUG(dbgs() << "After pre-selection:\n";
04041           print_uses(dbgs()));
04042   }
04043 }
04044 
04045 /// When there are many registers for expressions like A, A+1, A+2, etc.,
04046 /// allocate a single register for them.
04047 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
04048   if (EstimateSearchSpaceComplexity() < ComplexityLimit)
04049     return;
04050 
04051   DEBUG(dbgs() << "The search space is too complex.\n"
04052                   "Narrowing the search space by assuming that uses separated "
04053                   "by a constant offset will use the same registers.\n");
04054 
04055   // This is especially useful for unrolled loops.
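        // For example (illustrative only), after unrolling, one use may address
        // A[i], another A[i+1], and another A[i+2]; folding the +1 and +2 into the
        // uses' immediate offsets lets them all share the register holding the
        // address of A[i].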
04056 
04057   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
04058     LSRUse &LU = Uses[LUIdx];
04059     for (const Formula &F : LU.Formulae) {
04060       if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
04061         continue;
04062 
04063       LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
04064       if (!LUThatHas)
04065         continue;
04066 
04067       if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
04068                               LU.Kind, LU.AccessTy))
04069         continue;
04070 
04071       DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs()); dbgs() << '\n');
04072 
04073       LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
04074 
04075       // Update the relocs to reference the new use.
04076       for (LSRFixup &Fixup : Fixups) {
04077         if (Fixup.LUIdx == LUIdx) {
04078           Fixup.LUIdx = LUThatHas - &Uses.front();
04079           Fixup.Offset += F.BaseOffset;
04080           // Add the new offset to LUThatHas' offset list.
04081           if (LUThatHas->Offsets.back() != Fixup.Offset) {
04082             LUThatHas->Offsets.push_back(Fixup.Offset);
04083             if (Fixup.Offset > LUThatHas->MaxOffset)
04084               LUThatHas->MaxOffset = Fixup.Offset;
04085             if (Fixup.Offset < LUThatHas->MinOffset)
04086               LUThatHas->MinOffset = Fixup.Offset;
04087           }
04088           DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
04089         }
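              // DeleteUse below is assumed to move the last use into the vacated
              // slot, so fixups that referred to the last use are redirected to
              // LUIdx here.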
04090         if (Fixup.LUIdx == NumUses-1)
04091           Fixup.LUIdx = LUIdx;
04092       }
04093 
04094       // Delete formulae from the new use which are no longer legal.
04095       bool Any = false;
04096       for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
04097         Formula &F = LUThatHas->Formulae[i];
04098         if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
04099                         LUThatHas->Kind, LUThatHas->AccessTy, F)) {
04100           DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
04101                 dbgs() << '\n');
04102           LUThatHas->DeleteFormula(F);
04103           --i;
04104           --e;
04105           Any = true;
04106         }
04107       }
04108 
04109       if (Any)
04110         LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
04111 
04112       // Delete the old use.
04113       DeleteUse(LU, LUIdx);
04114       --LUIdx;
04115       --NumUses;
04116       break;
04117     }
04118   }
04119 
04120   DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
04121 }
04122 
04123 /// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
04124 /// we've done more filtering, as it may be able to find more formulae to
04125 /// eliminate.
04126 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
04127   if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
04128     DEBUG(dbgs() << "The search space is too complex.\n");
04129 
04130     DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
04131                     "undesirable dedicated registers.\n");
04132 
04133     FilterOutUndesirableDedicatedRegisters();
04134 
04135     DEBUG(dbgs() << "After pre-selection:\n";
04136           print_uses(dbgs()));
04137   }
04138 }
04139 
04140 /// Pick a register which seems likely to be profitable, and then in any use
04141 /// which has any reference to that register, delete all formulae which do not
04142 /// reference that register.
04143 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
04144   // With all other options exhausted, loop until the system is simple
04145   // enough to handle.
04146   SmallPtrSet<const SCEV *, 4> Taken;
04147   while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
04148     // Ok, we have too many formulae on our hands to conveniently handle.
04149     // Use a rough heuristic to thin out the list.
04150     DEBUG(dbgs() << "The search space is too complex.\n");
04151 
04152     // Pick the register which is used by the most LSRUses, which is likely
04153     // to be a good reuse register candidate.
04154     const SCEV *Best = nullptr;
04155     unsigned BestNum = 0;
04156     for (const SCEV *Reg : RegUses) {
04157       if (Taken.count(Reg))
04158         continue;
04159       if (!Best)
04160         Best = Reg;
04161       else {
04162         unsigned Count = RegUses.getUsedByIndices(Reg).count();
04163         if (Count > BestNum) {
04164           Best = Reg;
04165           BestNum = Count;
04166         }
04167       }
04168     }
04169 
04170     DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
04171                  << " will yield profitable reuse.\n");
04172     Taken.insert(Best);
04173 
04174     // In any use with formulae that reference this register, delete formulae
04175     // which don't reference it.
04176     for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
04177       LSRUse &LU = Uses[LUIdx];
04178       if (!LU.Regs.count(Best)) continue;
04179 
04180       bool Any = false;
04181       for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
04182         Formula &F = LU.Formulae[i];
04183         if (!F.referencesReg(Best)) {
04184           DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
04185           LU.DeleteFormula(F);
04186           --e;
04187           --i;
04188           Any = true;
04189           assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
04190           continue;
04191         }
04192       }
04193 
04194       if (Any)
04195         LU.RecomputeRegs(LUIdx, RegUses);
04196     }
04197 
04198     DEBUG(dbgs() << "After pre-selection:\n";
04199           print_uses(dbgs()));
04200   }
04201 }
04202 
04203 /// If there are an extraordinary number of formulae to choose from, use some
04204 /// rough heuristics to prune down the number of formulae. This keeps the main
04205 /// solver from taking an extraordinary amount of time in some worst-case
04206 /// scenarios.
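      /// Each heuristic below runs only while the estimated search space is still
      /// above ComplexityLimit, so the cheaper, less destructive pruning steps are
      /// tried before the more aggressive winner-register heuristic.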
04207 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
04208   NarrowSearchSpaceByDetectingSupersets();
04209   NarrowSearchSpaceByCollapsingUnrolledCode();
04210   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
04211   NarrowSearchSpaceByPickingWinnerRegs();
04212 }
04213 
04214 /// This is the recursive solver.
04215 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
04216                                Cost &SolutionCost,
04217                                SmallVectorImpl<const Formula *> &Workspace,
04218                                const Cost &CurCost,
04219                                const SmallPtrSet<const SCEV *, 16> &CurRegs,
04220                                DenseSet<const SCEV *> &VisitedRegs) const {
04221   // Some ideas:
04222   //  - prune more:
04223   //    - use more aggressive filtering
04224   //    - sort the formulae so that the most profitable solutions are found first
04225   //    - sort the uses too
04226   //  - search faster:
04227   //    - don't compute a cost, and then compare. compare while computing a cost
04228   //      and bail early.
04229   //    - track register sets with SmallBitVector
04230 
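        // Workspace holds the formula chosen so far for each of the first
        // Workspace.size() uses; this invocation picks a formula for the next use,
        // and a solution is complete once every use has one.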
04231   const LSRUse &LU = Uses[Workspace.size()];
04232 
04233   // If this use references any register that's already a part of the
04234   // in-progress solution, consider it a requirement that a formula must
04235   // reference that register in order to be considered. This prunes out
04236   // unprofitable searching.
04237   SmallSetVector<const SCEV *, 4> ReqRegs;
04238   for (const SCEV *S : CurRegs)
04239     if (LU.Regs.count(S))
04240       ReqRegs.insert(S);
04241 
04242   SmallPtrSet<const SCEV *, 16> NewRegs;
04243   Cost NewCost;
04244   for (const Formula &F : LU.Formulae) {
04245     // Ignore formulae which may not be ideal in terms of register reuse of
04246     // ReqRegs.  The formula should use all required registers before
04247     // introducing new ones.
04248     int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
04249     for (const SCEV *Reg : ReqRegs) {
04250       if ((F.ScaledReg && F.ScaledReg == Reg) ||
04251           std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
04252           F.BaseRegs.end()) {
04253         --NumReqRegsToFind;
04254         if (NumReqRegsToFind == 0)
04255           break;
04256       }
04257     }
04258     if (NumReqRegsToFind != 0) {
04259       // If none of the formulae satisfied the required registers, then we could
04260       // clear ReqRegs and try again. Currently, we simply give up in this case.
04261       continue;
04262     }
04263 
04264     // Evaluate the cost of the current formula. If it's already worse than
04265     // the current best, prune the search at that point.
04266     NewCost = CurCost;
04267     NewRegs = CurRegs;
04268     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
04269                         LU);
04270     if (NewCost < SolutionCost) {
04271       Workspace.push_back(&F);
04272       if (Workspace.size() != Uses.size()) {
04273         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
04274                      NewRegs, VisitedRegs);
04275         if (F.getNumRegs() == 1 && Workspace.size() == 1)
04276           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
04277       } else {
04278         DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
04279               dbgs() << ".\n Regs:";
04280               for (const SCEV *S : NewRegs)
04281                 dbgs() << ' ' << *S;
04282               dbgs() << '\n');
04283 
04284         SolutionCost = NewCost;
04285         Solution = Workspace;
04286       }
04287       Workspace.pop_back();
04288     }
04289   }
04290 }
04291 
04292 /// Choose one formula from each use. Return the results in the given Solution
04293 /// vector.
04294 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
04295   SmallVector<const Formula *, 8> Workspace;
04296   Cost SolutionCost;
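        // Start from an explicitly losing cost so that the first complete solution
        // found by SolveRecurse is accepted and subsequently only improved upon.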
04297   SolutionCost.Lose();
04298   Cost CurCost;
04299   SmallPtrSet<const SCEV *, 16> CurRegs;
04300   DenseSet<const SCEV *> VisitedRegs;
04301   Workspace.reserve(Uses.size());
04302 
04303   // SolveRecurse does all the work.
04304   SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
04305                CurRegs, VisitedRegs);
04306   if (Solution.empty()) {
04307     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
04308     return;
04309   }
04310 
04311   // Ok, we've now made all our decisions.
04312   DEBUG(dbgs() << "\n"
04313                   "The chosen solution requires "; SolutionCost.print(dbgs());
04314         dbgs() << ":\n";
04315         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
04316           dbgs() << "  ";
04317           Uses[i].print(dbgs());
04318           dbgs() << "\n"
04319                     "    ";
04320           Solution[i]->print(dbgs());
04321           dbgs() << '\n';
04322         });
04323 
04324   assert(Solution.size() == Uses.size() && "Malformed solution!");
04325 }
04326 
04327 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
04328 /// as we can go while still being dominated by the input positions. This helps
04329 /// canonicalize the insert position, which encourages sharing.
04330 BasicBlock::iterator
04331 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
04332                                  const SmallVectorImpl<Instruction *> &Inputs)
04333                                                                          const {
04334   for (;;) {
04335     const Loop *IPLoop = LI.getLoopFor(IP->getParent());
04336     unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
04337 
04338     BasicBlock *IDom;
04339     for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
04340       if (!Rung) return IP;
04341       Rung = Rung->getIDom();
04342       if (!Rung) return IP;
04343       IDom = Rung->getBlock();
04344 
04345       // Don't climb into a loop though.
04346       const Loop *IDomLoop = LI.getLoopFor(IDom);
04347       unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
04348       if (IDomDepth <= IPLoopDepth &&
04349           (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
04350         break;
04351     }
04352 
04353     bool AllDominate = true;
04354     Instruction *BetterPos = nullptr;
04355     Instruction *Tentative = IDom->getTerminator();
04356     for (Instruction *Inst : Inputs) {
04357       if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
04358         AllDominate = false;
04359         break;
04360       }
04361       // Attempt to find an insert position in the middle of the block,
04362       // instead of at the end, so that it can be used for other expansions.
04363       if (IDom == Inst->getParent() &&
04364           (!BetterPos || !DT.dominates(Inst, BetterPos)))
04365         BetterPos = &*std::next(BasicBlock::iterator(Inst));
04366     }
04367     if (!AllDominate)
04368       break;
04369     if (BetterPos)
04370       IP = BetterPos->getIterator();
04371     else
04372       IP = Tentative->getIterator();
04373   }
04374 
04375   return IP;
04376 }
04377 
04378 /// Determine an input position which will be dominated by the operands and
04379 /// which will dominate the result.
04380 BasicBlock::iterator
04381 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
04382                                            const LSRFixup &LF,
04383                                            const LSRUse &LU,
04384                                            SCEVExpander &Rewriter) const {
04385   // Collect some instructions which must be dominated by the
04386   // expanding replacement. These must be dominated by any operands that
04387   // will be required in the expansion.
04388   SmallVector<Instruction *, 4> Inputs;
04389   if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
04390     Inputs.push_back(I);
04391   if (LU.Kind == LSRUse::ICmpZero)
04392     if (Instruction *I =
04393           dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
04394       Inputs.push_back(I);
04395   if (LF.PostIncLoops.count(L)) {
04396     if (LF.isUseFullyOutsideLoop(L))
04397       Inputs.push_back(L->getLoopLatch()->getTerminator());
04398     else
04399       Inputs.push_back(IVIncInsertPos);
04400   }
04401   // The expansion must also be dominated by the increment positions of any
04402   // loops for which it is using post-inc mode.
04403   for (const Loop *PIL : LF.PostIncLoops) {
04404     if (PIL == L) continue;
04405 
04406     // Be dominated by the loop exit.
04407     SmallVector<BasicBlock *, 4> ExitingBlocks;
04408     PIL->getExitingBlocks(ExitingBlocks);
04409     if (!ExitingBlocks.empty()) {
04410       BasicBlock *BB = ExitingBlocks[0];
04411       for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
04412         BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
04413       Inputs.push_back(BB->getTerminator());
04414     }
04415   }
04416 
04417   assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
04418          && !isa<DbgInfoIntrinsic>(LowestIP) &&
04419          "Insertion point must be a normal instruction");
04420 
04421   // Then, climb up the immediate dominator tree as far as we can go while
04422   // still being dominated by the input positions.
04423   BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
04424 
04425   // Don't insert instructions before PHI nodes.
04426   while (isa<PHINode>(IP)) ++IP;
04427 
04428   // Ignore landingpad instructions.
04429   while (!isa<TerminatorInst>(IP) && IP->isEHPad()) ++IP;
04430 
04431   // Ignore debug intrinsics.
04432   while (isa<DbgInfoIntrinsic>(IP)) ++IP;
04433 
04434   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
04435   // IP consistent across expansions and allows the previously inserted
04436   // instructions to be reused by subsequent expansion.
04437   while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
04438     ++IP;
04439 
04440   return IP;
04441 }
04442 
04443 /// Emit instructions for the leading candidate expression for this LSRUse (this
04444 /// is called "expanding").
04445 Value *LSRInstance::Expand(const LSRFixup &LF,
04446                            const Formula &F,
04447                            BasicBlock::iterator IP,
04448                            SCEVExpander &Rewriter,
04449                            SmallVectorImpl<WeakVH> &DeadInsts) const {
04450   const LSRUse &LU = Uses[LF.LUIdx];
04451   if (LU.RigidFormula)
04452     return LF.OperandValToReplace;
04453 
04454   // Determine an input position which will be dominated by the operands and
04455   // which will dominate the result.
04456   IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
04457 
04458   // Inform the Rewriter if we have a post-increment use, so that it can
04459   // perform an advantageous expansion.
04460   Rewriter.setPostInc(LF.PostIncLoops);
04461 
04462   // This is the type that the user actually needs.
04463   Type *OpTy = LF.OperandValToReplace->getType();
04464   // This will be the type that we'll initially expand to.
04465   Type *Ty = F.getType();
04466   if (!Ty)
04467     // No type known; just expand directly to the ultimate type.
04468     Ty = OpTy;
04469   else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
04470     // Expand directly to the ultimate type if it's the right size.
04471     Ty = OpTy;
04472   // This is the type to do integer arithmetic in.
04473   Type *IntTy = SE.getEffectiveSCEVType(Ty);
04474 
04475   // Build up a list of operands to add together to form the full base.
04476   SmallVector<const SCEV *, 8> Ops;
04477 
04478   // Expand the BaseRegs portion.
04479   for (const SCEV *Reg : F.BaseRegs) {
04480     assert(!Reg->isZero() && "Zero allocated in a base register!");
04481 
04482     // If we're expanding for a post-inc user, make the post-inc adjustment.
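          // (Denormalization is understood to rewrite the expression in terms of
          // the IV value after the increment, which is the form a post-inc user
          // actually sees.)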
04483     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
04484     Reg = TransformForPostIncUse(Denormalize, Reg,
04485                                  LF.UserInst, LF.OperandValToReplace,
04486                                  Loops, SE, DT);
04487 
04488     Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, &*IP)));
04489   }
04490 
04491   // Expand the ScaledReg portion.
04492   Value *ICmpScaledV = nullptr;
04493   if (F.Scale != 0) {
04494     const SCEV *ScaledS = F.ScaledReg;
04495 
04496     // If we're expanding for a post-inc user, make the post-inc adjustment.
04497     PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
04498     ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
04499                                      LF.UserInst, LF.OperandValToReplace,
04500                                      Loops, SE, DT);
04501 
04502     if (LU.Kind == LSRUse::ICmpZero) {
04503       // Expand ScaledReg as if it were part of the base regs.
04504       if (F.Scale == 1)
04505         Ops.push_back(
04506             SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP)));
04507       else {
04508         // An interesting way of "folding" with an icmp is to use a negated
04509         // scale, which we'll implement by inserting it into the other operand
04510         // of the icmp.
04511         assert(F.Scale == -1 &&
04512                "The only scale supported by ICmpZero uses is -1!");
04513         ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, &*IP);
04514       }
04515     } else {
04516       // Otherwise just expand the scaled register and an explicit scale,
04517       // which is expected to be matched as part of the address.
04518 
04519       // Flush the operand list to suppress SCEVExpander hoisting address modes,
04520       // unless the addressing mode will not be folded.
04521       if (!Ops.empty() && LU.Kind == LSRUse::Address &&
04522           isAMCompletelyFolded(TTI, LU, F)) {
04523         Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
04524         Ops.clear();
04525         Ops.push_back(SE.getUnknown(FullV));
04526       }
04527       ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, &*IP));
04528       if (F.Scale != 1)
04529         ScaledS =
04530             SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
04531       Ops.push_back(ScaledS);
04532     }
04533   }
04534 
04535   // Expand the GV portion.
04536   if (F.BaseGV) {
04537     // Flush the operand list to suppress SCEVExpander hoisting.
04538     if (!Ops.empty()) {
04539       Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
04540       Ops.clear();
04541       Ops.push_back(SE.getUnknown(FullV));
04542     }
04543     Ops.push_back(SE.getUnknown(F.BaseGV));
04544   }
04545 
04546   // Flush the operand list to suppress SCEVExpander hoisting of both folded and
04547   // unfolded offsets. LSR assumes they both live next to their uses.
04548   if (!Ops.empty()) {
04549     Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, &*IP);
04550     Ops.clear();
04551     Ops.push_back(SE.getUnknown(FullV));
04552   }
04553 
04554   // Expand the immediate portion.
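        // The sum is computed in uint64_t so that any overflow wraps rather than
        // being undefined signed overflow, and is then treated as a signed offset.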
04555   int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
04556   if (Offset != 0) {
04557     if (LU.Kind == LSRUse::ICmpZero) {
04558       // The other interesting way of "folding" with an ICmpZero is to use a
04559       // negated immediate.
04560       if (!ICmpScaledV)
04561         ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
04562       else {
04563         Ops.push_back(SE.getUnknown(ICmpScaledV));
04564         ICmpScaledV = ConstantInt::get(IntTy, Offset);
04565       }
04566     } else {
04567       // Just add the immediate values. These again are expected to be matched
04568       // as part of the address.
04569       Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
04570     }
04571   }
04572 
04573   // Expand the unfolded offset portion.
04574   int64_t UnfoldedOffset = F.UnfoldedOffset;
04575   if (UnfoldedOffset != 0) {
04576     // Just add the immediate values.
04577     Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
04578                                                        UnfoldedOffset)));
04579   }
04580 
04581   // Emit instructions summing all the operands.
04582   const SCEV *FullS = Ops.empty() ?
04583                       SE.getConstant(IntTy, 0) :
04584                       SE.getAddExpr(Ops);
04585   Value *FullV = Rewriter.expandCodeFor(FullS, Ty, &*IP);
04586 
04587   // We're done expanding now, so reset the rewriter.
04588   Rewriter.clearPostInc();
04589 
04590   // An ICmpZero Formula represents an ICmp which we're handling as a
04591   // comparison against zero. Now that we've expanded an expression for that
04592   // form, update the ICmp's other operand.
04593   if (LU.Kind == LSRUse::ICmpZero) {
04594     ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
04595     DeadInsts.emplace_back(CI->getOperand(1));
04596     assert(!F.BaseGV && "ICmp does not support folding a global value and "
04597                            "a scale at the same time!");
04598     if (F.Scale == -1) {
04599       if (ICmpScaledV->getType() != OpTy) {
04600         Instruction *Cast =
04601           CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
04602                                                    OpTy, false),
04603                            ICmpScaledV, OpTy, "tmp", CI);
04604         ICmpScaledV = Cast;
04605       }
04606       CI->setOperand(1, ICmpScaledV);
04607     } else {
04608       // A scale of 1 means that the scale has been expanded as part of the
04609       // base regs.
04610       assert((F.Scale == 0 || F.Scale == 1) &&
04611              "ICmp does not support folding a global value and "
04612              "a scale at the same time!");
04613       Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
04614                                            -(uint64_t)Offset);
04615       if (C->getType() != OpTy)
04616         C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
04617                                                           OpTy, false),
04618                                   C, OpTy);
04619 
04620       CI->setOperand(1, C);
04621     }
04622   }
04623 
04624   return FullV;
04625 }
04626 
04627 /// Helper for Rewrite. PHI nodes are special because the use of their operands
04628 /// effectively happens in their predecessor blocks, so the expression may need
04629 /// to be expanded in multiple places.
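      /// For example (illustrative only), in
      ///   %p = phi [ %v, %pred1 ], [ %v, %pred2 ]
      /// the value %v is really consumed at the ends of %pred1 and %pred2, so a
      /// replacement for %v may need to be expanded once per incoming block
      /// (possibly in freshly split critical-edge blocks).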
04630 void LSRInstance::RewriteForPHI(PHINode *PN,
04631                                 const LSRFixup &LF,
04632                                 const Formula &F,
04633                                 SCEVExpander &Rewriter,
04634                                 SmallVectorImpl<WeakVH> &DeadInsts) const {
04635   DenseMap<BasicBlock *, Value *> Inserted;
04636   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
04637     if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
04638       BasicBlock *BB = PN->getIncomingBlock(i);
04639 
04640       // If this is a critical edge, split the edge so that we do not insert
04641       // the code on all predecessor/successor paths.  We do this unless this
04642       // is the canonical backedge for this loop, which complicates post-inc
04643       // users.
04644       if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
04645           !isa<IndirectBrInst>(BB->getTerminator())) {
04646         BasicBlock *Parent = PN->getParent();
04647         Loop *PNLoop = LI.getLoopFor(Parent);
04648         if (!PNLoop || Parent != PNLoop->getHeader()) {
04649           // Split the critical edge.
04650           BasicBlock *NewBB = nullptr;
04651           if (!Parent->isLandingPad()) {
04652             NewBB = SplitCriticalEdge(BB, Parent,
04653                                       CriticalEdgeSplittingOptions(&DT, &LI)
04654                                           .setMergeIdenticalEdges()
04655                                           .setDontDeleteUselessPHIs());
04656           } else {
04657             SmallVector<BasicBlock*, 2> NewBBs;
04658             SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
04659             NewBB = NewBBs[0];
04660           }
04661           // If NewBB==NULL, then SplitCriticalEdge refused to split because all
04662           // phi predecessors are identical. The simple thing to do is skip
04663           // splitting in this case rather than complicate the API.
04664           if (NewBB) {
04665             // If PN is outside of the loop and BB is in the loop, we want to
04666             // move the block to be immediately before the PHI block, not
04667             // immediately after BB.
04668             if (L->contains(BB) && !L->contains(PN))
04669               NewBB->moveBefore(PN->getParent());
04670 
04671             // Splitting the edge can reduce the number of PHI entries we have.
04672             e = PN->getNumIncomingValues();
04673             BB = NewBB;
04674             i = PN->getBasicBlockIndex(BB);
04675           }
04676         }
04677       }
04678 
04679       std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
04680         Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
04681       if (!Pair.second)
04682         PN->setIncomingValue(i, Pair.first->second);
04683       else {
04684         Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
04685                               Rewriter, DeadInsts);
04686 
04687         // If this is reuse-by-noop-cast, insert the noop cast.
04688         Type *OpTy = LF.OperandValToReplace->getType();
04689         if (FullV->getType() != OpTy)
04690           FullV =
04691             CastInst::Create(CastInst::getCastOpcode(FullV, false,
04692                                                      OpTy, false),
04693                              FullV, LF.OperandValToReplace->getType(),
04694                              "tmp", BB->getTerminator());
04695 
04696         PN->setIncomingValue(i, FullV);
04697         Pair.first->second = FullV;
04698       }
04699     }
04700 }
04701 
04702 /// Emit instructions for the leading candidate expression for this LSRUse (this
04703 /// is called "expanding"), and update the UserInst to reference the newly
04704 /// expanded value.
04705 void LSRInstance::Rewrite(const LSRFixup &LF,
04706                           const Formula &F,
04707                           SCEVExpander &Rewriter,
04708                           SmallVectorImpl<WeakVH> &DeadInsts) const {
04709   // First, find an insertion point that dominates UserInst. For PHI nodes,
04710   // find the nearest block which dominates all the relevant uses.
04711   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
04712     RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);
04713   } else {
04714     Value *FullV =
04715         Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
04716 
04717     // If this is reuse-by-noop-cast, insert the noop cast.
04718     Type *OpTy = LF.OperandValToReplace->getType();
04719     if (FullV->getType() != OpTy) {
04720       Instruction *Cast =
04721         CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
04722                          FullV, OpTy, "tmp", LF.UserInst);
04723       FullV = Cast;
04724     }
04725 
04726     // Update the user. ICmpZero is handled specially here (for now) because
04727     // Expand may have updated one of the operands of the icmp already, and
04728     // its new value may happen to be equal to LF.OperandValToReplace, in
04729     // which case doing replaceUsesOfWith leads to replacing both operands
04730     // with the same value. TODO: Reorganize this.
04731     if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
04732       LF.UserInst->setOperand(0, FullV);
04733     else
04734       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
04735   }
04736 
04737   DeadInsts.emplace_back(LF.OperandValToReplace);
04738 }
04739 
04740 /// Rewrite all the fixup locations with new values, following the chosen
04741 /// solution.
04742 void LSRInstance::ImplementSolution(
04743     const SmallVectorImpl<const Formula *> &Solution) {
04744   // Keep track of instructions we may have made dead, so that
04745   // we can remove them after we are done working.
04746   SmallVector<WeakVH, 16> DeadInsts;
04747 
04748   SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
04749                         "lsr");
04750 #ifndef NDEBUG
04751   Rewriter.setDebugType(DEBUG_TYPE);
04752 #endif
04753   Rewriter.disableCanonicalMode();
04754   Rewriter.enableLSRMode();
04755   Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
04756 
04757   // Mark phi nodes that terminate chains so the expander tries to reuse them.
04758   for (const IVChain &Chain : IVChainVec) {
04759     if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
04760       Rewriter.setChainedPhi(PN);
04761   }
04762 
04763   // Expand the new value definitions and update the users.
04764   for (const LSRFixup &Fixup : Fixups) {
04765     Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);
04766 
04767     Changed = true;
04768   }
04769 
04770   for (const IVChain &Chain : IVChainVec) {
04771     GenerateIVChain(Chain, Rewriter, DeadInsts);
04772     Changed = true;
04773   }
04774   // Clean up after ourselves. This must be done before deleting any
04775   // instructions.
04776   Rewriter.clear();
04777 
04778   Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
04779 }
04780 
04781 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
04782                          DominatorTree &DT, LoopInfo &LI,
04783                          const TargetTransformInfo &TTI)
04784     : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
04785       IVIncInsertPos(nullptr) {
04786   // If LoopSimplify form is not available, stay out of trouble.
04787   if (!L->isLoopSimplifyForm())
04788     return;
04789 
04790   // If there's no interesting work to be done, bail early.
04791   if (IU.empty()) return;
04792 
04793   // If there's too much analysis to be done, bail early. We won't be able to
04794   // model the problem anyway.
04795   unsigned NumUsers = 0;
04796   for (const IVStrideUse &U : IU) {
04797     if (++NumUsers > MaxIVUsers) {
04798       (void)U;
04799       DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
04800       return;
04801     }
04802   }
04803 
04804 #ifndef NDEBUG
04805   // All dominating loops must have preheaders, or SCEVExpander may not be able
04806   // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
04807   //
04808   // IVUsers analysis should only create users that are dominated by simple loop
04809   // headers. Since this loop should dominate all of its users, its user list
04810   // should be empty if this loop itself is not within a simple loop nest.
04811   for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
04812        Rung; Rung = Rung->getIDom()) {
04813     BasicBlock *BB = Rung->getBlock();
04814     const Loop *DomLoop = LI.getLoopFor(BB);
04815     if (DomLoop && DomLoop->getHeader() == BB) {
04816       assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
04817     }
04818   }
04819 #endif // NDEBUG
04820 
04821   DEBUG(dbgs() << "\nLSR on loop ";
04822         L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
04823         dbgs() << ":\n");
04824 
04825   // First, perform some low-level loop optimizations.
04826   OptimizeShadowIV();
04827   OptimizeLoopTermCond();
04828 
04829   // If loop preparation eliminates all interesting IV users, bail.
04830   if (IU.empty()) return;
04831 
04832   // Skip nested loops until we can model them better with formulae.
04833   if (!L->empty()) {
04834     DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
04835     return;
04836   }
04837 
04838   // Start collecting data and preparing for the solver.
04839   CollectChains();
04840   CollectInterestingTypesAndFactors();
04841   CollectFixupsAndInitialFormulae();
04842   CollectLoopInvariantFixupsAndFormulae();
04843 
04844   assert(!Uses.empty() && "IVUsers reported at least one use");
04845   DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
04846         print_uses(dbgs()));
04847 
04848   // Now use the reuse data to generate a bunch of interesting ways
04849   // to formulate the values needed for the uses.
04850   GenerateAllReuseFormulae();
04851 
04852   FilterOutUndesirableDedicatedRegisters();
04853   NarrowSearchSpaceUsingHeuristics();
04854 
04855   SmallVector<const Formula *, 8> Solution;
04856   Solve(Solution);
04857 
04858   // Release memory that is no longer needed.
04859   Factors.clear();
04860   Types.clear();
04861   RegUses.clear();
04862 
04863   if (Solution.empty())
04864     return;
04865 
04866 #ifndef NDEBUG
04867   // Formulae should be legal.
04868   for (const LSRUse &LU : Uses) {
04869     for (const Formula &F : LU.Formulae)
04870       assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
04871                         F) && "Illegal formula generated!");
04872   }
04873 #endif
04874 
04875   // Now that we've decided what we want, make it so.
04876   ImplementSolution(Solution);
04877 }
04878 
04879 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
04880   if (Factors.empty() && Types.empty()) return;
04881 
04882   OS << "LSR has identified the following interesting factors and types: ";
04883   bool First = true;
04884 
04885   for (int64_t Factor : Factors) {
04886     if (!First) OS << ", ";
04887     First = false;
04888     OS << '*' << Factor;
04889   }
04890 
04891   for (Type *Ty : Types) {
04892     if (!First) OS << ", ";
04893     First = false;
04894     OS << '(' << *Ty << ')';
04895   }
04896   OS << '\n';
04897 }
04898 
04899 void LSRInstance::print_fixups(raw_ostream &OS) const {
04900   OS << "LSR is examining the following fixup sites:\n";
04901   for (const LSRFixup &LF : Fixups) {
04902     OS << "  ";
04903     LF.print(OS);
04904     OS << '\n';
04905   }
04906 }
04907 
04908 void LSRInstance::print_uses(raw_ostream &OS) const {
04909   OS << "LSR is examining the following uses:\n";
04910   for (const LSRUse &LU : Uses) {
04911     OS << "  ";
04912     LU.print(OS);
04913     OS << '\n';
04914     for (const Formula &F : LU.Formulae) {
04915       OS << "    ";
04916       F.print(OS);
04917       OS << '\n';
04918     }
04919   }
04920 }
04921 
04922 void LSRInstance::print(raw_ostream &OS) const {
04923   print_factors_and_types(OS);
04924   print_fixups(OS);
04925   print_uses(OS);
04926 }
04927 
04928 LLVM_DUMP_METHOD
04929 void LSRInstance::dump() const {
04930   print(errs()); errs() << '\n';
04931 }
04932 
04933 namespace {
04934 
04935 class LoopStrengthReduce : public LoopPass {
04936 public:
04937   static char ID; // Pass ID, replacement for typeid
04938   LoopStrengthReduce();
04939 
04940 private:
04941   bool runOnLoop(Loop *L, LPPassManager &LPM) override;
04942   void getAnalysisUsage(AnalysisUsage &AU) const override;
04943 };
04944 
04945 }
04946 
04947 char LoopStrengthReduce::ID = 0;
04948 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
04949                 "Loop Strength Reduction", false, false)
04950 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
04951 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
04952 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
04953 INITIALIZE_PASS_DEPENDENCY(IVUsers)
04954 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
04955 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
04956 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
04957                 "Loop Strength Reduction", false, false)
04958 
04959 
04960 Pass *llvm::createLoopStrengthReducePass() {
04961   return new LoopStrengthReduce();
04962 }
04963 
04964 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
04965   initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
04966 }
04967 
04968 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
04969   // We split critical edges, so we change the CFG.  However, we do update
04970   // many analyses if they are around.
04971   AU.addPreservedID(LoopSimplifyID);
04972 
04973   AU.addRequired<LoopInfoWrapperPass>();
04974   AU.addPreserved<LoopInfoWrapperPass>();
04975   AU.addRequiredID(LoopSimplifyID);
04976   AU.addRequired<DominatorTreeWrapperPass>();
04977   AU.addPreserved<DominatorTreeWrapperPass>();
04978   AU.addRequired<ScalarEvolutionWrapperPass>();
04979   AU.addPreserved<ScalarEvolutionWrapperPass>();
04980   // Requiring LoopSimplify a second time here prevents IVUsers from running
04981   // twice, since LoopSimplify was invalidated by running ScalarEvolution.
04982   AU.addRequiredID(LoopSimplifyID);
04983   AU.addRequired<IVUsers>();
04984   AU.addPreserved<IVUsers>();
04985   AU.addRequired<TargetTransformInfoWrapperPass>();
04986 }
04987 
04988 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
04989   if (skipOptnoneFunction(L))
04990     return false;
04991 
04992   auto &IU = getAnalysis<IVUsers>();
04993   auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
04994   auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
04995   auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
04996   const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
04997       *L->getHeader()->getParent());
04998   bool Changed = false;
04999 
05000   // Run the main LSR transformation.
05001   Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
05002 
05003   // Remove any extra phis created by processing inner loops.
05004   Changed |= DeleteDeadPHIs(L->getHeader());
05005   if (EnablePhiElim && L->isLoopSimplifyForm()) {
05006     SmallVector<WeakVH, 16> DeadInsts;
05007     const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
05008     SCEVExpander Rewriter(getAnalysis<ScalarEvolutionWrapperPass>().getSE(), DL,
05009                           "lsr");
05010 #ifndef NDEBUG
05011     Rewriter.setDebugType(DEBUG_TYPE);
05012 #endif
05013     unsigned numFolded = Rewriter.replaceCongruentIVs(
05014         L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts,
05015         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
05016             *L->getHeader()->getParent()));
05017     if (numFolded) {
05018       Changed = true;
05019       DeleteTriviallyDeadInstructions(DeadInsts);
05020       DeleteDeadPHIs(L->getHeader());
05021     }
05022   }
05023   return Changed;
05024 }