LLVM API Documentation
00001 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This transformation analyzes and transforms the induction variables (and 00011 // computations derived from them) into forms suitable for efficient execution 00012 // on the target. 00013 // 00014 // This pass performs a strength reduction on array references inside loops that 00015 // have as one or more of their components the loop induction variable, it 00016 // rewrites expressions to take advantage of scaled-index addressing modes 00017 // available on the target, and it performs a variety of other optimizations 00018 // related to loop induction variables. 00019 // 00020 // Terminology note: this code has a lot of handling for "post-increment" or 00021 // "post-inc" users. This is not talking about post-increment addressing modes; 00022 // it is instead talking about code like this: 00023 // 00024 // %i = phi [ 0, %entry ], [ %i.next, %latch ] 00025 // ... 00026 // %i.next = add %i, 1 00027 // %c = icmp eq %i.next, %n 00028 // 00029 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however 00030 // it's useful to think about these as the same register, with some uses using 00031 // the value of the register before the add and some using // it after. In this 00032 // example, the icmp is a post-increment user, since it uses %i.next, which is 00033 // the value of the induction variable after the increment. The other common 00034 // case of post-increment users is users outside the loop. 00035 // 00036 // TODO: More sophistication in the way Formulae are generated and filtered. 00037 // 00038 // TODO: Handle multiple loops at a time. 00039 // 00040 // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead 00041 // of a GlobalValue? 00042 // 00043 // TODO: When truncation is free, truncate ICmp users' operands to make it a 00044 // smaller encoding (on x86 at least). 00045 // 00046 // TODO: When a negated register is used by an add (such as in a list of 00047 // multiple base registers, or as the increment expression in an addrec), 00048 // we may not actually need both reg and (-1 * reg) in registers; the 00049 // negation can be implemented by using a sub instead of an add. The 00050 // lack of support for taking this into consideration when making 00051 // register pressure decisions is partly worked around by the "Special" 00052 // use kind. 00053 // 00054 //===----------------------------------------------------------------------===// 00055 00056 #define DEBUG_TYPE "loop-reduce" 00057 #include "llvm/Transforms/Scalar.h" 00058 #include "llvm/ADT/DenseSet.h" 00059 #include "llvm/ADT/SetVector.h" 00060 #include "llvm/ADT/SmallBitVector.h" 00061 #include "llvm/ADT/STLExtras.h" 00062 #include "llvm/Analysis/Dominators.h" 00063 #include "llvm/Analysis/IVUsers.h" 00064 #include "llvm/Analysis/LoopPass.h" 00065 #include "llvm/Analysis/ScalarEvolutionExpander.h" 00066 #include "llvm/Analysis/TargetTransformInfo.h" 00067 #include "llvm/Assembly/Writer.h" 00068 #include "llvm/IR/Constants.h" 00069 #include "llvm/IR/DerivedTypes.h" 00070 #include "llvm/IR/Instructions.h" 00071 #include "llvm/IR/IntrinsicInst.h" 00072 #include "llvm/Support/CommandLine.h" 00073 #include "llvm/Support/Debug.h" 00074 #include "llvm/Support/ValueHandle.h" 00075 #include "llvm/Support/raw_ostream.h" 00076 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 00077 #include "llvm/Transforms/Utils/Local.h" 00078 #include <algorithm> 00079 using namespace llvm; 00080 00081 /// MaxIVUsers is an arbitrary threshold that provides an early opportunitiy for 00082 /// bail out. This threshold is far beyond the number of users that LSR can 00083 /// conceivably solve, so it should not affect generated code, but catches the 00084 /// worst cases before LSR burns too much compile time and stack space. 00085 static const unsigned MaxIVUsers = 200; 00086 00087 // Temporary flag to cleanup congruent phis after LSR phi expansion. 00088 // It's currently disabled until we can determine whether it's truly useful or 00089 // not. The flag should be removed after the v3.0 release. 00090 // This is now needed for ivchains. 00091 static cl::opt<bool> EnablePhiElim( 00092 "enable-lsr-phielim", cl::Hidden, cl::init(true), 00093 cl::desc("Enable LSR phi elimination")); 00094 00095 #ifndef NDEBUG 00096 // Stress test IV chain generation. 00097 static cl::opt<bool> StressIVChain( 00098 "stress-ivchain", cl::Hidden, cl::init(false), 00099 cl::desc("Stress test LSR IV chains")); 00100 #else 00101 static bool StressIVChain = false; 00102 #endif 00103 00104 namespace { 00105 00106 /// RegSortData - This class holds data which is used to order reuse candidates. 00107 class RegSortData { 00108 public: 00109 /// UsedByIndices - This represents the set of LSRUse indices which reference 00110 /// a particular register. 00111 SmallBitVector UsedByIndices; 00112 00113 RegSortData() {} 00114 00115 void print(raw_ostream &OS) const; 00116 void dump() const; 00117 }; 00118 00119 } 00120 00121 void RegSortData::print(raw_ostream &OS) const { 00122 OS << "[NumUses=" << UsedByIndices.count() << ']'; 00123 } 00124 00125 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00126 void RegSortData::dump() const { 00127 print(errs()); errs() << '\n'; 00128 } 00129 #endif 00130 00131 namespace { 00132 00133 /// RegUseTracker - Map register candidates to information about how they are 00134 /// used. 00135 class RegUseTracker { 00136 typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; 00137 00138 RegUsesTy RegUsesMap; 00139 SmallVector<const SCEV *, 16> RegSequence; 00140 00141 public: 00142 void CountRegister(const SCEV *Reg, size_t LUIdx); 00143 void DropRegister(const SCEV *Reg, size_t LUIdx); 00144 void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); 00145 00146 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; 00147 00148 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const; 00149 00150 void clear(); 00151 00152 typedef SmallVectorImpl<const SCEV *>::iterator iterator; 00153 typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator; 00154 iterator begin() { return RegSequence.begin(); } 00155 iterator end() { return RegSequence.end(); } 00156 const_iterator begin() const { return RegSequence.begin(); } 00157 const_iterator end() const { return RegSequence.end(); } 00158 }; 00159 00160 } 00161 00162 void 00163 RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) { 00164 std::pair<RegUsesTy::iterator, bool> Pair = 00165 RegUsesMap.insert(std::make_pair(Reg, RegSortData())); 00166 RegSortData &RSD = Pair.first->second; 00167 if (Pair.second) 00168 RegSequence.push_back(Reg); 00169 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1)); 00170 RSD.UsedByIndices.set(LUIdx); 00171 } 00172 00173 void 00174 RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { 00175 RegUsesTy::iterator It = RegUsesMap.find(Reg); 00176 assert(It != RegUsesMap.end()); 00177 RegSortData &RSD = It->second; 00178 assert(RSD.UsedByIndices.size() > LUIdx); 00179 RSD.UsedByIndices.reset(LUIdx); 00180 } 00181 00182 void 00183 RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { 00184 assert(LUIdx <= LastLUIdx); 00185 00186 // Update RegUses. The data structure is not optimized for this purpose; 00187 // we must iterate through it and update each of the bit vectors. 00188 for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end(); 00189 I != E; ++I) { 00190 SmallBitVector &UsedByIndices = I->second.UsedByIndices; 00191 if (LUIdx < UsedByIndices.size()) 00192 UsedByIndices[LUIdx] = 00193 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0; 00194 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx)); 00195 } 00196 } 00197 00198 bool 00199 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const { 00200 RegUsesTy::const_iterator I = RegUsesMap.find(Reg); 00201 if (I == RegUsesMap.end()) 00202 return false; 00203 const SmallBitVector &UsedByIndices = I->second.UsedByIndices; 00204 int i = UsedByIndices.find_first(); 00205 if (i == -1) return false; 00206 if ((size_t)i != LUIdx) return true; 00207 return UsedByIndices.find_next(i) != -1; 00208 } 00209 00210 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const { 00211 RegUsesTy::const_iterator I = RegUsesMap.find(Reg); 00212 assert(I != RegUsesMap.end() && "Unknown register!"); 00213 return I->second.UsedByIndices; 00214 } 00215 00216 void RegUseTracker::clear() { 00217 RegUsesMap.clear(); 00218 RegSequence.clear(); 00219 } 00220 00221 namespace { 00222 00223 /// Formula - This class holds information that describes a formula for 00224 /// computing satisfying a use. It may include broken-out immediates and scaled 00225 /// registers. 00226 struct Formula { 00227 /// Global base address used for complex addressing. 00228 GlobalValue *BaseGV; 00229 00230 /// Base offset for complex addressing. 00231 int64_t BaseOffset; 00232 00233 /// Whether any complex addressing has a base register. 00234 bool HasBaseReg; 00235 00236 /// The scale of any complex addressing. 00237 int64_t Scale; 00238 00239 /// BaseRegs - The list of "base" registers for this use. When this is 00240 /// non-empty, 00241 SmallVector<const SCEV *, 4> BaseRegs; 00242 00243 /// ScaledReg - The 'scaled' register for this use. This should be non-null 00244 /// when Scale is not zero. 00245 const SCEV *ScaledReg; 00246 00247 /// UnfoldedOffset - An additional constant offset which added near the 00248 /// use. This requires a temporary register, but the offset itself can 00249 /// live in an add immediate field rather than a register. 00250 int64_t UnfoldedOffset; 00251 00252 Formula() 00253 : BaseGV(0), BaseOffset(0), HasBaseReg(false), Scale(0), ScaledReg(0), 00254 UnfoldedOffset(0) {} 00255 00256 void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); 00257 00258 unsigned getNumRegs() const; 00259 Type *getType() const; 00260 00261 void DeleteBaseReg(const SCEV *&S); 00262 00263 bool referencesReg(const SCEV *S) const; 00264 bool hasRegsUsedByUsesOtherThan(size_t LUIdx, 00265 const RegUseTracker &RegUses) const; 00266 00267 void print(raw_ostream &OS) const; 00268 void dump() const; 00269 }; 00270 00271 } 00272 00273 /// DoInitialMatch - Recursion helper for InitialMatch. 00274 static void DoInitialMatch(const SCEV *S, Loop *L, 00275 SmallVectorImpl<const SCEV *> &Good, 00276 SmallVectorImpl<const SCEV *> &Bad, 00277 ScalarEvolution &SE) { 00278 // Collect expressions which properly dominate the loop header. 00279 if (SE.properlyDominates(S, L->getHeader())) { 00280 Good.push_back(S); 00281 return; 00282 } 00283 00284 // Look at add operands. 00285 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 00286 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); 00287 I != E; ++I) 00288 DoInitialMatch(*I, L, Good, Bad, SE); 00289 return; 00290 } 00291 00292 // Look at addrec operands. 00293 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) 00294 if (!AR->getStart()->isZero()) { 00295 DoInitialMatch(AR->getStart(), L, Good, Bad, SE); 00296 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), 00297 AR->getStepRecurrence(SE), 00298 // FIXME: AR->getNoWrapFlags() 00299 AR->getLoop(), SCEV::FlagAnyWrap), 00300 L, Good, Bad, SE); 00301 return; 00302 } 00303 00304 // Handle a multiplication by -1 (negation) if it didn't fold. 00305 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) 00306 if (Mul->getOperand(0)->isAllOnesValue()) { 00307 SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end()); 00308 const SCEV *NewMul = SE.getMulExpr(Ops); 00309 00310 SmallVector<const SCEV *, 4> MyGood; 00311 SmallVector<const SCEV *, 4> MyBad; 00312 DoInitialMatch(NewMul, L, MyGood, MyBad, SE); 00313 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( 00314 SE.getEffectiveSCEVType(NewMul->getType()))); 00315 for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(), 00316 E = MyGood.end(); I != E; ++I) 00317 Good.push_back(SE.getMulExpr(NegOne, *I)); 00318 for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(), 00319 E = MyBad.end(); I != E; ++I) 00320 Bad.push_back(SE.getMulExpr(NegOne, *I)); 00321 return; 00322 } 00323 00324 // Ok, we can't do anything interesting. Just stuff the whole thing into a 00325 // register and hope for the best. 00326 Bad.push_back(S); 00327 } 00328 00329 /// InitialMatch - Incorporate loop-variant parts of S into this Formula, 00330 /// attempting to keep all loop-invariant and loop-computable values in a 00331 /// single base register. 00332 void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { 00333 SmallVector<const SCEV *, 4> Good; 00334 SmallVector<const SCEV *, 4> Bad; 00335 DoInitialMatch(S, L, Good, Bad, SE); 00336 if (!Good.empty()) { 00337 const SCEV *Sum = SE.getAddExpr(Good); 00338 if (!Sum->isZero()) 00339 BaseRegs.push_back(Sum); 00340 HasBaseReg = true; 00341 } 00342 if (!Bad.empty()) { 00343 const SCEV *Sum = SE.getAddExpr(Bad); 00344 if (!Sum->isZero()) 00345 BaseRegs.push_back(Sum); 00346 HasBaseReg = true; 00347 } 00348 } 00349 00350 /// getNumRegs - Return the total number of register operands used by this 00351 /// formula. This does not include register uses implied by non-constant 00352 /// addrec strides. 00353 unsigned Formula::getNumRegs() const { 00354 return !!ScaledReg + BaseRegs.size(); 00355 } 00356 00357 /// getType - Return the type of this formula, if it has one, or null 00358 /// otherwise. This type is meaningless except for the bit size. 00359 Type *Formula::getType() const { 00360 return !BaseRegs.empty() ? BaseRegs.front()->getType() : 00361 ScaledReg ? ScaledReg->getType() : 00362 BaseGV ? BaseGV->getType() : 00363 0; 00364 } 00365 00366 /// DeleteBaseReg - Delete the given base reg from the BaseRegs list. 00367 void Formula::DeleteBaseReg(const SCEV *&S) { 00368 if (&S != &BaseRegs.back()) 00369 std::swap(S, BaseRegs.back()); 00370 BaseRegs.pop_back(); 00371 } 00372 00373 /// referencesReg - Test if this formula references the given register. 00374 bool Formula::referencesReg(const SCEV *S) const { 00375 return S == ScaledReg || 00376 std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); 00377 } 00378 00379 /// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers 00380 /// which are used by uses other than the use with the given index. 00381 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx, 00382 const RegUseTracker &RegUses) const { 00383 if (ScaledReg) 00384 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx)) 00385 return true; 00386 for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), 00387 E = BaseRegs.end(); I != E; ++I) 00388 if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx)) 00389 return true; 00390 return false; 00391 } 00392 00393 void Formula::print(raw_ostream &OS) const { 00394 bool First = true; 00395 if (BaseGV) { 00396 if (!First) OS << " + "; else First = false; 00397 WriteAsOperand(OS, BaseGV, /*PrintType=*/false); 00398 } 00399 if (BaseOffset != 0) { 00400 if (!First) OS << " + "; else First = false; 00401 OS << BaseOffset; 00402 } 00403 for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), 00404 E = BaseRegs.end(); I != E; ++I) { 00405 if (!First) OS << " + "; else First = false; 00406 OS << "reg(" << **I << ')'; 00407 } 00408 if (HasBaseReg && BaseRegs.empty()) { 00409 if (!First) OS << " + "; else First = false; 00410 OS << "**error: HasBaseReg**"; 00411 } else if (!HasBaseReg && !BaseRegs.empty()) { 00412 if (!First) OS << " + "; else First = false; 00413 OS << "**error: !HasBaseReg**"; 00414 } 00415 if (Scale != 0) { 00416 if (!First) OS << " + "; else First = false; 00417 OS << Scale << "*reg("; 00418 if (ScaledReg) 00419 OS << *ScaledReg; 00420 else 00421 OS << "<unknown>"; 00422 OS << ')'; 00423 } 00424 if (UnfoldedOffset != 0) { 00425 if (!First) OS << " + "; else First = false; 00426 OS << "imm(" << UnfoldedOffset << ')'; 00427 } 00428 } 00429 00430 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00431 void Formula::dump() const { 00432 print(errs()); errs() << '\n'; 00433 } 00434 #endif 00435 00436 /// isAddRecSExtable - Return true if the given addrec can be sign-extended 00437 /// without changing its value. 00438 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { 00439 Type *WideTy = 00440 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1); 00441 return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy)); 00442 } 00443 00444 /// isAddSExtable - Return true if the given add can be sign-extended 00445 /// without changing its value. 00446 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) { 00447 Type *WideTy = 00448 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1); 00449 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy)); 00450 } 00451 00452 /// isMulSExtable - Return true if the given mul can be sign-extended 00453 /// without changing its value. 00454 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) { 00455 Type *WideTy = 00456 IntegerType::get(SE.getContext(), 00457 SE.getTypeSizeInBits(M->getType()) * M->getNumOperands()); 00458 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy)); 00459 } 00460 00461 /// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined 00462 /// and if the remainder is known to be zero, or null otherwise. If 00463 /// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified 00464 /// to Y, ignoring that the multiplication may overflow, which is useful when 00465 /// the result will be used in a context where the most significant bits are 00466 /// ignored. 00467 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS, 00468 ScalarEvolution &SE, 00469 bool IgnoreSignificantBits = false) { 00470 // Handle the trivial case, which works for any SCEV type. 00471 if (LHS == RHS) 00472 return SE.getConstant(LHS->getType(), 1); 00473 00474 // Handle a few RHS special cases. 00475 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); 00476 if (RC) { 00477 const APInt &RA = RC->getValue()->getValue(); 00478 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do 00479 // some folding. 00480 if (RA.isAllOnesValue()) 00481 return SE.getMulExpr(LHS, RC); 00482 // Handle x /s 1 as x. 00483 if (RA == 1) 00484 return LHS; 00485 } 00486 00487 // Check for a division of a constant by a constant. 00488 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { 00489 if (!RC) 00490 return 0; 00491 const APInt &LA = C->getValue()->getValue(); 00492 const APInt &RA = RC->getValue()->getValue(); 00493 if (LA.srem(RA) != 0) 00494 return 0; 00495 return SE.getConstant(LA.sdiv(RA)); 00496 } 00497 00498 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow. 00499 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) { 00500 if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) { 00501 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE, 00502 IgnoreSignificantBits); 00503 if (!Step) return 0; 00504 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE, 00505 IgnoreSignificantBits); 00506 if (!Start) return 0; 00507 // FlagNW is independent of the start value, step direction, and is 00508 // preserved with smaller magnitude steps. 00509 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 00510 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap); 00511 } 00512 return 0; 00513 } 00514 00515 // Distribute the sdiv over add operands, if the add doesn't overflow. 00516 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) { 00517 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) { 00518 SmallVector<const SCEV *, 8> Ops; 00519 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); 00520 I != E; ++I) { 00521 const SCEV *Op = getExactSDiv(*I, RHS, SE, 00522 IgnoreSignificantBits); 00523 if (!Op) return 0; 00524 Ops.push_back(Op); 00525 } 00526 return SE.getAddExpr(Ops); 00527 } 00528 return 0; 00529 } 00530 00531 // Check for a multiply operand that we can pull RHS out of. 00532 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) { 00533 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) { 00534 SmallVector<const SCEV *, 4> Ops; 00535 bool Found = false; 00536 for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end(); 00537 I != E; ++I) { 00538 const SCEV *S = *I; 00539 if (!Found) 00540 if (const SCEV *Q = getExactSDiv(S, RHS, SE, 00541 IgnoreSignificantBits)) { 00542 S = Q; 00543 Found = true; 00544 } 00545 Ops.push_back(S); 00546 } 00547 return Found ? SE.getMulExpr(Ops) : 0; 00548 } 00549 return 0; 00550 } 00551 00552 // Otherwise we don't know. 00553 return 0; 00554 } 00555 00556 /// ExtractImmediate - If S involves the addition of a constant integer value, 00557 /// return that integer value, and mutate S to point to a new SCEV with that 00558 /// value excluded. 00559 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { 00560 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { 00561 if (C->getValue()->getValue().getMinSignedBits() <= 64) { 00562 S = SE.getConstant(C->getType(), 0); 00563 return C->getValue()->getSExtValue(); 00564 } 00565 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 00566 SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); 00567 int64_t Result = ExtractImmediate(NewOps.front(), SE); 00568 if (Result != 0) 00569 S = SE.getAddExpr(NewOps); 00570 return Result; 00571 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 00572 SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); 00573 int64_t Result = ExtractImmediate(NewOps.front(), SE); 00574 if (Result != 0) 00575 S = SE.getAddRecExpr(NewOps, AR->getLoop(), 00576 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 00577 SCEV::FlagAnyWrap); 00578 return Result; 00579 } 00580 return 0; 00581 } 00582 00583 /// ExtractSymbol - If S involves the addition of a GlobalValue address, 00584 /// return that symbol, and mutate S to point to a new SCEV with that 00585 /// value excluded. 00586 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { 00587 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { 00588 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { 00589 S = SE.getConstant(GV->getType(), 0); 00590 return GV; 00591 } 00592 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 00593 SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); 00594 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); 00595 if (Result) 00596 S = SE.getAddExpr(NewOps); 00597 return Result; 00598 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 00599 SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); 00600 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); 00601 if (Result) 00602 S = SE.getAddRecExpr(NewOps, AR->getLoop(), 00603 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 00604 SCEV::FlagAnyWrap); 00605 return Result; 00606 } 00607 return 0; 00608 } 00609 00610 /// isAddressUse - Returns true if the specified instruction is using the 00611 /// specified value as an address. 00612 static bool isAddressUse(Instruction *Inst, Value *OperandVal) { 00613 bool isAddress = isa<LoadInst>(Inst); 00614 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 00615 if (SI->getOperand(1) == OperandVal) 00616 isAddress = true; 00617 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { 00618 // Addressing modes can also be folded into prefetches and a variety 00619 // of intrinsics. 00620 switch (II->getIntrinsicID()) { 00621 default: break; 00622 case Intrinsic::prefetch: 00623 case Intrinsic::x86_sse_storeu_ps: 00624 case Intrinsic::x86_sse2_storeu_pd: 00625 case Intrinsic::x86_sse2_storeu_dq: 00626 case Intrinsic::x86_sse2_storel_dq: 00627 if (II->getArgOperand(0) == OperandVal) 00628 isAddress = true; 00629 break; 00630 } 00631 } 00632 return isAddress; 00633 } 00634 00635 /// getAccessType - Return the type of the memory being accessed. 00636 static Type *getAccessType(const Instruction *Inst) { 00637 Type *AccessTy = Inst->getType(); 00638 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) 00639 AccessTy = SI->getOperand(0)->getType(); 00640 else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { 00641 // Addressing modes can also be folded into prefetches and a variety 00642 // of intrinsics. 00643 switch (II->getIntrinsicID()) { 00644 default: break; 00645 case Intrinsic::x86_sse_storeu_ps: 00646 case Intrinsic::x86_sse2_storeu_pd: 00647 case Intrinsic::x86_sse2_storeu_dq: 00648 case Intrinsic::x86_sse2_storel_dq: 00649 AccessTy = II->getArgOperand(0)->getType(); 00650 break; 00651 } 00652 } 00653 00654 // All pointers have the same requirements, so canonicalize them to an 00655 // arbitrary pointer type to minimize variation. 00656 if (PointerType *PTy = dyn_cast<PointerType>(AccessTy)) 00657 AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1), 00658 PTy->getAddressSpace()); 00659 00660 return AccessTy; 00661 } 00662 00663 /// isExistingPhi - Return true if this AddRec is already a phi in its loop. 00664 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) { 00665 for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin(); 00666 PHINode *PN = dyn_cast<PHINode>(I); ++I) { 00667 if (SE.isSCEVable(PN->getType()) && 00668 (SE.getEffectiveSCEVType(PN->getType()) == 00669 SE.getEffectiveSCEVType(AR->getType())) && 00670 SE.getSCEV(PN) == AR) 00671 return true; 00672 } 00673 return false; 00674 } 00675 00676 /// Check if expanding this expression is likely to incur significant cost. This 00677 /// is tricky because SCEV doesn't track which expressions are actually computed 00678 /// by the current IR. 00679 /// 00680 /// We currently allow expansion of IV increments that involve adds, 00681 /// multiplication by constants, and AddRecs from existing phis. 00682 /// 00683 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an 00684 /// obvious multiple of the UDivExpr. 00685 static bool isHighCostExpansion(const SCEV *S, 00686 SmallPtrSet<const SCEV*, 8> &Processed, 00687 ScalarEvolution &SE) { 00688 // Zero/One operand expressions 00689 switch (S->getSCEVType()) { 00690 case scUnknown: 00691 case scConstant: 00692 return false; 00693 case scTruncate: 00694 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(), 00695 Processed, SE); 00696 case scZeroExtend: 00697 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(), 00698 Processed, SE); 00699 case scSignExtend: 00700 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(), 00701 Processed, SE); 00702 } 00703 00704 if (!Processed.insert(S)) 00705 return false; 00706 00707 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 00708 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); 00709 I != E; ++I) { 00710 if (isHighCostExpansion(*I, Processed, SE)) 00711 return true; 00712 } 00713 return false; 00714 } 00715 00716 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { 00717 if (Mul->getNumOperands() == 2) { 00718 // Multiplication by a constant is ok 00719 if (isa<SCEVConstant>(Mul->getOperand(0))) 00720 return isHighCostExpansion(Mul->getOperand(1), Processed, SE); 00721 00722 // If we have the value of one operand, check if an existing 00723 // multiplication already generates this expression. 00724 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) { 00725 Value *UVal = U->getValue(); 00726 for (Value::use_iterator UI = UVal->use_begin(), UE = UVal->use_end(); 00727 UI != UE; ++UI) { 00728 // If U is a constant, it may be used by a ConstantExpr. 00729 Instruction *User = dyn_cast<Instruction>(*UI); 00730 if (User && User->getOpcode() == Instruction::Mul 00731 && SE.isSCEVable(User->getType())) { 00732 return SE.getSCEV(User) == Mul; 00733 } 00734 } 00735 } 00736 } 00737 } 00738 00739 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 00740 if (isExistingPhi(AR, SE)) 00741 return false; 00742 } 00743 00744 // Fow now, consider any other type of expression (div/mul/min/max) high cost. 00745 return true; 00746 } 00747 00748 /// DeleteTriviallyDeadInstructions - If any of the instructions is the 00749 /// specified set are trivially dead, delete them and see if this makes any of 00750 /// their operands subsequently dead. 00751 static bool 00752 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { 00753 bool Changed = false; 00754 00755 while (!DeadInsts.empty()) { 00756 Value *V = DeadInsts.pop_back_val(); 00757 Instruction *I = dyn_cast_or_null<Instruction>(V); 00758 00759 if (I == 0 || !isInstructionTriviallyDead(I)) 00760 continue; 00761 00762 for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) 00763 if (Instruction *U = dyn_cast<Instruction>(*OI)) { 00764 *OI = 0; 00765 if (U->use_empty()) 00766 DeadInsts.push_back(U); 00767 } 00768 00769 I->eraseFromParent(); 00770 Changed = true; 00771 } 00772 00773 return Changed; 00774 } 00775 00776 namespace { 00777 00778 /// Cost - This class is used to measure and compare candidate formulae. 00779 class Cost { 00780 /// TODO: Some of these could be merged. Also, a lexical ordering 00781 /// isn't always optimal. 00782 unsigned NumRegs; 00783 unsigned AddRecCost; 00784 unsigned NumIVMuls; 00785 unsigned NumBaseAdds; 00786 unsigned ImmCost; 00787 unsigned SetupCost; 00788 00789 public: 00790 Cost() 00791 : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), 00792 SetupCost(0) {} 00793 00794 bool operator<(const Cost &Other) const; 00795 00796 void Loose(); 00797 00798 #ifndef NDEBUG 00799 // Once any of the metrics loses, they must all remain losers. 00800 bool isValid() { 00801 return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds 00802 | ImmCost | SetupCost) != ~0u) 00803 || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds 00804 & ImmCost & SetupCost) == ~0u); 00805 } 00806 #endif 00807 00808 bool isLoser() { 00809 assert(isValid() && "invalid cost"); 00810 return NumRegs == ~0u; 00811 } 00812 00813 void RateFormula(const Formula &F, 00814 SmallPtrSet<const SCEV *, 16> &Regs, 00815 const DenseSet<const SCEV *> &VisitedRegs, 00816 const Loop *L, 00817 const SmallVectorImpl<int64_t> &Offsets, 00818 ScalarEvolution &SE, DominatorTree &DT, 00819 SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); 00820 00821 void print(raw_ostream &OS) const; 00822 void dump() const; 00823 00824 private: 00825 void RateRegister(const SCEV *Reg, 00826 SmallPtrSet<const SCEV *, 16> &Regs, 00827 const Loop *L, 00828 ScalarEvolution &SE, DominatorTree &DT); 00829 void RatePrimaryRegister(const SCEV *Reg, 00830 SmallPtrSet<const SCEV *, 16> &Regs, 00831 const Loop *L, 00832 ScalarEvolution &SE, DominatorTree &DT, 00833 SmallPtrSet<const SCEV *, 16> *LoserRegs); 00834 }; 00835 00836 } 00837 00838 /// RateRegister - Tally up interesting quantities from the given register. 00839 void Cost::RateRegister(const SCEV *Reg, 00840 SmallPtrSet<const SCEV *, 16> &Regs, 00841 const Loop *L, 00842 ScalarEvolution &SE, DominatorTree &DT) { 00843 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { 00844 // If this is an addrec for another loop, don't second-guess its addrec phi 00845 // nodes. LSR isn't currently smart enough to reason about more than one 00846 // loop at a time. LSR has already run on inner loops, will not run on outer 00847 // loops, and cannot be expected to change sibling loops. 00848 if (AR->getLoop() != L) { 00849 // If the AddRec exists, consider it's register free and leave it alone. 00850 if (isExistingPhi(AR, SE)) 00851 return; 00852 00853 // Otherwise, do not consider this formula at all. 00854 Loose(); 00855 return; 00856 } 00857 AddRecCost += 1; /// TODO: This should be a function of the stride. 00858 00859 // Add the step value register, if it needs one. 00860 // TODO: The non-affine case isn't precisely modeled here. 00861 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) { 00862 if (!Regs.count(AR->getOperand(1))) { 00863 RateRegister(AR->getOperand(1), Regs, L, SE, DT); 00864 if (isLoser()) 00865 return; 00866 } 00867 } 00868 } 00869 ++NumRegs; 00870 00871 // Rough heuristic; favor registers which don't require extra setup 00872 // instructions in the preheader. 00873 if (!isa<SCEVUnknown>(Reg) && 00874 !isa<SCEVConstant>(Reg) && 00875 !(isa<SCEVAddRecExpr>(Reg) && 00876 (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) || 00877 isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) 00878 ++SetupCost; 00879 00880 NumIVMuls += isa<SCEVMulExpr>(Reg) && 00881 SE.hasComputableLoopEvolution(Reg, L); 00882 } 00883 00884 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it 00885 /// before, rate it. Optional LoserRegs provides a way to declare any formula 00886 /// that refers to one of those regs an instant loser. 00887 void Cost::RatePrimaryRegister(const SCEV *Reg, 00888 SmallPtrSet<const SCEV *, 16> &Regs, 00889 const Loop *L, 00890 ScalarEvolution &SE, DominatorTree &DT, 00891 SmallPtrSet<const SCEV *, 16> *LoserRegs) { 00892 if (LoserRegs && LoserRegs->count(Reg)) { 00893 Loose(); 00894 return; 00895 } 00896 if (Regs.insert(Reg)) { 00897 RateRegister(Reg, Regs, L, SE, DT); 00898 if (LoserRegs && isLoser()) 00899 LoserRegs->insert(Reg); 00900 } 00901 } 00902 00903 void Cost::RateFormula(const Formula &F, 00904 SmallPtrSet<const SCEV *, 16> &Regs, 00905 const DenseSet<const SCEV *> &VisitedRegs, 00906 const Loop *L, 00907 const SmallVectorImpl<int64_t> &Offsets, 00908 ScalarEvolution &SE, DominatorTree &DT, 00909 SmallPtrSet<const SCEV *, 16> *LoserRegs) { 00910 // Tally up the registers. 00911 if (const SCEV *ScaledReg = F.ScaledReg) { 00912 if (VisitedRegs.count(ScaledReg)) { 00913 Loose(); 00914 return; 00915 } 00916 RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs); 00917 if (isLoser()) 00918 return; 00919 } 00920 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 00921 E = F.BaseRegs.end(); I != E; ++I) { 00922 const SCEV *BaseReg = *I; 00923 if (VisitedRegs.count(BaseReg)) { 00924 Loose(); 00925 return; 00926 } 00927 RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs); 00928 if (isLoser()) 00929 return; 00930 } 00931 00932 // Determine how many (unfolded) adds we'll need inside the loop. 00933 size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); 00934 if (NumBaseParts > 1) 00935 NumBaseAdds += NumBaseParts - 1; 00936 00937 // Tally up the non-zero immediates. 00938 for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), 00939 E = Offsets.end(); I != E; ++I) { 00940 int64_t Offset = (uint64_t)*I + F.BaseOffset; 00941 if (F.BaseGV) 00942 ImmCost += 64; // Handle symbolic values conservatively. 00943 // TODO: This should probably be the pointer size. 00944 else if (Offset != 0) 00945 ImmCost += APInt(64, Offset, true).getMinSignedBits(); 00946 } 00947 assert(isValid() && "invalid cost"); 00948 } 00949 00950 /// Loose - Set this cost to a losing value. 00951 void Cost::Loose() { 00952 NumRegs = ~0u; 00953 AddRecCost = ~0u; 00954 NumIVMuls = ~0u; 00955 NumBaseAdds = ~0u; 00956 ImmCost = ~0u; 00957 SetupCost = ~0u; 00958 } 00959 00960 /// operator< - Choose the lower cost. 00961 bool Cost::operator<(const Cost &Other) const { 00962 if (NumRegs != Other.NumRegs) 00963 return NumRegs < Other.NumRegs; 00964 if (AddRecCost != Other.AddRecCost) 00965 return AddRecCost < Other.AddRecCost; 00966 if (NumIVMuls != Other.NumIVMuls) 00967 return NumIVMuls < Other.NumIVMuls; 00968 if (NumBaseAdds != Other.NumBaseAdds) 00969 return NumBaseAdds < Other.NumBaseAdds; 00970 if (ImmCost != Other.ImmCost) 00971 return ImmCost < Other.ImmCost; 00972 if (SetupCost != Other.SetupCost) 00973 return SetupCost < Other.SetupCost; 00974 return false; 00975 } 00976 00977 void Cost::print(raw_ostream &OS) const { 00978 OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s"); 00979 if (AddRecCost != 0) 00980 OS << ", with addrec cost " << AddRecCost; 00981 if (NumIVMuls != 0) 00982 OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s"); 00983 if (NumBaseAdds != 0) 00984 OS << ", plus " << NumBaseAdds << " base add" 00985 << (NumBaseAdds == 1 ? "" : "s"); 00986 if (ImmCost != 0) 00987 OS << ", plus " << ImmCost << " imm cost"; 00988 if (SetupCost != 0) 00989 OS << ", plus " << SetupCost << " setup cost"; 00990 } 00991 00992 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00993 void Cost::dump() const { 00994 print(errs()); errs() << '\n'; 00995 } 00996 #endif 00997 00998 namespace { 00999 01000 /// LSRFixup - An operand value in an instruction which is to be replaced 01001 /// with some equivalent, possibly strength-reduced, replacement. 01002 struct LSRFixup { 01003 /// UserInst - The instruction which will be updated. 01004 Instruction *UserInst; 01005 01006 /// OperandValToReplace - The operand of the instruction which will 01007 /// be replaced. The operand may be used more than once; every instance 01008 /// will be replaced. 01009 Value *OperandValToReplace; 01010 01011 /// PostIncLoops - If this user is to use the post-incremented value of an 01012 /// induction variable, this variable is non-null and holds the loop 01013 /// associated with the induction variable. 01014 PostIncLoopSet PostIncLoops; 01015 01016 /// LUIdx - The index of the LSRUse describing the expression which 01017 /// this fixup needs, minus an offset (below). 01018 size_t LUIdx; 01019 01020 /// Offset - A constant offset to be added to the LSRUse expression. 01021 /// This allows multiple fixups to share the same LSRUse with different 01022 /// offsets, for example in an unrolled loop. 01023 int64_t Offset; 01024 01025 bool isUseFullyOutsideLoop(const Loop *L) const; 01026 01027 LSRFixup(); 01028 01029 void print(raw_ostream &OS) const; 01030 void dump() const; 01031 }; 01032 01033 } 01034 01035 LSRFixup::LSRFixup() 01036 : UserInst(0), OperandValToReplace(0), LUIdx(~size_t(0)), Offset(0) {} 01037 01038 /// isUseFullyOutsideLoop - Test whether this fixup always uses its 01039 /// value outside of the given loop. 01040 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { 01041 // PHI nodes use their value in their incoming blocks. 01042 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { 01043 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 01044 if (PN->getIncomingValue(i) == OperandValToReplace && 01045 L->contains(PN->getIncomingBlock(i))) 01046 return false; 01047 return true; 01048 } 01049 01050 return !L->contains(UserInst); 01051 } 01052 01053 void LSRFixup::print(raw_ostream &OS) const { 01054 OS << "UserInst="; 01055 // Store is common and interesting enough to be worth special-casing. 01056 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) { 01057 OS << "store "; 01058 WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false); 01059 } else if (UserInst->getType()->isVoidTy()) 01060 OS << UserInst->getOpcodeName(); 01061 else 01062 WriteAsOperand(OS, UserInst, /*PrintType=*/false); 01063 01064 OS << ", OperandValToReplace="; 01065 WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); 01066 01067 for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(), 01068 E = PostIncLoops.end(); I != E; ++I) { 01069 OS << ", PostIncLoop="; 01070 WriteAsOperand(OS, (*I)->getHeader(), /*PrintType=*/false); 01071 } 01072 01073 if (LUIdx != ~size_t(0)) 01074 OS << ", LUIdx=" << LUIdx; 01075 01076 if (Offset != 0) 01077 OS << ", Offset=" << Offset; 01078 } 01079 01080 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 01081 void LSRFixup::dump() const { 01082 print(errs()); errs() << '\n'; 01083 } 01084 #endif 01085 01086 namespace { 01087 01088 /// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding 01089 /// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*. 01090 struct UniquifierDenseMapInfo { 01091 static SmallVector<const SCEV *, 4> getEmptyKey() { 01092 SmallVector<const SCEV *, 4> V; 01093 V.push_back(reinterpret_cast<const SCEV *>(-1)); 01094 return V; 01095 } 01096 01097 static SmallVector<const SCEV *, 4> getTombstoneKey() { 01098 SmallVector<const SCEV *, 4> V; 01099 V.push_back(reinterpret_cast<const SCEV *>(-2)); 01100 return V; 01101 } 01102 01103 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) { 01104 unsigned Result = 0; 01105 for (SmallVectorImpl<const SCEV *>::const_iterator I = V.begin(), 01106 E = V.end(); I != E; ++I) 01107 Result ^= DenseMapInfo<const SCEV *>::getHashValue(*I); 01108 return Result; 01109 } 01110 01111 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS, 01112 const SmallVector<const SCEV *, 4> &RHS) { 01113 return LHS == RHS; 01114 } 01115 }; 01116 01117 /// LSRUse - This class holds the state that LSR keeps for each use in 01118 /// IVUsers, as well as uses invented by LSR itself. It includes information 01119 /// about what kinds of things can be folded into the user, information about 01120 /// the user itself, and information about how the use may be satisfied. 01121 /// TODO: Represent multiple users of the same expression in common? 01122 class LSRUse { 01123 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier; 01124 01125 public: 01126 /// KindType - An enum for a kind of use, indicating what types of 01127 /// scaled and immediate operands it might support. 01128 enum KindType { 01129 Basic, ///< A normal use, with no folding. 01130 Special, ///< A special case of basic, allowing -1 scales. 01131 Address, ///< An address use; folding according to TargetLowering 01132 ICmpZero ///< An equality icmp with both operands folded into one. 01133 // TODO: Add a generic icmp too? 01134 }; 01135 01136 KindType Kind; 01137 Type *AccessTy; 01138 01139 SmallVector<int64_t, 8> Offsets; 01140 int64_t MinOffset; 01141 int64_t MaxOffset; 01142 01143 /// AllFixupsOutsideLoop - This records whether all of the fixups using this 01144 /// LSRUse are outside of the loop, in which case some special-case heuristics 01145 /// may be used. 01146 bool AllFixupsOutsideLoop; 01147 01148 /// WidestFixupType - This records the widest use type for any fixup using 01149 /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different 01150 /// max fixup widths to be equivalent, because the narrower one may be relying 01151 /// on the implicit truncation to truncate away bogus bits. 01152 Type *WidestFixupType; 01153 01154 /// Formulae - A list of ways to build a value that can satisfy this user. 01155 /// After the list is populated, one of these is selected heuristically and 01156 /// used to formulate a replacement for OperandValToReplace in UserInst. 01157 SmallVector<Formula, 12> Formulae; 01158 01159 /// Regs - The set of register candidates used by all formulae in this LSRUse. 01160 SmallPtrSet<const SCEV *, 4> Regs; 01161 01162 LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T), 01163 MinOffset(INT64_MAX), 01164 MaxOffset(INT64_MIN), 01165 AllFixupsOutsideLoop(true), 01166 WidestFixupType(0) {} 01167 01168 bool HasFormulaWithSameRegs(const Formula &F) const; 01169 bool InsertFormula(const Formula &F); 01170 void DeleteFormula(Formula &F); 01171 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); 01172 01173 void print(raw_ostream &OS) const; 01174 void dump() const; 01175 }; 01176 01177 } 01178 01179 /// HasFormula - Test whether this use as a formula which has the same 01180 /// registers as the given formula. 01181 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { 01182 SmallVector<const SCEV *, 4> Key = F.BaseRegs; 01183 if (F.ScaledReg) Key.push_back(F.ScaledReg); 01184 // Unstable sort by host order ok, because this is only used for uniquifying. 01185 std::sort(Key.begin(), Key.end()); 01186 return Uniquifier.count(Key); 01187 } 01188 01189 /// InsertFormula - If the given formula has not yet been inserted, add it to 01190 /// the list, and return true. Return false otherwise. 01191 bool LSRUse::InsertFormula(const Formula &F) { 01192 SmallVector<const SCEV *, 4> Key = F.BaseRegs; 01193 if (F.ScaledReg) Key.push_back(F.ScaledReg); 01194 // Unstable sort by host order ok, because this is only used for uniquifying. 01195 std::sort(Key.begin(), Key.end()); 01196 01197 if (!Uniquifier.insert(Key).second) 01198 return false; 01199 01200 // Using a register to hold the value of 0 is not profitable. 01201 assert((!F.ScaledReg || !F.ScaledReg->isZero()) && 01202 "Zero allocated in a scaled register!"); 01203 #ifndef NDEBUG 01204 for (SmallVectorImpl<const SCEV *>::const_iterator I = 01205 F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) 01206 assert(!(*I)->isZero() && "Zero allocated in a base register!"); 01207 #endif 01208 01209 // Add the formula to the list. 01210 Formulae.push_back(F); 01211 01212 // Record registers now being used by this use. 01213 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); 01214 01215 return true; 01216 } 01217 01218 /// DeleteFormula - Remove the given formula from this use's list. 01219 void LSRUse::DeleteFormula(Formula &F) { 01220 if (&F != &Formulae.back()) 01221 std::swap(F, Formulae.back()); 01222 Formulae.pop_back(); 01223 } 01224 01225 /// RecomputeRegs - Recompute the Regs field, and update RegUses. 01226 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) { 01227 // Now that we've filtered out some formulae, recompute the Regs set. 01228 SmallPtrSet<const SCEV *, 4> OldRegs = Regs; 01229 Regs.clear(); 01230 for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(), 01231 E = Formulae.end(); I != E; ++I) { 01232 const Formula &F = *I; 01233 if (F.ScaledReg) Regs.insert(F.ScaledReg); 01234 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); 01235 } 01236 01237 // Update the RegTracker. 01238 for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(), 01239 E = OldRegs.end(); I != E; ++I) 01240 if (!Regs.count(*I)) 01241 RegUses.DropRegister(*I, LUIdx); 01242 } 01243 01244 void LSRUse::print(raw_ostream &OS) const { 01245 OS << "LSR Use: Kind="; 01246 switch (Kind) { 01247 case Basic: OS << "Basic"; break; 01248 case Special: OS << "Special"; break; 01249 case ICmpZero: OS << "ICmpZero"; break; 01250 case Address: 01251 OS << "Address of "; 01252 if (AccessTy->isPointerTy()) 01253 OS << "pointer"; // the full pointer type could be really verbose 01254 else 01255 OS << *AccessTy; 01256 } 01257 01258 OS << ", Offsets={"; 01259 for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), 01260 E = Offsets.end(); I != E; ++I) { 01261 OS << *I; 01262 if (llvm::next(I) != E) 01263 OS << ','; 01264 } 01265 OS << '}'; 01266 01267 if (AllFixupsOutsideLoop) 01268 OS << ", all-fixups-outside-loop"; 01269 01270 if (WidestFixupType) 01271 OS << ", widest fixup type: " << *WidestFixupType; 01272 } 01273 01274 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 01275 void LSRUse::dump() const { 01276 print(errs()); errs() << '\n'; 01277 } 01278 #endif 01279 01280 /// isLegalUse - Test whether the use described by AM is "legal", meaning it can 01281 /// be completely folded into the user instruction at isel time. This includes 01282 /// address-mode folding and special icmp tricks. 01283 static bool isLegalUse(const TargetTransformInfo &TTI, LSRUse::KindType Kind, 01284 Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, 01285 bool HasBaseReg, int64_t Scale) { 01286 switch (Kind) { 01287 case LSRUse::Address: 01288 return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); 01289 01290 // Otherwise, just guess that reg+reg addressing is legal. 01291 //return ; 01292 01293 case LSRUse::ICmpZero: 01294 // There's not even a target hook for querying whether it would be legal to 01295 // fold a GV into an ICmp. 01296 if (BaseGV) 01297 return false; 01298 01299 // ICmp only has two operands; don't allow more than two non-trivial parts. 01300 if (Scale != 0 && HasBaseReg && BaseOffset != 0) 01301 return false; 01302 01303 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by 01304 // putting the scaled register in the other operand of the icmp. 01305 if (Scale != 0 && Scale != -1) 01306 return false; 01307 01308 // If we have low-level target information, ask the target if it can fold an 01309 // integer immediate on an icmp. 01310 if (BaseOffset != 0) { 01311 // We have one of: 01312 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset 01313 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset 01314 // Offs is the ICmp immediate. 01315 if (Scale == 0) 01316 // The cast does the right thing with INT64_MIN. 01317 BaseOffset = -(uint64_t)BaseOffset; 01318 return TTI.isLegalICmpImmediate(BaseOffset); 01319 } 01320 01321 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg 01322 return true; 01323 01324 case LSRUse::Basic: 01325 // Only handle single-register values. 01326 return !BaseGV && Scale == 0 && BaseOffset == 0; 01327 01328 case LSRUse::Special: 01329 // Special case Basic to handle -1 scales. 01330 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0; 01331 } 01332 01333 llvm_unreachable("Invalid LSRUse Kind!"); 01334 } 01335 01336 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, 01337 int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, 01338 GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, 01339 int64_t Scale) { 01340 // Check for overflow. 01341 if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != 01342 (MinOffset > 0)) 01343 return false; 01344 MinOffset = (uint64_t)BaseOffset + MinOffset; 01345 if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != 01346 (MaxOffset > 0)) 01347 return false; 01348 MaxOffset = (uint64_t)BaseOffset + MaxOffset; 01349 01350 return isLegalUse(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, 01351 Scale) && 01352 isLegalUse(TTI, Kind, AccessTy, BaseGV, MaxOffset, HasBaseReg, Scale); 01353 } 01354 01355 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, 01356 int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy, 01357 const Formula &F) { 01358 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, 01359 F.BaseOffset, F.HasBaseReg, F.Scale); 01360 } 01361 01362 static bool isAlwaysFoldable(const TargetTransformInfo &TTI, 01363 LSRUse::KindType Kind, Type *AccessTy, 01364 GlobalValue *BaseGV, int64_t BaseOffset, 01365 bool HasBaseReg) { 01366 // Fast-path: zero is always foldable. 01367 if (BaseOffset == 0 && !BaseGV) return true; 01368 01369 // Conservatively, create an address with an immediate and a 01370 // base and a scale. 01371 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; 01372 01373 // Canonicalize a scale of 1 to a base register if the formula doesn't 01374 // already have a base register. 01375 if (!HasBaseReg && Scale == 1) { 01376 Scale = 0; 01377 HasBaseReg = true; 01378 } 01379 01380 return isLegalUse(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); 01381 } 01382 01383 static bool isAlwaysFoldable(const TargetTransformInfo &TTI, 01384 ScalarEvolution &SE, int64_t MinOffset, 01385 int64_t MaxOffset, LSRUse::KindType Kind, 01386 Type *AccessTy, const SCEV *S, bool HasBaseReg) { 01387 // Fast-path: zero is always foldable. 01388 if (S->isZero()) return true; 01389 01390 // Conservatively, create an address with an immediate and a 01391 // base and a scale. 01392 int64_t BaseOffset = ExtractImmediate(S, SE); 01393 GlobalValue *BaseGV = ExtractSymbol(S, SE); 01394 01395 // If there's anything else involved, it's not foldable. 01396 if (!S->isZero()) return false; 01397 01398 // Fast-path: zero is always foldable. 01399 if (BaseOffset == 0 && !BaseGV) return true; 01400 01401 // Conservatively, create an address with an immediate and a 01402 // base and a scale. 01403 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; 01404 01405 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, 01406 BaseOffset, HasBaseReg, Scale); 01407 } 01408 01409 namespace { 01410 01411 /// UseMapDenseMapInfo - A DenseMapInfo implementation for holding 01412 /// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind. 01413 struct UseMapDenseMapInfo { 01414 static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() { 01415 return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic); 01416 } 01417 01418 static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() { 01419 return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic); 01420 } 01421 01422 static unsigned 01423 getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) { 01424 unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first); 01425 Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second)); 01426 return Result; 01427 } 01428 01429 static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS, 01430 const std::pair<const SCEV *, LSRUse::KindType> &RHS) { 01431 return LHS == RHS; 01432 } 01433 }; 01434 01435 /// IVInc - An individual increment in a Chain of IV increments. 01436 /// Relate an IV user to an expression that computes the IV it uses from the IV 01437 /// used by the previous link in the Chain. 01438 /// 01439 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the 01440 /// original IVOperand. The head of the chain's IVOperand is only valid during 01441 /// chain collection, before LSR replaces IV users. During chain generation, 01442 /// IncExpr can be used to find the new IVOperand that computes the same 01443 /// expression. 01444 struct IVInc { 01445 Instruction *UserInst; 01446 Value* IVOperand; 01447 const SCEV *IncExpr; 01448 01449 IVInc(Instruction *U, Value *O, const SCEV *E): 01450 UserInst(U), IVOperand(O), IncExpr(E) {} 01451 }; 01452 01453 // IVChain - The list of IV increments in program order. 01454 // We typically add the head of a chain without finding subsequent links. 01455 struct IVChain { 01456 SmallVector<IVInc,1> Incs; 01457 const SCEV *ExprBase; 01458 01459 IVChain() : ExprBase(0) {} 01460 01461 IVChain(const IVInc &Head, const SCEV *Base) 01462 : Incs(1, Head), ExprBase(Base) {} 01463 01464 typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; 01465 01466 // begin - return the first increment in the chain. 01467 const_iterator begin() const { 01468 assert(!Incs.empty()); 01469 return llvm::next(Incs.begin()); 01470 } 01471 const_iterator end() const { 01472 return Incs.end(); 01473 } 01474 01475 // hasIncs - Returns true if this chain contains any increments. 01476 bool hasIncs() const { return Incs.size() >= 2; } 01477 01478 // add - Add an IVInc to the end of this chain. 01479 void add(const IVInc &X) { Incs.push_back(X); } 01480 01481 // tailUserInst - Returns the last UserInst in the chain. 01482 Instruction *tailUserInst() const { return Incs.back().UserInst; } 01483 01484 // isProfitableIncrement - Returns true if IncExpr can be profitably added to 01485 // this chain. 01486 bool isProfitableIncrement(const SCEV *OperExpr, 01487 const SCEV *IncExpr, 01488 ScalarEvolution&); 01489 }; 01490 01491 /// ChainUsers - Helper for CollectChains to track multiple IV increment uses. 01492 /// Distinguish between FarUsers that definitely cross IV increments and 01493 /// NearUsers that may be used between IV increments. 01494 struct ChainUsers { 01495 SmallPtrSet<Instruction*, 4> FarUsers; 01496 SmallPtrSet<Instruction*, 4> NearUsers; 01497 }; 01498 01499 /// LSRInstance - This class holds state for the main loop strength reduction 01500 /// logic. 01501 class LSRInstance { 01502 IVUsers &IU; 01503 ScalarEvolution &SE; 01504 DominatorTree &DT; 01505 LoopInfo &LI; 01506 const TargetTransformInfo &TTI; 01507 Loop *const L; 01508 bool Changed; 01509 01510 /// IVIncInsertPos - This is the insert position that the current loop's 01511 /// induction variable increment should be placed. In simple loops, this is 01512 /// the latch block's terminator. But in more complicated cases, this is a 01513 /// position which will dominate all the in-loop post-increment users. 01514 Instruction *IVIncInsertPos; 01515 01516 /// Factors - Interesting factors between use strides. 01517 SmallSetVector<int64_t, 8> Factors; 01518 01519 /// Types - Interesting use types, to facilitate truncation reuse. 01520 SmallSetVector<Type *, 4> Types; 01521 01522 /// Fixups - The list of operands which are to be replaced. 01523 SmallVector<LSRFixup, 16> Fixups; 01524 01525 /// Uses - The list of interesting uses. 01526 SmallVector<LSRUse, 16> Uses; 01527 01528 /// RegUses - Track which uses use which register candidates. 01529 RegUseTracker RegUses; 01530 01531 // Limit the number of chains to avoid quadratic behavior. We don't expect to 01532 // have more than a few IV increment chains in a loop. Missing a Chain falls 01533 // back to normal LSR behavior for those uses. 01534 static const unsigned MaxChains = 8; 01535 01536 /// IVChainVec - IV users can form a chain of IV increments. 01537 SmallVector<IVChain, MaxChains> IVChainVec; 01538 01539 /// IVIncSet - IV users that belong to profitable IVChains. 01540 SmallPtrSet<Use*, MaxChains> IVIncSet; 01541 01542 void OptimizeShadowIV(); 01543 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); 01544 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); 01545 void OptimizeLoopTermCond(); 01546 01547 void ChainInstruction(Instruction *UserInst, Instruction *IVOper, 01548 SmallVectorImpl<ChainUsers> &ChainUsersVec); 01549 void FinalizeChain(IVChain &Chain); 01550 void CollectChains(); 01551 void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 01552 SmallVectorImpl<WeakVH> &DeadInsts); 01553 01554 void CollectInterestingTypesAndFactors(); 01555 void CollectFixupsAndInitialFormulae(); 01556 01557 LSRFixup &getNewFixup() { 01558 Fixups.push_back(LSRFixup()); 01559 return Fixups.back(); 01560 } 01561 01562 // Support for sharing of LSRUses between LSRFixups. 01563 typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>, 01564 size_t, 01565 UseMapDenseMapInfo> UseMapTy; 01566 UseMapTy UseMap; 01567 01568 bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 01569 LSRUse::KindType Kind, Type *AccessTy); 01570 01571 std::pair<size_t, int64_t> getUse(const SCEV *&Expr, 01572 LSRUse::KindType Kind, 01573 Type *AccessTy); 01574 01575 void DeleteUse(LSRUse &LU, size_t LUIdx); 01576 01577 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); 01578 01579 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 01580 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 01581 void CountRegisters(const Formula &F, size_t LUIdx); 01582 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); 01583 01584 void CollectLoopInvariantFixupsAndFormulae(); 01585 01586 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, 01587 unsigned Depth = 0); 01588 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); 01589 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 01590 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 01591 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); 01592 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); 01593 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base); 01594 void GenerateCrossUseConstantOffsets(); 01595 void GenerateAllReuseFormulae(); 01596 01597 void FilterOutUndesirableDedicatedRegisters(); 01598 01599 size_t EstimateSearchSpaceComplexity() const; 01600 void NarrowSearchSpaceByDetectingSupersets(); 01601 void NarrowSearchSpaceByCollapsingUnrolledCode(); 01602 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 01603 void NarrowSearchSpaceByPickingWinnerRegs(); 01604 void NarrowSearchSpaceUsingHeuristics(); 01605 01606 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 01607 Cost &SolutionCost, 01608 SmallVectorImpl<const Formula *> &Workspace, 01609 const Cost &CurCost, 01610 const SmallPtrSet<const SCEV *, 16> &CurRegs, 01611 DenseSet<const SCEV *> &VisitedRegs) const; 01612 void Solve(SmallVectorImpl<const Formula *> &Solution) const; 01613 01614 BasicBlock::iterator 01615 HoistInsertPosition(BasicBlock::iterator IP, 01616 const SmallVectorImpl<Instruction *> &Inputs) const; 01617 BasicBlock::iterator 01618 AdjustInsertPositionForExpand(BasicBlock::iterator IP, 01619 const LSRFixup &LF, 01620 const LSRUse &LU, 01621 SCEVExpander &Rewriter) const; 01622 01623 Value *Expand(const LSRFixup &LF, 01624 const Formula &F, 01625 BasicBlock::iterator IP, 01626 SCEVExpander &Rewriter, 01627 SmallVectorImpl<WeakVH> &DeadInsts) const; 01628 void RewriteForPHI(PHINode *PN, const LSRFixup &LF, 01629 const Formula &F, 01630 SCEVExpander &Rewriter, 01631 SmallVectorImpl<WeakVH> &DeadInsts, 01632 Pass *P) const; 01633 void Rewrite(const LSRFixup &LF, 01634 const Formula &F, 01635 SCEVExpander &Rewriter, 01636 SmallVectorImpl<WeakVH> &DeadInsts, 01637 Pass *P) const; 01638 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, 01639 Pass *P); 01640 01641 public: 01642 LSRInstance(Loop *L, Pass *P); 01643 01644 bool getChanged() const { return Changed; } 01645 01646 void print_factors_and_types(raw_ostream &OS) const; 01647 void print_fixups(raw_ostream &OS) const; 01648 void print_uses(raw_ostream &OS) const; 01649 void print(raw_ostream &OS) const; 01650 void dump() const; 01651 }; 01652 01653 } 01654 01655 /// OptimizeShadowIV - If IV is used in a int-to-float cast 01656 /// inside the loop then try to eliminate the cast operation. 01657 void LSRInstance::OptimizeShadowIV() { 01658 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 01659 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 01660 return; 01661 01662 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); 01663 UI != E; /* empty */) { 01664 IVUsers::const_iterator CandidateUI = UI; 01665 ++UI; 01666 Instruction *ShadowUse = CandidateUI->getUser(); 01667 Type *DestTy = NULL; 01668 bool IsSigned = false; 01669 01670 /* If shadow use is a int->float cast then insert a second IV 01671 to eliminate this cast. 01672 01673 for (unsigned i = 0; i < n; ++i) 01674 foo((double)i); 01675 01676 is transformed into 01677 01678 double d = 0.0; 01679 for (unsigned i = 0; i < n; ++i, ++d) 01680 foo(d); 01681 */ 01682 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { 01683 IsSigned = false; 01684 DestTy = UCast->getDestTy(); 01685 } 01686 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { 01687 IsSigned = true; 01688 DestTy = SCast->getDestTy(); 01689 } 01690 if (!DestTy) continue; 01691 01692 // If target does not support DestTy natively then do not apply 01693 // this transformation. 01694 if (!TTI.isTypeLegal(DestTy)) continue; 01695 01696 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); 01697 if (!PH) continue; 01698 if (PH->getNumIncomingValues() != 2) continue; 01699 01700 Type *SrcTy = PH->getType(); 01701 int Mantissa = DestTy->getFPMantissaWidth(); 01702 if (Mantissa == -1) continue; 01703 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) 01704 continue; 01705 01706 unsigned Entry, Latch; 01707 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { 01708 Entry = 0; 01709 Latch = 1; 01710 } else { 01711 Entry = 1; 01712 Latch = 0; 01713 } 01714 01715 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); 01716 if (!Init) continue; 01717 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? 01718 (double)Init->getSExtValue() : 01719 (double)Init->getZExtValue()); 01720 01721 BinaryOperator *Incr = 01722 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); 01723 if (!Incr) continue; 01724 if (Incr->getOpcode() != Instruction::Add 01725 && Incr->getOpcode() != Instruction::Sub) 01726 continue; 01727 01728 /* Initialize new IV, double d = 0.0 in above example. */ 01729 ConstantInt *C = NULL; 01730 if (Incr->getOperand(0) == PH) 01731 C = dyn_cast<ConstantInt>(Incr->getOperand(1)); 01732 else if (Incr->getOperand(1) == PH) 01733 C = dyn_cast<ConstantInt>(Incr->getOperand(0)); 01734 else 01735 continue; 01736 01737 if (!C) continue; 01738 01739 // Ignore negative constants, as the code below doesn't handle them 01740 // correctly. TODO: Remove this restriction. 01741 if (!C->getValue().isStrictlyPositive()) continue; 01742 01743 /* Add new PHINode. */ 01744 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); 01745 01746 /* create new increment. '++d' in above example. */ 01747 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); 01748 BinaryOperator *NewIncr = 01749 BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? 01750 Instruction::FAdd : Instruction::FSub, 01751 NewPH, CFP, "IV.S.next.", Incr); 01752 01753 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); 01754 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); 01755 01756 /* Remove cast operation */ 01757 ShadowUse->replaceAllUsesWith(NewPH); 01758 ShadowUse->eraseFromParent(); 01759 Changed = true; 01760 break; 01761 } 01762 } 01763 01764 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV, 01765 /// set the IV user and stride information and return true, otherwise return 01766 /// false. 01767 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { 01768 for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 01769 if (UI->getUser() == Cond) { 01770 // NOTE: we could handle setcc instructions with multiple uses here, but 01771 // InstCombine does it as well for simple uses, it's not clear that it 01772 // occurs enough in real life to handle. 01773 CondUse = UI; 01774 return true; 01775 } 01776 return false; 01777 } 01778 01779 /// OptimizeMax - Rewrite the loop's terminating condition if it uses 01780 /// a max computation. 01781 /// 01782 /// This is a narrow solution to a specific, but acute, problem. For loops 01783 /// like this: 01784 /// 01785 /// i = 0; 01786 /// do { 01787 /// p[i] = 0.0; 01788 /// } while (++i < n); 01789 /// 01790 /// the trip count isn't just 'n', because 'n' might not be positive. And 01791 /// unfortunately this can come up even for loops where the user didn't use 01792 /// a C do-while loop. For example, seemingly well-behaved top-test loops 01793 /// will commonly be lowered like this: 01794 // 01795 /// if (n > 0) { 01796 /// i = 0; 01797 /// do { 01798 /// p[i] = 0.0; 01799 /// } while (++i < n); 01800 /// } 01801 /// 01802 /// and then it's possible for subsequent optimization to obscure the if 01803 /// test in such a way that indvars can't find it. 01804 /// 01805 /// When indvars can't find the if test in loops like this, it creates a 01806 /// max expression, which allows it to give the loop a canonical 01807 /// induction variable: 01808 /// 01809 /// i = 0; 01810 /// max = n < 1 ? 1 : n; 01811 /// do { 01812 /// p[i] = 0.0; 01813 /// } while (++i != max); 01814 /// 01815 /// Canonical induction variables are necessary because the loop passes 01816 /// are designed around them. The most obvious example of this is the 01817 /// LoopInfo analysis, which doesn't remember trip count values. It 01818 /// expects to be able to rediscover the trip count each time it is 01819 /// needed, and it does this using a simple analysis that only succeeds if 01820 /// the loop has a canonical induction variable. 01821 /// 01822 /// However, when it comes time to generate code, the maximum operation 01823 /// can be quite costly, especially if it's inside of an outer loop. 01824 /// 01825 /// This function solves this problem by detecting this type of loop and 01826 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting 01827 /// the instructions for the maximum computation. 01828 /// 01829 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { 01830 // Check that the loop matches the pattern we're looking for. 01831 if (Cond->getPredicate() != CmpInst::ICMP_EQ && 01832 Cond->getPredicate() != CmpInst::ICMP_NE) 01833 return Cond; 01834 01835 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1)); 01836 if (!Sel || !Sel->hasOneUse()) return Cond; 01837 01838 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 01839 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 01840 return Cond; 01841 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); 01842 01843 // Add one to the backedge-taken count to get the trip count. 01844 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount); 01845 if (IterationCount != SE.getSCEV(Sel)) return Cond; 01846 01847 // Check for a max calculation that matches the pattern. There's no check 01848 // for ICMP_ULE here because the comparison would be with zero, which 01849 // isn't interesting. 01850 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; 01851 const SCEVNAryExpr *Max = 0; 01852 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { 01853 Pred = ICmpInst::ICMP_SLE; 01854 Max = S; 01855 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { 01856 Pred = ICmpInst::ICMP_SLT; 01857 Max = S; 01858 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { 01859 Pred = ICmpInst::ICMP_ULT; 01860 Max = U; 01861 } else { 01862 // No match; bail. 01863 return Cond; 01864 } 01865 01866 // To handle a max with more than two operands, this optimization would 01867 // require additional checking and setup. 01868 if (Max->getNumOperands() != 2) 01869 return Cond; 01870 01871 const SCEV *MaxLHS = Max->getOperand(0); 01872 const SCEV *MaxRHS = Max->getOperand(1); 01873 01874 // ScalarEvolution canonicalizes constants to the left. For < and >, look 01875 // for a comparison with 1. For <= and >=, a comparison with zero. 01876 if (!MaxLHS || 01877 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) 01878 return Cond; 01879 01880 // Check the relevant induction variable for conformance to 01881 // the pattern. 01882 const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); 01883 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV); 01884 if (!AR || !AR->isAffine() || 01885 AR->getStart() != One || 01886 AR->getStepRecurrence(SE) != One) 01887 return Cond; 01888 01889 assert(AR->getLoop() == L && 01890 "Loop condition operand is an addrec in a different loop!"); 01891 01892 // Check the right operand of the select, and remember it, as it will 01893 // be used in the new comparison instruction. 01894 Value *NewRHS = 0; 01895 if (ICmpInst::isTrueWhenEqual(Pred)) { 01896 // Look for n+1, and grab n. 01897 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) 01898 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 01899 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 01900 NewRHS = BO->getOperand(0); 01901 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) 01902 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 01903 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 01904 NewRHS = BO->getOperand(0); 01905 if (!NewRHS) 01906 return Cond; 01907 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) 01908 NewRHS = Sel->getOperand(1); 01909 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) 01910 NewRHS = Sel->getOperand(2); 01911 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS)) 01912 NewRHS = SU->getValue(); 01913 else 01914 // Max doesn't match expected pattern. 01915 return Cond; 01916 01917 // Determine the new comparison opcode. It may be signed or unsigned, 01918 // and the original comparison may be either equality or inequality. 01919 if (Cond->getPredicate() == CmpInst::ICMP_EQ) 01920 Pred = CmpInst::getInversePredicate(Pred); 01921 01922 // Ok, everything looks ok to change the condition into an SLT or SGE and 01923 // delete the max calculation. 01924 ICmpInst *NewCond = 01925 new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); 01926 01927 // Delete the max calculation instructions. 01928 Cond->replaceAllUsesWith(NewCond); 01929 CondUse->setUser(NewCond); 01930 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); 01931 Cond->eraseFromParent(); 01932 Sel->eraseFromParent(); 01933 if (Cmp->use_empty()) 01934 Cmp->eraseFromParent(); 01935 return NewCond; 01936 } 01937 01938 /// OptimizeLoopTermCond - Change loop terminating condition to use the 01939 /// postinc iv when possible. 01940 void 01941 LSRInstance::OptimizeLoopTermCond() { 01942 SmallPtrSet<Instruction *, 4> PostIncs; 01943 01944 BasicBlock *LatchBlock = L->getLoopLatch(); 01945 SmallVector<BasicBlock*, 8> ExitingBlocks; 01946 L->getExitingBlocks(ExitingBlocks); 01947 01948 for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { 01949 BasicBlock *ExitingBlock = ExitingBlocks[i]; 01950 01951 // Get the terminating condition for the loop if possible. If we 01952 // can, we want to change it to use a post-incremented version of its 01953 // induction variable, to allow coalescing the live ranges for the IV into 01954 // one register value. 01955 01956 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); 01957 if (!TermBr) 01958 continue; 01959 // FIXME: Overly conservative, termination condition could be an 'or' etc.. 01960 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition())) 01961 continue; 01962 01963 // Search IVUsesByStride to find Cond's IVUse if there is one. 01964 IVStrideUse *CondUse = 0; 01965 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); 01966 if (!FindIVUserForCond(Cond, CondUse)) 01967 continue; 01968 01969 // If the trip count is computed in terms of a max (due to ScalarEvolution 01970 // being unable to find a sufficient guard, for example), change the loop 01971 // comparison to use SLT or ULT instead of NE. 01972 // One consequence of doing this now is that it disrupts the count-down 01973 // optimization. That's not always a bad thing though, because in such 01974 // cases it may still be worthwhile to avoid a max. 01975 Cond = OptimizeMax(Cond, CondUse); 01976 01977 // If this exiting block dominates the latch block, it may also use 01978 // the post-inc value if it won't be shared with other uses. 01979 // Check for dominance. 01980 if (!DT.dominates(ExitingBlock, LatchBlock)) 01981 continue; 01982 01983 // Conservatively avoid trying to use the post-inc value in non-latch 01984 // exits if there may be pre-inc users in intervening blocks. 01985 if (LatchBlock != ExitingBlock) 01986 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 01987 // Test if the use is reachable from the exiting block. This dominator 01988 // query is a conservative approximation of reachability. 01989 if (&*UI != CondUse && 01990 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { 01991 // Conservatively assume there may be reuse if the quotient of their 01992 // strides could be a legal scale. 01993 const SCEV *A = IU.getStride(*CondUse, L); 01994 const SCEV *B = IU.getStride(*UI, L); 01995 if (!A || !B) continue; 01996 if (SE.getTypeSizeInBits(A->getType()) != 01997 SE.getTypeSizeInBits(B->getType())) { 01998 if (SE.getTypeSizeInBits(A->getType()) > 01999 SE.getTypeSizeInBits(B->getType())) 02000 B = SE.getSignExtendExpr(B, A->getType()); 02001 else 02002 A = SE.getSignExtendExpr(A, B->getType()); 02003 } 02004 if (const SCEVConstant *D = 02005 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) { 02006 const ConstantInt *C = D->getValue(); 02007 // Stride of one or negative one can have reuse with non-addresses. 02008 if (C->isOne() || C->isAllOnesValue()) 02009 goto decline_post_inc; 02010 // Avoid weird situations. 02011 if (C->getValue().getMinSignedBits() >= 64 || 02012 C->getValue().isMinSignedValue()) 02013 goto decline_post_inc; 02014 // Check for possible scaled-address reuse. 02015 Type *AccessTy = getAccessType(UI->getUser()); 02016 int64_t Scale = C->getSExtValue(); 02017 if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, 02018 /*BaseOffset=*/ 0, 02019 /*HasBaseReg=*/ false, Scale)) 02020 goto decline_post_inc; 02021 Scale = -Scale; 02022 if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ 0, 02023 /*BaseOffset=*/ 0, 02024 /*HasBaseReg=*/ false, Scale)) 02025 goto decline_post_inc; 02026 } 02027 } 02028 02029 DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " 02030 << *Cond << '\n'); 02031 02032 // It's possible for the setcc instruction to be anywhere in the loop, and 02033 // possible for it to have multiple users. If it is not immediately before 02034 // the exiting block branch, move it. 02035 if (&*++BasicBlock::iterator(Cond) != TermBr) { 02036 if (Cond->hasOneUse()) { 02037 Cond->moveBefore(TermBr); 02038 } else { 02039 // Clone the terminating condition and insert into the loopend. 02040 ICmpInst *OldCond = Cond; 02041 Cond = cast<ICmpInst>(Cond->clone()); 02042 Cond->setName(L->getHeader()->getName() + ".termcond"); 02043 ExitingBlock->getInstList().insert(TermBr, Cond); 02044 02045 // Clone the IVUse, as the old use still exists! 02046 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); 02047 TermBr->replaceUsesOfWith(OldCond, Cond); 02048 } 02049 } 02050 02051 // If we get to here, we know that we can transform the setcc instruction to 02052 // use the post-incremented version of the IV, allowing us to coalesce the 02053 // live ranges for the IV correctly. 02054 CondUse->transformToPostInc(L); 02055 Changed = true; 02056 02057 PostIncs.insert(Cond); 02058 decline_post_inc:; 02059 } 02060 02061 // Determine an insertion point for the loop induction variable increment. It 02062 // must dominate all the post-inc comparisons we just set up, and it must 02063 // dominate the loop latch edge. 02064 IVIncInsertPos = L->getLoopLatch()->getTerminator(); 02065 for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(), 02066 E = PostIncs.end(); I != E; ++I) { 02067 BasicBlock *BB = 02068 DT.findNearestCommonDominator(IVIncInsertPos->getParent(), 02069 (*I)->getParent()); 02070 if (BB == (*I)->getParent()) 02071 IVIncInsertPos = *I; 02072 else if (BB != IVIncInsertPos->getParent()) 02073 IVIncInsertPos = BB->getTerminator(); 02074 } 02075 } 02076 02077 /// reconcileNewOffset - Determine if the given use can accommodate a fixup 02078 /// at the given offset and other details. If so, update the use and 02079 /// return true. 02080 bool 02081 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 02082 LSRUse::KindType Kind, Type *AccessTy) { 02083 int64_t NewMinOffset = LU.MinOffset; 02084 int64_t NewMaxOffset = LU.MaxOffset; 02085 Type *NewAccessTy = AccessTy; 02086 02087 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to 02088 // something conservative, however this can pessimize in the case that one of 02089 // the uses will have all its uses outside the loop, for example. 02090 if (LU.Kind != Kind) 02091 return false; 02092 // Conservatively assume HasBaseReg is true for now. 02093 if (NewOffset < LU.MinOffset) { 02094 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, 02095 LU.MaxOffset - NewOffset, HasBaseReg)) 02096 return false; 02097 NewMinOffset = NewOffset; 02098 } else if (NewOffset > LU.MaxOffset) { 02099 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, 02100 NewOffset - LU.MinOffset, HasBaseReg)) 02101 return false; 02102 NewMaxOffset = NewOffset; 02103 } 02104 // Check for a mismatched access type, and fall back conservatively as needed. 02105 // TODO: Be less conservative when the type is similar and can use the same 02106 // addressing modes. 02107 if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) 02108 NewAccessTy = Type::getVoidTy(AccessTy->getContext()); 02109 02110 // Update the use. 02111 LU.MinOffset = NewMinOffset; 02112 LU.MaxOffset = NewMaxOffset; 02113 LU.AccessTy = NewAccessTy; 02114 if (NewOffset != LU.Offsets.back()) 02115 LU.Offsets.push_back(NewOffset); 02116 return true; 02117 } 02118 02119 /// getUse - Return an LSRUse index and an offset value for a fixup which 02120 /// needs the given expression, with the given kind and optional access type. 02121 /// Either reuse an existing use or create a new one, as needed. 02122 std::pair<size_t, int64_t> 02123 LSRInstance::getUse(const SCEV *&Expr, 02124 LSRUse::KindType Kind, Type *AccessTy) { 02125 const SCEV *Copy = Expr; 02126 int64_t Offset = ExtractImmediate(Expr, SE); 02127 02128 // Basic uses can't accept any offset, for example. 02129 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ 0, 02130 Offset, /*HasBaseReg=*/ true)) { 02131 Expr = Copy; 02132 Offset = 0; 02133 } 02134 02135 std::pair<UseMapTy::iterator, bool> P = 02136 UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0)); 02137 if (!P.second) { 02138 // A use already existed with this base. 02139 size_t LUIdx = P.first->second; 02140 LSRUse &LU = Uses[LUIdx]; 02141 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy)) 02142 // Reuse this use. 02143 return std::make_pair(LUIdx, Offset); 02144 } 02145 02146 // Create a new use. 02147 size_t LUIdx = Uses.size(); 02148 P.first->second = LUIdx; 02149 Uses.push_back(LSRUse(Kind, AccessTy)); 02150 LSRUse &LU = Uses[LUIdx]; 02151 02152 // We don't need to track redundant offsets, but we don't need to go out 02153 // of our way here to avoid them. 02154 if (LU.Offsets.empty() || Offset != LU.Offsets.back()) 02155 LU.Offsets.push_back(Offset); 02156 02157 LU.MinOffset = Offset; 02158 LU.MaxOffset = Offset; 02159 return std::make_pair(LUIdx, Offset); 02160 } 02161 02162 /// DeleteUse - Delete the given use from the Uses list. 02163 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { 02164 if (&LU != &Uses.back()) 02165 std::swap(LU, Uses.back()); 02166 Uses.pop_back(); 02167 02168 // Update RegUses. 02169 RegUses.SwapAndDropUse(LUIdx, Uses.size()); 02170 } 02171 02172 /// FindUseWithFormula - Look for a use distinct from OrigLU which is has 02173 /// a formula that has the same registers as the given formula. 02174 LSRUse * 02175 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, 02176 const LSRUse &OrigLU) { 02177 // Search all uses for the formula. This could be more clever. 02178 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 02179 LSRUse &LU = Uses[LUIdx]; 02180 // Check whether this use is close enough to OrigLU, to see whether it's 02181 // worthwhile looking through its formulae. 02182 // Ignore ICmpZero uses because they may contain formulae generated by 02183 // GenerateICmpZeroScales, in which case adding fixup offsets may 02184 // be invalid. 02185 if (&LU != &OrigLU && 02186 LU.Kind != LSRUse::ICmpZero && 02187 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy && 02188 LU.WidestFixupType == OrigLU.WidestFixupType && 02189 LU.HasFormulaWithSameRegs(OrigF)) { 02190 // Scan through this use's formulae. 02191 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), 02192 E = LU.Formulae.end(); I != E; ++I) { 02193 const Formula &F = *I; 02194 // Check to see if this formula has the same registers and symbols 02195 // as OrigF. 02196 if (F.BaseRegs == OrigF.BaseRegs && 02197 F.ScaledReg == OrigF.ScaledReg && 02198 F.BaseGV == OrigF.BaseGV && 02199 F.Scale == OrigF.Scale && 02200 F.UnfoldedOffset == OrigF.UnfoldedOffset) { 02201 if (F.BaseOffset == 0) 02202 return &LU; 02203 // This is the formula where all the registers and symbols matched; 02204 // there aren't going to be any others. Since we declined it, we 02205 // can skip the rest of the formulae and proceed to the next LSRUse. 02206 break; 02207 } 02208 } 02209 } 02210 } 02211 02212 // Nothing looked good. 02213 return 0; 02214 } 02215 02216 void LSRInstance::CollectInterestingTypesAndFactors() { 02217 SmallSetVector<const SCEV *, 4> Strides; 02218 02219 // Collect interesting types and strides. 02220 SmallVector<const SCEV *, 4> Worklist; 02221 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 02222 const SCEV *Expr = IU.getExpr(*UI); 02223 02224 // Collect interesting types. 02225 Types.insert(SE.getEffectiveSCEVType(Expr->getType())); 02226 02227 // Add strides for mentioned loops. 02228 Worklist.push_back(Expr); 02229 do { 02230 const SCEV *S = Worklist.pop_back_val(); 02231 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 02232 if (AR->getLoop() == L) 02233 Strides.insert(AR->getStepRecurrence(SE)); 02234 Worklist.push_back(AR->getStart()); 02235 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 02236 Worklist.append(Add->op_begin(), Add->op_end()); 02237 } 02238 } while (!Worklist.empty()); 02239 } 02240 02241 // Compute interesting factors from the set of interesting strides. 02242 for (SmallSetVector<const SCEV *, 4>::const_iterator 02243 I = Strides.begin(), E = Strides.end(); I != E; ++I) 02244 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = 02245 llvm::next(I); NewStrideIter != E; ++NewStrideIter) { 02246 const SCEV *OldStride = *I; 02247 const SCEV *NewStride = *NewStrideIter; 02248 02249 if (SE.getTypeSizeInBits(OldStride->getType()) != 02250 SE.getTypeSizeInBits(NewStride->getType())) { 02251 if (SE.getTypeSizeInBits(OldStride->getType()) > 02252 SE.getTypeSizeInBits(NewStride->getType())) 02253 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); 02254 else 02255 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); 02256 } 02257 if (const SCEVConstant *Factor = 02258 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, 02259 SE, true))) { 02260 if (Factor->getValue()->getValue().getMinSignedBits() <= 64) 02261 Factors.insert(Factor->getValue()->getValue().getSExtValue()); 02262 } else if (const SCEVConstant *Factor = 02263 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, 02264 NewStride, 02265 SE, true))) { 02266 if (Factor->getValue()->getValue().getMinSignedBits() <= 64) 02267 Factors.insert(Factor->getValue()->getValue().getSExtValue()); 02268 } 02269 } 02270 02271 // If all uses use the same type, don't bother looking for truncation-based 02272 // reuse. 02273 if (Types.size() == 1) 02274 Types.clear(); 02275 02276 DEBUG(print_factors_and_types(dbgs())); 02277 } 02278 02279 /// findIVOperand - Helper for CollectChains that finds an IV operand (computed 02280 /// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped 02281 /// Instructions to IVStrideUses, we could partially skip this. 02282 static User::op_iterator 02283 findIVOperand(User::op_iterator OI, User::op_iterator OE, 02284 Loop *L, ScalarEvolution &SE) { 02285 for(; OI != OE; ++OI) { 02286 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) { 02287 if (!SE.isSCEVable(Oper->getType())) 02288 continue; 02289 02290 if (const SCEVAddRecExpr *AR = 02291 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) { 02292 if (AR->getLoop() == L) 02293 break; 02294 } 02295 } 02296 } 02297 return OI; 02298 } 02299 02300 /// getWideOperand - IVChain logic must consistenctly peek base TruncInst 02301 /// operands, so wrap it in a convenient helper. 02302 static Value *getWideOperand(Value *Oper) { 02303 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) 02304 return Trunc->getOperand(0); 02305 return Oper; 02306 } 02307 02308 /// isCompatibleIVType - Return true if we allow an IV chain to include both 02309 /// types. 02310 static bool isCompatibleIVType(Value *LVal, Value *RVal) { 02311 Type *LType = LVal->getType(); 02312 Type *RType = RVal->getType(); 02313 return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); 02314 } 02315 02316 /// getExprBase - Return an approximation of this SCEV expression's "base", or 02317 /// NULL for any constant. Returning the expression itself is 02318 /// conservative. Returning a deeper subexpression is more precise and valid as 02319 /// long as it isn't less complex than another subexpression. For expressions 02320 /// involving multiple unscaled values, we need to return the pointer-type 02321 /// SCEVUnknown. This avoids forming chains across objects, such as: 02322 /// PrevOper==a[i], IVOper==b[i], IVInc==b-a. 02323 /// 02324 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost 02325 /// SCEVUnknown, we simply return the rightmost SCEV operand. 02326 static const SCEV *getExprBase(const SCEV *S) { 02327 switch (S->getSCEVType()) { 02328 default: // uncluding scUnknown. 02329 return S; 02330 case scConstant: 02331 return 0; 02332 case scTruncate: 02333 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand()); 02334 case scZeroExtend: 02335 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand()); 02336 case scSignExtend: 02337 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand()); 02338 case scAddExpr: { 02339 // Skip over scaled operands (scMulExpr) to follow add operands as long as 02340 // there's nothing more complex. 02341 // FIXME: not sure if we want to recognize negation. 02342 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S); 02343 for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()), 02344 E(Add->op_begin()); I != E; ++I) { 02345 const SCEV *SubExpr = *I; 02346 if (SubExpr->getSCEVType() == scAddExpr) 02347 return getExprBase(SubExpr); 02348 02349 if (SubExpr->getSCEVType() != scMulExpr) 02350 return SubExpr; 02351 } 02352 return S; // all operands are scaled, be conservative. 02353 } 02354 case scAddRecExpr: 02355 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart()); 02356 } 02357 } 02358 02359 /// Return true if the chain increment is profitable to expand into a loop 02360 /// invariant value, which may require its own register. A profitable chain 02361 /// increment will be an offset relative to the same base. We allow such offsets 02362 /// to potentially be used as chain increment as long as it's not obviously 02363 /// expensive to expand using real instructions. 02364 bool IVChain::isProfitableIncrement(const SCEV *OperExpr, 02365 const SCEV *IncExpr, 02366 ScalarEvolution &SE) { 02367 // Aggressively form chains when -stress-ivchain. 02368 if (StressIVChain) 02369 return true; 02370 02371 // Do not replace a constant offset from IV head with a nonconstant IV 02372 // increment. 02373 if (!isa<SCEVConstant>(IncExpr)) { 02374 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand)); 02375 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr))) 02376 return 0; 02377 } 02378 02379 SmallPtrSet<const SCEV*, 8> Processed; 02380 return !isHighCostExpansion(IncExpr, Processed, SE); 02381 } 02382 02383 /// Return true if the number of registers needed for the chain is estimated to 02384 /// be less than the number required for the individual IV users. First prohibit 02385 /// any IV users that keep the IV live across increments (the Users set should 02386 /// be empty). Next count the number and type of increments in the chain. 02387 /// 02388 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't 02389 /// effectively use postinc addressing modes. Only consider it profitable it the 02390 /// increments can be computed in fewer registers when chained. 02391 /// 02392 /// TODO: Consider IVInc free if it's already used in another chains. 02393 static bool 02394 isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, 02395 ScalarEvolution &SE, const TargetTransformInfo &TTI) { 02396 if (StressIVChain) 02397 return true; 02398 02399 if (!Chain.hasIncs()) 02400 return false; 02401 02402 if (!Users.empty()) { 02403 DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; 02404 for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(), 02405 E = Users.end(); I != E; ++I) { 02406 dbgs() << " " << **I << "\n"; 02407 }); 02408 return false; 02409 } 02410 assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); 02411 02412 // The chain itself may require a register, so intialize cost to 1. 02413 int cost = 1; 02414 02415 // A complete chain likely eliminates the need for keeping the original IV in 02416 // a register. LSR does not currently know how to form a complete chain unless 02417 // the header phi already exists. 02418 if (isa<PHINode>(Chain.tailUserInst()) 02419 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { 02420 --cost; 02421 } 02422 const SCEV *LastIncExpr = 0; 02423 unsigned NumConstIncrements = 0; 02424 unsigned NumVarIncrements = 0; 02425 unsigned NumReusedIncrements = 0; 02426 for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); 02427 I != E; ++I) { 02428 02429 if (I->IncExpr->isZero()) 02430 continue; 02431 02432 // Incrementing by zero or some constant is neutral. We assume constants can 02433 // be folded into an addressing mode or an add's immediate operand. 02434 if (isa<SCEVConstant>(I->IncExpr)) { 02435 ++NumConstIncrements; 02436 continue; 02437 } 02438 02439 if (I->IncExpr == LastIncExpr) 02440 ++NumReusedIncrements; 02441 else 02442 ++NumVarIncrements; 02443 02444 LastIncExpr = I->IncExpr; 02445 } 02446 // An IV chain with a single increment is handled by LSR's postinc 02447 // uses. However, a chain with multiple increments requires keeping the IV's 02448 // value live longer than it needs to be if chained. 02449 if (NumConstIncrements > 1) 02450 --cost; 02451 02452 // Materializing increment expressions in the preheader that didn't exist in 02453 // the original code may cost a register. For example, sign-extended array 02454 // indices can produce ridiculous increments like this: 02455 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64))) 02456 cost += NumVarIncrements; 02457 02458 // Reusing variable increments likely saves a register to hold the multiple of 02459 // the stride. 02460 cost -= NumReusedIncrements; 02461 02462 DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost 02463 << "\n"); 02464 02465 return cost < 0; 02466 } 02467 02468 /// ChainInstruction - Add this IV user to an existing chain or make it the head 02469 /// of a new chain. 02470 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, 02471 SmallVectorImpl<ChainUsers> &ChainUsersVec) { 02472 // When IVs are used as types of varying widths, they are generally converted 02473 // to a wider type with some uses remaining narrow under a (free) trunc. 02474 Value *const NextIV = getWideOperand(IVOper); 02475 const SCEV *const OperExpr = SE.getSCEV(NextIV); 02476 const SCEV *const OperExprBase = getExprBase(OperExpr); 02477 02478 // Visit all existing chains. Check if its IVOper can be computed as a 02479 // profitable loop invariant increment from the last link in the Chain. 02480 unsigned ChainIdx = 0, NChains = IVChainVec.size(); 02481 const SCEV *LastIncExpr = 0; 02482 for (; ChainIdx < NChains; ++ChainIdx) { 02483 IVChain &Chain = IVChainVec[ChainIdx]; 02484 02485 // Prune the solution space aggressively by checking that both IV operands 02486 // are expressions that operate on the same unscaled SCEVUnknown. This 02487 // "base" will be canceled by the subsequent getMinusSCEV call. Checking 02488 // first avoids creating extra SCEV expressions. 02489 if (!StressIVChain && Chain.ExprBase != OperExprBase) 02490 continue; 02491 02492 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); 02493 if (!isCompatibleIVType(PrevIV, NextIV)) 02494 continue; 02495 02496 // A phi node terminates a chain. 02497 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) 02498 continue; 02499 02500 // The increment must be loop-invariant so it can be kept in a register. 02501 const SCEV *PrevExpr = SE.getSCEV(PrevIV); 02502 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); 02503 if (!SE.isLoopInvariant(IncExpr, L)) 02504 continue; 02505 02506 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { 02507 LastIncExpr = IncExpr; 02508 break; 02509 } 02510 } 02511 // If we haven't found a chain, create a new one, unless we hit the max. Don't 02512 // bother for phi nodes, because they must be last in the chain. 02513 if (ChainIdx == NChains) { 02514 if (isa<PHINode>(UserInst)) 02515 return; 02516 if (NChains >= MaxChains && !StressIVChain) { 02517 DEBUG(dbgs() << "IV Chain Limit\n"); 02518 return; 02519 } 02520 LastIncExpr = OperExpr; 02521 // IVUsers may have skipped over sign/zero extensions. We don't currently 02522 // attempt to form chains involving extensions unless they can be hoisted 02523 // into this loop's AddRec. 02524 if (!isa<SCEVAddRecExpr>(LastIncExpr)) 02525 return; 02526 ++NChains; 02527 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), 02528 OperExprBase)); 02529 ChainUsersVec.resize(NChains); 02530 DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst 02531 << ") IV=" << *LastIncExpr << "\n"); 02532 } else { 02533 DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst 02534 << ") IV+" << *LastIncExpr << "\n"); 02535 // Add this IV user to the end of the chain. 02536 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); 02537 } 02538 IVChain &Chain = IVChainVec[ChainIdx]; 02539 02540 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; 02541 // This chain's NearUsers become FarUsers. 02542 if (!LastIncExpr->isZero()) { 02543 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(), 02544 NearUsers.end()); 02545 NearUsers.clear(); 02546 } 02547 02548 // All other uses of IVOperand become near uses of the chain. 02549 // We currently ignore intermediate values within SCEV expressions, assuming 02550 // they will eventually be used be the current chain, or can be computed 02551 // from one of the chain increments. To be more precise we could 02552 // transitively follow its user and only add leaf IV users to the set. 02553 for (Value::use_iterator UseIter = IVOper->use_begin(), 02554 UseEnd = IVOper->use_end(); UseIter != UseEnd; ++UseIter) { 02555 Instruction *OtherUse = dyn_cast<Instruction>(*UseIter); 02556 if (!OtherUse) 02557 continue; 02558 // Uses in the chain will no longer be uses if the chain is formed. 02559 // Include the head of the chain in this iteration (not Chain.begin()). 02560 IVChain::const_iterator IncIter = Chain.Incs.begin(); 02561 IVChain::const_iterator IncEnd = Chain.Incs.end(); 02562 for( ; IncIter != IncEnd; ++IncIter) { 02563 if (IncIter->UserInst == OtherUse) 02564 break; 02565 } 02566 if (IncIter != IncEnd) 02567 continue; 02568 02569 if (SE.isSCEVable(OtherUse->getType()) 02570 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) 02571 && IU.isIVUserOrOperand(OtherUse)) { 02572 continue; 02573 } 02574 NearUsers.insert(OtherUse); 02575 } 02576 02577 // Since this user is part of the chain, it's no longer considered a use 02578 // of the chain. 02579 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); 02580 } 02581 02582 /// CollectChains - Populate the vector of Chains. 02583 /// 02584 /// This decreases ILP at the architecture level. Targets with ample registers, 02585 /// multiple memory ports, and no register renaming probably don't want 02586 /// this. However, such targets should probably disable LSR altogether. 02587 /// 02588 /// The job of LSR is to make a reasonable choice of induction variables across 02589 /// the loop. Subsequent passes can easily "unchain" computation exposing more 02590 /// ILP *within the loop* if the target wants it. 02591 /// 02592 /// Finding the best IV chain is potentially a scheduling problem. Since LSR 02593 /// will not reorder memory operations, it will recognize this as a chain, but 02594 /// will generate redundant IV increments. Ideally this would be corrected later 02595 /// by a smart scheduler: 02596 /// = A[i] 02597 /// = A[i+x] 02598 /// A[i] = 02599 /// A[i+x] = 02600 /// 02601 /// TODO: Walk the entire domtree within this loop, not just the path to the 02602 /// loop latch. This will discover chains on side paths, but requires 02603 /// maintaining multiple copies of the Chains state. 02604 void LSRInstance::CollectChains() { 02605 DEBUG(dbgs() << "Collecting IV Chains.\n"); 02606 SmallVector<ChainUsers, 8> ChainUsersVec; 02607 02608 SmallVector<BasicBlock *,8> LatchPath; 02609 BasicBlock *LoopHeader = L->getHeader(); 02610 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch()); 02611 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) { 02612 LatchPath.push_back(Rung->getBlock()); 02613 } 02614 LatchPath.push_back(LoopHeader); 02615 02616 // Walk the instruction stream from the loop header to the loop latch. 02617 for (SmallVectorImpl<BasicBlock *>::reverse_iterator 02618 BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend(); 02619 BBIter != BBEnd; ++BBIter) { 02620 for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); 02621 I != E; ++I) { 02622 // Skip instructions that weren't seen by IVUsers analysis. 02623 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) 02624 continue; 02625 02626 // Ignore users that are part of a SCEV expression. This way we only 02627 // consider leaf IV Users. This effectively rediscovers a portion of 02628 // IVUsers analysis but in program order this time. 02629 if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) 02630 continue; 02631 02632 // Remove this instruction from any NearUsers set it may be in. 02633 for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); 02634 ChainIdx < NChains; ++ChainIdx) { 02635 ChainUsersVec[ChainIdx].NearUsers.erase(I); 02636 } 02637 // Search for operands that can be chained. 02638 SmallPtrSet<Instruction*, 4> UniqueOperands; 02639 User::op_iterator IVOpEnd = I->op_end(); 02640 User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE); 02641 while (IVOpIter != IVOpEnd) { 02642 Instruction *IVOpInst = cast<Instruction>(*IVOpIter); 02643 if (UniqueOperands.insert(IVOpInst)) 02644 ChainInstruction(I, IVOpInst, ChainUsersVec); 02645 IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); 02646 } 02647 } // Continue walking down the instructions. 02648 } // Continue walking down the domtree. 02649 // Visit phi backedges to determine if the chain can generate the IV postinc. 02650 for (BasicBlock::iterator I = L->getHeader()->begin(); 02651 PHINode *PN = dyn_cast<PHINode>(I); ++I) { 02652 if (!SE.isSCEVable(PN->getType())) 02653 continue; 02654 02655 Instruction *IncV = 02656 dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch())); 02657 if (IncV) 02658 ChainInstruction(PN, IncV, ChainUsersVec); 02659 } 02660 // Remove any unprofitable chains. 02661 unsigned ChainIdx = 0; 02662 for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); 02663 UsersIdx < NChains; ++UsersIdx) { 02664 if (!isProfitableChain(IVChainVec[UsersIdx], 02665 ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) 02666 continue; 02667 // Preserve the chain at UsesIdx. 02668 if (ChainIdx != UsersIdx) 02669 IVChainVec[ChainIdx] = IVChainVec[UsersIdx]; 02670 FinalizeChain(IVChainVec[ChainIdx]); 02671 ++ChainIdx; 02672 } 02673 IVChainVec.resize(ChainIdx); 02674 } 02675 02676 void LSRInstance::FinalizeChain(IVChain &Chain) { 02677 assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); 02678 DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); 02679 02680 for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); 02681 I != E; ++I) { 02682 DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); 02683 User::op_iterator UseI = 02684 std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand); 02685 assert(UseI != I->UserInst->op_end() && "cannot find IV operand"); 02686 IVIncSet.insert(UseI); 02687 } 02688 } 02689 02690 /// Return true if the IVInc can be folded into an addressing mode. 02691 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, 02692 Value *Operand, const TargetTransformInfo &TTI) { 02693 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); 02694 if (!IncConst || !isAddressUse(UserInst, Operand)) 02695 return false; 02696 02697 if (IncConst->getValue()->getValue().getMinSignedBits() > 64) 02698 return false; 02699 02700 int64_t IncOffset = IncConst->getValue()->getSExtValue(); 02701 if (!isAlwaysFoldable(TTI, LSRUse::Address, 02702 getAccessType(UserInst), /*BaseGV=*/ 0, 02703 IncOffset, /*HaseBaseReg=*/ false)) 02704 return false; 02705 02706 return true; 02707 } 02708 02709 /// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to 02710 /// materialize the IV user's operand from the previous IV user's operand. 02711 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 02712 SmallVectorImpl<WeakVH> &DeadInsts) { 02713 // Find the new IVOperand for the head of the chain. It may have been replaced 02714 // by LSR. 02715 const IVInc &Head = Chain.Incs[0]; 02716 User::op_iterator IVOpEnd = Head.UserInst->op_end(); 02717 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. 02718 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), 02719 IVOpEnd, L, SE); 02720 Value *IVSrc = 0; 02721 while (IVOpIter != IVOpEnd) { 02722 IVSrc = getWideOperand(*IVOpIter); 02723 02724 // If this operand computes the expression that the chain needs, we may use 02725 // it. (Check this after setting IVSrc which is used below.) 02726 // 02727 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too 02728 // narrow for the chain, so we can no longer use it. We do allow using a 02729 // wider phi, assuming the LSR checked for free truncation. In that case we 02730 // should already have a truncate on this operand such that 02731 // getSCEV(IVSrc) == IncExpr. 02732 if (SE.getSCEV(*IVOpIter) == Head.IncExpr 02733 || SE.getSCEV(IVSrc) == Head.IncExpr) { 02734 break; 02735 } 02736 IVOpIter = findIVOperand(llvm::next(IVOpIter), IVOpEnd, L, SE); 02737 } 02738 if (IVOpIter == IVOpEnd) { 02739 // Gracefully give up on this chain. 02740 DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); 02741 return; 02742 } 02743 02744 DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); 02745 Type *IVTy = IVSrc->getType(); 02746 Type *IntTy = SE.getEffectiveSCEVType(IVTy); 02747 const SCEV *LeftOverExpr = 0; 02748 for (IVChain::const_iterator IncI = Chain.begin(), 02749 IncE = Chain.end(); IncI != IncE; ++IncI) { 02750 02751 Instruction *InsertPt = IncI->UserInst; 02752 if (isa<PHINode>(InsertPt)) 02753 InsertPt = L->getLoopLatch()->getTerminator(); 02754 02755 // IVOper will replace the current IV User's operand. IVSrc is the IV 02756 // value currently held in a register. 02757 Value *IVOper = IVSrc; 02758 if (!IncI->IncExpr->isZero()) { 02759 // IncExpr was the result of subtraction of two narrow values, so must 02760 // be signed. 02761 const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy); 02762 LeftOverExpr = LeftOverExpr ? 02763 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr; 02764 } 02765 if (LeftOverExpr && !LeftOverExpr->isZero()) { 02766 // Expand the IV increment. 02767 Rewriter.clearPostInc(); 02768 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt); 02769 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc), 02770 SE.getUnknown(IncV)); 02771 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt); 02772 02773 // If an IV increment can't be folded, use it as the next IV value. 02774 if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, 02775 TTI)) { 02776 assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); 02777 IVSrc = IVOper; 02778 LeftOverExpr = 0; 02779 } 02780 } 02781 Type *OperTy = IncI->IVOperand->getType(); 02782 if (IVTy != OperTy) { 02783 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) && 02784 "cannot extend a chained IV"); 02785 IRBuilder<> Builder(InsertPt); 02786 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain"); 02787 } 02788 IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper); 02789 DeadInsts.push_back(IncI->IVOperand); 02790 } 02791 // If LSR created a new, wider phi, we may also replace its postinc. We only 02792 // do this if we also found a wide value for the head of the chain. 02793 if (isa<PHINode>(Chain.tailUserInst())) { 02794 for (BasicBlock::iterator I = L->getHeader()->begin(); 02795 PHINode *Phi = dyn_cast<PHINode>(I); ++I) { 02796 if (!isCompatibleIVType(Phi, IVSrc)) 02797 continue; 02798 Instruction *PostIncV = dyn_cast<Instruction>( 02799 Phi->getIncomingValueForBlock(L->getLoopLatch())); 02800 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) 02801 continue; 02802 Value *IVOper = IVSrc; 02803 Type *PostIncTy = PostIncV->getType(); 02804 if (IVTy != PostIncTy) { 02805 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types"); 02806 IRBuilder<> Builder(L->getLoopLatch()->getTerminator()); 02807 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); 02808 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); 02809 } 02810 Phi->replaceUsesOfWith(PostIncV, IVOper); 02811 DeadInsts.push_back(PostIncV); 02812 } 02813 } 02814 } 02815 02816 void LSRInstance::CollectFixupsAndInitialFormulae() { 02817 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 02818 Instruction *UserInst = UI->getUser(); 02819 // Skip IV users that are part of profitable IV Chains. 02820 User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(), 02821 UI->getOperandValToReplace()); 02822 assert(UseI != UserInst->op_end() && "cannot find IV operand"); 02823 if (IVIncSet.count(UseI)) 02824 continue; 02825 02826 // Record the uses. 02827 LSRFixup &LF = getNewFixup(); 02828 LF.UserInst = UserInst; 02829 LF.OperandValToReplace = UI->getOperandValToReplace(); 02830 LF.PostIncLoops = UI->getPostIncLoops(); 02831 02832 LSRUse::KindType Kind = LSRUse::Basic; 02833 Type *AccessTy = 0; 02834 if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { 02835 Kind = LSRUse::Address; 02836 AccessTy = getAccessType(LF.UserInst); 02837 } 02838 02839 const SCEV *S = IU.getExpr(*UI); 02840 02841 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as 02842 // (N - i == 0), and this allows (N - i) to be the expression that we work 02843 // with rather than just N or i, so we can consider the register 02844 // requirements for both N and i at the same time. Limiting this code to 02845 // equality icmps is not a problem because all interesting loops use 02846 // equality icmps, thanks to IndVarSimplify. 02847 if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst)) 02848 if (CI->isEquality()) { 02849 // Swap the operands if needed to put the OperandValToReplace on the 02850 // left, for consistency. 02851 Value *NV = CI->getOperand(1); 02852 if (NV == LF.OperandValToReplace) { 02853 CI->setOperand(1, CI->getOperand(0)); 02854 CI->setOperand(0, NV); 02855 NV = CI->getOperand(1); 02856 Changed = true; 02857 } 02858 02859 // x == y --> x - y == 0 02860 const SCEV *N = SE.getSCEV(NV); 02861 if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { 02862 // S is normalized, so normalize N before folding it into S 02863 // to keep the result normalized. 02864 N = TransformForPostIncUse(Normalize, N, CI, 0, 02865 LF.PostIncLoops, SE, DT); 02866 Kind = LSRUse::ICmpZero; 02867 S = SE.getMinusSCEV(N, S); 02868 } 02869 02870 // -1 and the negations of all interesting strides (except the negation 02871 // of -1) are now also interesting. 02872 for (size_t i = 0, e = Factors.size(); i != e; ++i) 02873 if (Factors[i] != -1) 02874 Factors.insert(-(uint64_t)Factors[i]); 02875 Factors.insert(-1); 02876 } 02877 02878 // Set up the initial formula for this use. 02879 std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); 02880 LF.LUIdx = P.first; 02881 LF.Offset = P.second; 02882 LSRUse &LU = Uses[LF.LUIdx]; 02883 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 02884 if (!LU.WidestFixupType || 02885 SE.getTypeSizeInBits(LU.WidestFixupType) < 02886 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 02887 LU.WidestFixupType = LF.OperandValToReplace->getType(); 02888 02889 // If this is the first use of this LSRUse, give it a formula. 02890 if (LU.Formulae.empty()) { 02891 InsertInitialFormula(S, LU, LF.LUIdx); 02892 CountRegisters(LU.Formulae.back(), LF.LUIdx); 02893 } 02894 } 02895 02896 DEBUG(print_fixups(dbgs())); 02897 } 02898 02899 /// InsertInitialFormula - Insert a formula for the given expression into 02900 /// the given use, separating out loop-variant portions from loop-invariant 02901 /// and loop-computable portions. 02902 void 02903 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { 02904 Formula F; 02905 F.InitialMatch(S, L, SE); 02906 bool Inserted = InsertFormula(LU, LUIdx, F); 02907 assert(Inserted && "Initial formula already exists!"); (void)Inserted; 02908 } 02909 02910 /// InsertSupplementalFormula - Insert a simple single-register formula for 02911 /// the given expression into the given use. 02912 void 02913 LSRInstance::InsertSupplementalFormula(const SCEV *S, 02914 LSRUse &LU, size_t LUIdx) { 02915 Formula F; 02916 F.BaseRegs.push_back(S); 02917 F.HasBaseReg = true; 02918 bool Inserted = InsertFormula(LU, LUIdx, F); 02919 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; 02920 } 02921 02922 /// CountRegisters - Note which registers are used by the given formula, 02923 /// updating RegUses. 02924 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { 02925 if (F.ScaledReg) 02926 RegUses.CountRegister(F.ScaledReg, LUIdx); 02927 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 02928 E = F.BaseRegs.end(); I != E; ++I) 02929 RegUses.CountRegister(*I, LUIdx); 02930 } 02931 02932 /// InsertFormula - If the given formula has not yet been inserted, add it to 02933 /// the list, and return true. Return false otherwise. 02934 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { 02935 if (!LU.InsertFormula(F)) 02936 return false; 02937 02938 CountRegisters(F, LUIdx); 02939 return true; 02940 } 02941 02942 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of 02943 /// loop-invariant values which we're tracking. These other uses will pin these 02944 /// values in registers, making them less profitable for elimination. 02945 /// TODO: This currently misses non-constant addrec step registers. 02946 /// TODO: Should this give more weight to users inside the loop? 02947 void 02948 LSRInstance::CollectLoopInvariantFixupsAndFormulae() { 02949 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); 02950 SmallPtrSet<const SCEV *, 8> Inserted; 02951 02952 while (!Worklist.empty()) { 02953 const SCEV *S = Worklist.pop_back_val(); 02954 02955 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) 02956 Worklist.append(N->op_begin(), N->op_end()); 02957 else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) 02958 Worklist.push_back(C->getOperand()); 02959 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { 02960 Worklist.push_back(D->getLHS()); 02961 Worklist.push_back(D->getRHS()); 02962 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { 02963 if (!Inserted.insert(U)) continue; 02964 const Value *V = U->getValue(); 02965 if (const Instruction *Inst = dyn_cast<Instruction>(V)) { 02966 // Look for instructions defined outside the loop. 02967 if (L->contains(Inst)) continue; 02968 } else if (isa<UndefValue>(V)) 02969 // Undef doesn't have a live range, so it doesn't matter. 02970 continue; 02971 for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); 02972 UI != UE; ++UI) { 02973 const Instruction *UserInst = dyn_cast<Instruction>(*UI); 02974 // Ignore non-instructions. 02975 if (!UserInst) 02976 continue; 02977 // Ignore instructions in other functions (as can happen with 02978 // Constants). 02979 if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) 02980 continue; 02981 // Ignore instructions not dominated by the loop. 02982 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? 02983 UserInst->getParent() : 02984 cast<PHINode>(UserInst)->getIncomingBlock( 02985 PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); 02986 if (!DT.dominates(L->getHeader(), UseBB)) 02987 continue; 02988 // Ignore uses which are part of other SCEV expressions, to avoid 02989 // analyzing them multiple times. 02990 if (SE.isSCEVable(UserInst->getType())) { 02991 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); 02992 // If the user is a no-op, look through to its uses. 02993 if (!isa<SCEVUnknown>(UserS)) 02994 continue; 02995 if (UserS == U) { 02996 Worklist.push_back( 02997 SE.getUnknown(const_cast<Instruction *>(UserInst))); 02998 continue; 02999 } 03000 } 03001 // Ignore icmp instructions which are already being analyzed. 03002 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { 03003 unsigned OtherIdx = !UI.getOperandNo(); 03004 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); 03005 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) 03006 continue; 03007 } 03008 03009 LSRFixup &LF = getNewFixup(); 03010 LF.UserInst = const_cast<Instruction *>(UserInst); 03011 LF.OperandValToReplace = UI.getUse(); 03012 std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0); 03013 LF.LUIdx = P.first; 03014 LF.Offset = P.second; 03015 LSRUse &LU = Uses[LF.LUIdx]; 03016 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 03017 if (!LU.WidestFixupType || 03018 SE.getTypeSizeInBits(LU.WidestFixupType) < 03019 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 03020 LU.WidestFixupType = LF.OperandValToReplace->getType(); 03021 InsertSupplementalFormula(U, LU, LF.LUIdx); 03022 CountRegisters(LU.Formulae.back(), Uses.size() - 1); 03023 break; 03024 } 03025 } 03026 } 03027 } 03028 03029 /// CollectSubexprs - Split S into subexpressions which can be pulled out into 03030 /// separate registers. If C is non-null, multiply each subexpression by C. 03031 /// 03032 /// Return remainder expression after factoring the subexpressions captured by 03033 /// Ops. If Ops is complete, return NULL. 03034 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, 03035 SmallVectorImpl<const SCEV *> &Ops, 03036 const Loop *L, 03037 ScalarEvolution &SE, 03038 unsigned Depth = 0) { 03039 // Arbitrarily cap recursion to protect compile time. 03040 if (Depth >= 3) 03041 return S; 03042 03043 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 03044 // Break out add operands. 03045 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); 03046 I != E; ++I) { 03047 const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); 03048 if (Remainder) 03049 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); 03050 } 03051 return NULL; 03052 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 03053 // Split a non-zero base out of an addrec. 03054 if (AR->getStart()->isZero()) 03055 return S; 03056 03057 const SCEV *Remainder = CollectSubexprs(AR->getStart(), 03058 C, Ops, L, SE, Depth+1); 03059 // Split the non-zero AddRec unless it is part of a nested recurrence that 03060 // does not pertain to this loop. 03061 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { 03062 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); 03063 Remainder = NULL; 03064 } 03065 if (Remainder != AR->getStart()) { 03066 if (!Remainder) 03067 Remainder = SE.getConstant(AR->getType(), 0); 03068 return SE.getAddRecExpr(Remainder, 03069 AR->getStepRecurrence(SE), 03070 AR->getLoop(), 03071 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 03072 SCEV::FlagAnyWrap); 03073 } 03074 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { 03075 // Break (C * (a + b + c)) into C*a + C*b + C*c. 03076 if (Mul->getNumOperands() != 2) 03077 return S; 03078 if (const SCEVConstant *Op0 = 03079 dyn_cast<SCEVConstant>(Mul->getOperand(0))) { 03080 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; 03081 const SCEV *Remainder = 03082 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); 03083 if (Remainder) 03084 Ops.push_back(SE.getMulExpr(C, Remainder)); 03085 return NULL; 03086 } 03087 } 03088 return S; 03089 } 03090 03091 /// GenerateReassociations - Split out subexpressions from adds and the bases of 03092 /// addrecs. 03093 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, 03094 Formula Base, 03095 unsigned Depth) { 03096 // Arbitrarily cap recursion to protect compile time. 03097 if (Depth >= 3) return; 03098 03099 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { 03100 const SCEV *BaseReg = Base.BaseRegs[i]; 03101 03102 SmallVector<const SCEV *, 8> AddOps; 03103 const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); 03104 if (Remainder) 03105 AddOps.push_back(Remainder); 03106 03107 if (AddOps.size() == 1) continue; 03108 03109 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), 03110 JE = AddOps.end(); J != JE; ++J) { 03111 03112 // Loop-variant "unknown" values are uninteresting; we won't be able to 03113 // do anything meaningful with them. 03114 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) 03115 continue; 03116 03117 // Don't pull a constant into a register if the constant could be folded 03118 // into an immediate field. 03119 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 03120 LU.AccessTy, *J, Base.getNumRegs() > 1)) 03121 continue; 03122 03123 // Collect all operands except *J. 03124 SmallVector<const SCEV *, 8> InnerAddOps 03125 (((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); 03126 InnerAddOps.append 03127 (llvm::next(J), ((const SmallVector<const SCEV *, 8> &)AddOps).end()); 03128 03129 // Don't leave just a constant behind in a register if the constant could 03130 // be folded into an immediate field. 03131 if (InnerAddOps.size() == 1 && 03132 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 03133 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) 03134 continue; 03135 03136 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); 03137 if (InnerSum->isZero()) 03138 continue; 03139 Formula F = Base; 03140 03141 // Add the remaining pieces of the add back into the new formula. 03142 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); 03143 if (InnerSumSC && 03144 SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && 03145 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + 03146 InnerSumSC->getValue()->getZExtValue())) { 03147 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + 03148 InnerSumSC->getValue()->getZExtValue(); 03149 F.BaseRegs.erase(F.BaseRegs.begin() + i); 03150 } else 03151 F.BaseRegs[i] = InnerSum; 03152 03153 // Add J as its own register, or an unfolded immediate. 03154 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); 03155 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && 03156 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + 03157 SC->getValue()->getZExtValue())) 03158 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset + 03159 SC->getValue()->getZExtValue(); 03160 else 03161 F.BaseRegs.push_back(*J); 03162 03163 if (InsertFormula(LU, LUIdx, F)) 03164 // If that formula hadn't been seen before, recurse to find more like 03165 // it. 03166 GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1); 03167 } 03168 } 03169 } 03170 03171 /// GenerateCombinations - Generate a formula consisting of all of the 03172 /// loop-dominating registers added into a single register. 03173 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, 03174 Formula Base) { 03175 // This method is only interesting on a plurality of registers. 03176 if (Base.BaseRegs.size() <= 1) return; 03177 03178 Formula F = Base; 03179 F.BaseRegs.clear(); 03180 SmallVector<const SCEV *, 4> Ops; 03181 for (SmallVectorImpl<const SCEV *>::const_iterator 03182 I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { 03183 const SCEV *BaseReg = *I; 03184 if (SE.properlyDominates(BaseReg, L->getHeader()) && 03185 !SE.hasComputableLoopEvolution(BaseReg, L)) 03186 Ops.push_back(BaseReg); 03187 else 03188 F.BaseRegs.push_back(BaseReg); 03189 } 03190 if (Ops.size() > 1) { 03191 const SCEV *Sum = SE.getAddExpr(Ops); 03192 // TODO: If Sum is zero, it probably means ScalarEvolution missed an 03193 // opportunity to fold something. For now, just ignore such cases 03194 // rather than proceed with zero in a register. 03195 if (!Sum->isZero()) { 03196 F.BaseRegs.push_back(Sum); 03197 (void)InsertFormula(LU, LUIdx, F); 03198 } 03199 } 03200 } 03201 03202 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. 03203 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, 03204 Formula Base) { 03205 // We can't add a symbolic offset if the address already contains one. 03206 if (Base.BaseGV) return; 03207 03208 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { 03209 const SCEV *G = Base.BaseRegs[i]; 03210 GlobalValue *GV = ExtractSymbol(G, SE); 03211 if (G->isZero() || !GV) 03212 continue; 03213 Formula F = Base; 03214 F.BaseGV = GV; 03215 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) 03216 continue; 03217 F.BaseRegs[i] = G; 03218 (void)InsertFormula(LU, LUIdx, F); 03219 } 03220 } 03221 03222 /// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets. 03223 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, 03224 Formula Base) { 03225 // TODO: For now, just add the min and max offset, because it usually isn't 03226 // worthwhile looking at everything inbetween. 03227 SmallVector<int64_t, 2> Worklist; 03228 Worklist.push_back(LU.MinOffset); 03229 if (LU.MaxOffset != LU.MinOffset) 03230 Worklist.push_back(LU.MaxOffset); 03231 03232 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { 03233 const SCEV *G = Base.BaseRegs[i]; 03234 03235 for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), 03236 E = Worklist.end(); I != E; ++I) { 03237 Formula F = Base; 03238 F.BaseOffset = (uint64_t)Base.BaseOffset - *I; 03239 if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, 03240 LU.AccessTy, F)) { 03241 // Add the offset to the base register. 03242 const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); 03243 // If it cancelled out, drop the base register, otherwise update it. 03244 if (NewG->isZero()) { 03245 std::swap(F.BaseRegs[i], F.BaseRegs.back()); 03246 F.BaseRegs.pop_back(); 03247 } else 03248 F.BaseRegs[i] = NewG; 03249 03250 (void)InsertFormula(LU, LUIdx, F); 03251 } 03252 } 03253 03254 int64_t Imm = ExtractImmediate(G, SE); 03255 if (G->isZero() || Imm == 0) 03256 continue; 03257 Formula F = Base; 03258 F.BaseOffset = (uint64_t)F.BaseOffset + Imm; 03259 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) 03260 continue; 03261 F.BaseRegs[i] = G; 03262 (void)InsertFormula(LU, LUIdx, F); 03263 } 03264 } 03265 03266 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up 03267 /// the comparison. For example, x == y -> x*c == y*c. 03268 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, 03269 Formula Base) { 03270 if (LU.Kind != LSRUse::ICmpZero) return; 03271 03272 // Determine the integer type for the base formula. 03273 Type *IntTy = Base.getType(); 03274 if (!IntTy) return; 03275 if (SE.getTypeSizeInBits(IntTy) > 64) return; 03276 03277 // Don't do this if there is more than one offset. 03278 if (LU.MinOffset != LU.MaxOffset) return; 03279 03280 assert(!Base.BaseGV && "ICmpZero use is not legal!"); 03281 03282 // Check each interesting stride. 03283 for (SmallSetVector<int64_t, 8>::const_iterator 03284 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 03285 int64_t Factor = *I; 03286 03287 // Check that the multiplication doesn't overflow. 03288 if (Base.BaseOffset == INT64_MIN && Factor == -1) 03289 continue; 03290 int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; 03291 if (NewBaseOffset / Factor != Base.BaseOffset) 03292 continue; 03293 03294 // Check that multiplying with the use offset doesn't overflow. 03295 int64_t Offset = LU.MinOffset; 03296 if (Offset == INT64_MIN && Factor == -1) 03297 continue; 03298 Offset = (uint64_t)Offset * Factor; 03299 if (Offset / Factor != LU.MinOffset) 03300 continue; 03301 03302 Formula F = Base; 03303 F.BaseOffset = NewBaseOffset; 03304 03305 // Check that this scale is legal. 03306 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F)) 03307 continue; 03308 03309 // Compensate for the use having MinOffset built into it. 03310 F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; 03311 03312 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 03313 03314 // Check that multiplying with each base register doesn't overflow. 03315 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { 03316 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); 03317 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) 03318 goto next; 03319 } 03320 03321 // Check that multiplying with the scaled register doesn't overflow. 03322 if (F.ScaledReg) { 03323 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); 03324 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) 03325 continue; 03326 } 03327 03328 // Check that multiplying with the unfolded offset doesn't overflow. 03329 if (F.UnfoldedOffset != 0) { 03330 if (F.UnfoldedOffset == INT64_MIN && Factor == -1) 03331 continue; 03332 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; 03333 if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) 03334 continue; 03335 } 03336 03337 // If we make it here and it's legal, add it. 03338 (void)InsertFormula(LU, LUIdx, F); 03339 next:; 03340 } 03341 } 03342 03343 /// GenerateScales - Generate stride factor reuse formulae by making use of 03344 /// scaled-offset address modes, for example. 03345 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { 03346 // Determine the integer type for the base formula. 03347 Type *IntTy = Base.getType(); 03348 if (!IntTy) return; 03349 03350 // If this Formula already has a scaled register, we can't add another one. 03351 if (Base.Scale != 0) return; 03352 03353 // Check each interesting stride. 03354 for (SmallSetVector<int64_t, 8>::const_iterator 03355 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 03356 int64_t Factor = *I; 03357 03358 Base.Scale = Factor; 03359 Base.HasBaseReg = Base.BaseRegs.size() > 1; 03360 // Check whether this scale is going to be legal. 03361 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 03362 Base)) { 03363 // As a special-case, handle special out-of-loop Basic users specially. 03364 // TODO: Reconsider this special case. 03365 if (LU.Kind == LSRUse::Basic && 03366 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special, 03367 LU.AccessTy, Base) && 03368 LU.AllFixupsOutsideLoop) 03369 LU.Kind = LSRUse::Special; 03370 else 03371 continue; 03372 } 03373 // For an ICmpZero, negating a solitary base register won't lead to 03374 // new solutions. 03375 if (LU.Kind == LSRUse::ICmpZero && 03376 !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) 03377 continue; 03378 // For each addrec base reg, apply the scale, if possible. 03379 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) 03380 if (const SCEVAddRecExpr *AR = 03381 dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { 03382 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 03383 if (FactorS->isZero()) 03384 continue; 03385 // Divide out the factor, ignoring high bits, since we'll be 03386 // scaling the value back up in the end. 03387 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { 03388 // TODO: This could be optimized to avoid all the copying. 03389 Formula F = Base; 03390 F.ScaledReg = Quotient; 03391 F.DeleteBaseReg(F.BaseRegs[i]); 03392 (void)InsertFormula(LU, LUIdx, F); 03393 } 03394 } 03395 } 03396 } 03397 03398 /// GenerateTruncates - Generate reuse formulae from different IV types. 03399 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { 03400 // Don't bother truncating symbolic values. 03401 if (Base.BaseGV) return; 03402 03403 // Determine the integer type for the base formula. 03404 Type *DstTy = Base.getType(); 03405 if (!DstTy) return; 03406 DstTy = SE.getEffectiveSCEVType(DstTy); 03407 03408 for (SmallSetVector<Type *, 4>::const_iterator 03409 I = Types.begin(), E = Types.end(); I != E; ++I) { 03410 Type *SrcTy = *I; 03411 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { 03412 Formula F = Base; 03413 03414 if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); 03415 for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(), 03416 JE = F.BaseRegs.end(); J != JE; ++J) 03417 *J = SE.getAnyExtendExpr(*J, SrcTy); 03418 03419 // TODO: This assumes we've done basic processing on all uses and 03420 // have an idea what the register usage is. 03421 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses)) 03422 continue; 03423 03424 (void)InsertFormula(LU, LUIdx, F); 03425 } 03426 } 03427 } 03428 03429 namespace { 03430 03431 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to 03432 /// defer modifications so that the search phase doesn't have to worry about 03433 /// the data structures moving underneath it. 03434 struct WorkItem { 03435 size_t LUIdx; 03436 int64_t Imm; 03437 const SCEV *OrigReg; 03438 03439 WorkItem(size_t LI, int64_t I, const SCEV *R) 03440 : LUIdx(LI), Imm(I), OrigReg(R) {} 03441 03442 void print(raw_ostream &OS) const; 03443 void dump() const; 03444 }; 03445 03446 } 03447 03448 void WorkItem::print(raw_ostream &OS) const { 03449 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx 03450 << " , add offset " << Imm; 03451 } 03452 03453 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 03454 void WorkItem::dump() const { 03455 print(errs()); errs() << '\n'; 03456 } 03457 #endif 03458 03459 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant 03460 /// distance apart and try to form reuse opportunities between them. 03461 void LSRInstance::GenerateCrossUseConstantOffsets() { 03462 // Group the registers by their value without any added constant offset. 03463 typedef std::map<int64_t, const SCEV *> ImmMapTy; 03464 typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy; 03465 RegMapTy Map; 03466 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; 03467 SmallVector<const SCEV *, 8> Sequence; 03468 for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); 03469 I != E; ++I) { 03470 const SCEV *Reg = *I; 03471 int64_t Imm = ExtractImmediate(Reg, SE); 03472 std::pair<RegMapTy::iterator, bool> Pair = 03473 Map.insert(std::make_pair(Reg, ImmMapTy())); 03474 if (Pair.second) 03475 Sequence.push_back(Reg); 03476 Pair.first->second.insert(std::make_pair(Imm, *I)); 03477 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I); 03478 } 03479 03480 // Now examine each set of registers with the same base value. Build up 03481 // a list of work to do and do the work in a separate step so that we're 03482 // not adding formulae and register counts while we're searching. 03483 SmallVector<WorkItem, 32> WorkItems; 03484 SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; 03485 for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), 03486 E = Sequence.end(); I != E; ++I) { 03487 const SCEV *Reg = *I; 03488 const ImmMapTy &Imms = Map.find(Reg)->second; 03489 03490 // It's not worthwhile looking for reuse if there's only one offset. 03491 if (Imms.size() == 1) 03492 continue; 03493 03494 DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':'; 03495 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); 03496 J != JE; ++J) 03497 dbgs() << ' ' << J->first; 03498 dbgs() << '\n'); 03499 03500 // Examine each offset. 03501 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); 03502 J != JE; ++J) { 03503 const SCEV *OrigReg = J->second; 03504 03505 int64_t JImm = J->first; 03506 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); 03507 03508 if (!isa<SCEVConstant>(OrigReg) && 03509 UsedByIndicesMap[Reg].count() == 1) { 03510 DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n'); 03511 continue; 03512 } 03513 03514 // Conservatively examine offsets between this orig reg a few selected 03515 // other orig regs. 03516 ImmMapTy::const_iterator OtherImms[] = { 03517 Imms.begin(), prior(Imms.end()), 03518 Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2) 03519 }; 03520 for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { 03521 ImmMapTy::const_iterator M = OtherImms[i]; 03522 if (M == J || M == JE) continue; 03523 03524 // Compute the difference between the two. 03525 int64_t Imm = (uint64_t)JImm - M->first; 03526 for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; 03527 LUIdx = UsedByIndices.find_next(LUIdx)) 03528 // Make a memo of this use, offset, and register tuple. 03529 if (UniqueItems.insert(std::make_pair(LUIdx, Imm))) 03530 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); 03531 } 03532 } 03533 } 03534 03535 Map.clear(); 03536 Sequence.clear(); 03537 UsedByIndicesMap.clear(); 03538 UniqueItems.clear(); 03539 03540 // Now iterate through the worklist and add new formulae. 03541 for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), 03542 E = WorkItems.end(); I != E; ++I) { 03543 const WorkItem &WI = *I; 03544 size_t LUIdx = WI.LUIdx; 03545 LSRUse &LU = Uses[LUIdx]; 03546 int64_t Imm = WI.Imm; 03547 const SCEV *OrigReg = WI.OrigReg; 03548 03549 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); 03550 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); 03551 unsigned BitWidth = SE.getTypeSizeInBits(IntTy); 03552 03553 // TODO: Use a more targeted data structure. 03554 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { 03555 const Formula &F = LU.Formulae[L]; 03556 // Use the immediate in the scaled register. 03557 if (F.ScaledReg == OrigReg) { 03558 int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; 03559 // Don't create 50 + reg(-50). 03560 if (F.referencesReg(SE.getSCEV( 03561 ConstantInt::get(IntTy, -(uint64_t)Offset)))) 03562 continue; 03563 Formula NewF = F; 03564 NewF.BaseOffset = Offset; 03565 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 03566 NewF)) 03567 continue; 03568 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); 03569 03570 // If the new scale is a constant in a register, and adding the constant 03571 // value to the immediate would produce a value closer to zero than the 03572 // immediate itself, then the formula isn't worthwhile. 03573 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) 03574 if (C->getValue()->isNegative() != 03575 (NewF.BaseOffset < 0) && 03576 (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) 03577 .ule(abs64(NewF.BaseOffset))) 03578 continue; 03579 03580 // OK, looks good. 03581 (void)InsertFormula(LU, LUIdx, NewF); 03582 } else { 03583 // Use the immediate in a base register. 03584 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { 03585 const SCEV *BaseReg = F.BaseRegs[N]; 03586 if (BaseReg != OrigReg) 03587 continue; 03588 Formula NewF = F; 03589 NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; 03590 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, 03591 LU.Kind, LU.AccessTy, NewF)) { 03592 if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) 03593 continue; 03594 NewF = F; 03595 NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; 03596 } 03597 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); 03598 03599 // If the new formula has a constant in a register, and adding the 03600 // constant value to the immediate would produce a value closer to 03601 // zero than the immediate itself, then the formula isn't worthwhile. 03602 for (SmallVectorImpl<const SCEV *>::const_iterator 03603 J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); 03604 J != JE; ++J) 03605 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) 03606 if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( 03607 abs64(NewF.BaseOffset)) && 03608 (C->getValue()->getValue() + 03609 NewF.BaseOffset).countTrailingZeros() >= 03610 CountTrailingZeros_64(NewF.BaseOffset)) 03611 goto skip_formula; 03612 03613 // Ok, looks good. 03614 (void)InsertFormula(LU, LUIdx, NewF); 03615 break; 03616 skip_formula:; 03617 } 03618 } 03619 } 03620 } 03621 } 03622 03623 /// GenerateAllReuseFormulae - Generate formulae for each use. 03624 void 03625 LSRInstance::GenerateAllReuseFormulae() { 03626 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan 03627 // queries are more precise. 03628 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03629 LSRUse &LU = Uses[LUIdx]; 03630 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03631 GenerateReassociations(LU, LUIdx, LU.Formulae[i]); 03632 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03633 GenerateCombinations(LU, LUIdx, LU.Formulae[i]); 03634 } 03635 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03636 LSRUse &LU = Uses[LUIdx]; 03637 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03638 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]); 03639 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03640 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]); 03641 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03642 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]); 03643 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03644 GenerateScales(LU, LUIdx, LU.Formulae[i]); 03645 } 03646 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03647 LSRUse &LU = Uses[LUIdx]; 03648 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03649 GenerateTruncates(LU, LUIdx, LU.Formulae[i]); 03650 } 03651 03652 GenerateCrossUseConstantOffsets(); 03653 03654 DEBUG(dbgs() << "\n" 03655 "After generating reuse formulae:\n"; 03656 print_uses(dbgs())); 03657 } 03658 03659 /// If there are multiple formulae with the same set of registers used 03660 /// by other uses, pick the best one and delete the others. 03661 void LSRInstance::FilterOutUndesirableDedicatedRegisters() { 03662 DenseSet<const SCEV *> VisitedRegs; 03663 SmallPtrSet<const SCEV *, 16> Regs; 03664 SmallPtrSet<const SCEV *, 16> LoserRegs; 03665 #ifndef NDEBUG 03666 bool ChangedFormulae = false; 03667 #endif 03668 03669 // Collect the best formula for each unique set of shared registers. This 03670 // is reset for each use. 03671 typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> 03672 BestFormulaeTy; 03673 BestFormulaeTy BestFormulae; 03674 03675 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03676 LSRUse &LU = Uses[LUIdx]; 03677 DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); 03678 03679 bool Any = false; 03680 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); 03681 FIdx != NumForms; ++FIdx) { 03682 Formula &F = LU.Formulae[FIdx]; 03683 03684 // Some formulas are instant losers. For example, they may depend on 03685 // nonexistent AddRecs from other loops. These need to be filtered 03686 // immediately, otherwise heuristics could choose them over others leading 03687 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here 03688 // avoids the need to recompute this information across formulae using the 03689 // same bad AddRec. Passing LoserRegs is also essential unless we remove 03690 // the corresponding bad register from the Regs set. 03691 Cost CostF; 03692 Regs.clear(); 03693 CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, 03694 &LoserRegs); 03695 if (CostF.isLoser()) { 03696 // During initial formula generation, undesirable formulae are generated 03697 // by uses within other loops that have some non-trivial address mode or 03698 // use the postinc form of the IV. LSR needs to provide these formulae 03699 // as the basis of rediscovering the desired formula that uses an AddRec 03700 // corresponding to the existing phi. Once all formulae have been 03701 // generated, these initial losers may be pruned. 03702 DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); 03703 dbgs() << "\n"); 03704 } 03705 else { 03706 SmallVector<const SCEV *, 4> Key; 03707 for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), 03708 JE = F.BaseRegs.end(); J != JE; ++J) { 03709 const SCEV *Reg = *J; 03710 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) 03711 Key.push_back(Reg); 03712 } 03713 if (F.ScaledReg && 03714 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) 03715 Key.push_back(F.ScaledReg); 03716 // Unstable sort by host order ok, because this is only used for 03717 // uniquifying. 03718 std::sort(Key.begin(), Key.end()); 03719 03720 std::pair<BestFormulaeTy::const_iterator, bool> P = 03721 BestFormulae.insert(std::make_pair(Key, FIdx)); 03722 if (P.second) 03723 continue; 03724 03725 Formula &Best = LU.Formulae[P.first->second]; 03726 03727 Cost CostBest; 03728 Regs.clear(); 03729 CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); 03730 if (CostF < CostBest) 03731 std::swap(F, Best); 03732 DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); 03733 dbgs() << "\n" 03734 " in favor of formula "; Best.print(dbgs()); 03735 dbgs() << '\n'); 03736 } 03737 #ifndef NDEBUG 03738 ChangedFormulae = true; 03739 #endif 03740 LU.DeleteFormula(F); 03741 --FIdx; 03742 --NumForms; 03743 Any = true; 03744 } 03745 03746 // Now that we've filtered out some formulae, recompute the Regs set. 03747 if (Any) 03748 LU.RecomputeRegs(LUIdx, RegUses); 03749 03750 // Reset this to prepare for the next use. 03751 BestFormulae.clear(); 03752 } 03753 03754 DEBUG(if (ChangedFormulae) { 03755 dbgs() << "\n" 03756 "After filtering out undesirable candidates:\n"; 03757 print_uses(dbgs()); 03758 }); 03759 } 03760 03761 // This is a rough guess that seems to work fairly well. 03762 static const size_t ComplexityLimit = UINT16_MAX; 03763 03764 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of 03765 /// solutions the solver might have to consider. It almost never considers 03766 /// this many solutions because it prune the search space, but the pruning 03767 /// isn't always sufficient. 03768 size_t LSRInstance::EstimateSearchSpaceComplexity() const { 03769 size_t Power = 1; 03770 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), 03771 E = Uses.end(); I != E; ++I) { 03772 size_t FSize = I->Formulae.size(); 03773 if (FSize >= ComplexityLimit) { 03774 Power = ComplexityLimit; 03775 break; 03776 } 03777 Power *= FSize; 03778 if (Power >= ComplexityLimit) 03779 break; 03780 } 03781 return Power; 03782 } 03783 03784 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset 03785 /// of the registers of another formula, it won't help reduce register 03786 /// pressure (though it may not necessarily hurt register pressure); remove 03787 /// it to simplify the system. 03788 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { 03789 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 03790 DEBUG(dbgs() << "The search space is too complex.\n"); 03791 03792 DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " 03793 "which use a superset of registers used by other " 03794 "formulae.\n"); 03795 03796 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03797 LSRUse &LU = Uses[LUIdx]; 03798 bool Any = false; 03799 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 03800 Formula &F = LU.Formulae[i]; 03801 // Look for a formula with a constant or GV in a register. If the use 03802 // also has a formula with that same value in an immediate field, 03803 // delete the one that uses a register. 03804 for (SmallVectorImpl<const SCEV *>::const_iterator 03805 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { 03806 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { 03807 Formula NewF = F; 03808 NewF.BaseOffset += C->getValue()->getSExtValue(); 03809 NewF.BaseRegs.erase(NewF.BaseRegs.begin() + 03810 (I - F.BaseRegs.begin())); 03811 if (LU.HasFormulaWithSameRegs(NewF)) { 03812 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); 03813 LU.DeleteFormula(F); 03814 --i; 03815 --e; 03816 Any = true; 03817 break; 03818 } 03819 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { 03820 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) 03821 if (!F.BaseGV) { 03822 Formula NewF = F; 03823 NewF.BaseGV = GV; 03824 NewF.BaseRegs.erase(NewF.BaseRegs.begin() + 03825 (I - F.BaseRegs.begin())); 03826 if (LU.HasFormulaWithSameRegs(NewF)) { 03827 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); 03828 dbgs() << '\n'); 03829 LU.DeleteFormula(F); 03830 --i; 03831 --e; 03832 Any = true; 03833 break; 03834 } 03835 } 03836 } 03837 } 03838 } 03839 if (Any) 03840 LU.RecomputeRegs(LUIdx, RegUses); 03841 } 03842 03843 DEBUG(dbgs() << "After pre-selection:\n"; 03844 print_uses(dbgs())); 03845 } 03846 } 03847 03848 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers 03849 /// for expressions like A, A+1, A+2, etc., allocate a single register for 03850 /// them. 03851 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { 03852 if (EstimateSearchSpaceComplexity() < ComplexityLimit) 03853 return; 03854 03855 DEBUG(dbgs() << "The search space is too complex.\n" 03856 "Narrowing the search space by assuming that uses separated " 03857 "by a constant offset will use the same registers.\n"); 03858 03859 // This is especially useful for unrolled loops. 03860 03861 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03862 LSRUse &LU = Uses[LUIdx]; 03863 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), 03864 E = LU.Formulae.end(); I != E; ++I) { 03865 const Formula &F = *I; 03866 if (F.BaseOffset == 0 || F.Scale != 0) 03867 continue; 03868 03869 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); 03870 if (!LUThatHas) 03871 continue; 03872 03873 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false, 03874 LU.Kind, LU.AccessTy)) 03875 continue; 03876 03877 DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); 03878 03879 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; 03880 03881 // Update the relocs to reference the new use. 03882 for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), 03883 E = Fixups.end(); I != E; ++I) { 03884 LSRFixup &Fixup = *I; 03885 if (Fixup.LUIdx == LUIdx) { 03886 Fixup.LUIdx = LUThatHas - &Uses.front(); 03887 Fixup.Offset += F.BaseOffset; 03888 // Add the new offset to LUThatHas' offset list. 03889 if (LUThatHas->Offsets.back() != Fixup.Offset) { 03890 LUThatHas->Offsets.push_back(Fixup.Offset); 03891 if (Fixup.Offset > LUThatHas->MaxOffset) 03892 LUThatHas->MaxOffset = Fixup.Offset; 03893 if (Fixup.Offset < LUThatHas->MinOffset) 03894 LUThatHas->MinOffset = Fixup.Offset; 03895 } 03896 DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); 03897 } 03898 if (Fixup.LUIdx == NumUses-1) 03899 Fixup.LUIdx = LUIdx; 03900 } 03901 03902 // Delete formulae from the new use which are no longer legal. 03903 bool Any = false; 03904 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { 03905 Formula &F = LUThatHas->Formulae[i]; 03906 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, 03907 LUThatHas->Kind, LUThatHas->AccessTy, F)) { 03908 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); 03909 dbgs() << '\n'); 03910 LUThatHas->DeleteFormula(F); 03911 --i; 03912 --e; 03913 Any = true; 03914 } 03915 } 03916 03917 if (Any) 03918 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); 03919 03920 // Delete the old use. 03921 DeleteUse(LU, LUIdx); 03922 --LUIdx; 03923 --NumUses; 03924 break; 03925 } 03926 } 03927 03928 DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); 03929 } 03930 03931 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call 03932 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that 03933 /// we've done more filtering, as it may be able to find more formulae to 03934 /// eliminate. 03935 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ 03936 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 03937 DEBUG(dbgs() << "The search space is too complex.\n"); 03938 03939 DEBUG(dbgs() << "Narrowing the search space by re-filtering out " 03940 "undesirable dedicated registers.\n"); 03941 03942 FilterOutUndesirableDedicatedRegisters(); 03943 03944 DEBUG(dbgs() << "After pre-selection:\n"; 03945 print_uses(dbgs())); 03946 } 03947 } 03948 03949 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely 03950 /// to be profitable, and then in any use which has any reference to that 03951 /// register, delete all formulae which do not reference that register. 03952 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { 03953 // With all other options exhausted, loop until the system is simple 03954 // enough to handle. 03955 SmallPtrSet<const SCEV *, 4> Taken; 03956 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 03957 // Ok, we have too many of formulae on our hands to conveniently handle. 03958 // Use a rough heuristic to thin out the list. 03959 DEBUG(dbgs() << "The search space is too complex.\n"); 03960 03961 // Pick the register which is used by the most LSRUses, which is likely 03962 // to be a good reuse register candidate. 03963 const SCEV *Best = 0; 03964 unsigned BestNum = 0; 03965 for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); 03966 I != E; ++I) { 03967 const SCEV *Reg = *I; 03968 if (Taken.count(Reg)) 03969 continue; 03970 if (!Best) 03971 Best = Reg; 03972 else { 03973 unsigned Count = RegUses.getUsedByIndices(Reg).count(); 03974 if (Count > BestNum) { 03975 Best = Reg; 03976 BestNum = Count; 03977 } 03978 } 03979 } 03980 03981 DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best 03982 << " will yield profitable reuse.\n"); 03983 Taken.insert(Best); 03984 03985 // In any use with formulae which references this register, delete formulae 03986 // which don't reference it. 03987 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03988 LSRUse &LU = Uses[LUIdx]; 03989 if (!LU.Regs.count(Best)) continue; 03990 03991 bool Any = false; 03992 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 03993 Formula &F = LU.Formulae[i]; 03994 if (!F.referencesReg(Best)) { 03995 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); 03996 LU.DeleteFormula(F); 03997 --e; 03998 --i; 03999 Any = true; 04000 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?"); 04001 continue; 04002 } 04003 } 04004 04005 if (Any) 04006 LU.RecomputeRegs(LUIdx, RegUses); 04007 } 04008 04009 DEBUG(dbgs() << "After pre-selection:\n"; 04010 print_uses(dbgs())); 04011 } 04012 } 04013 04014 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of 04015 /// formulae to choose from, use some rough heuristics to prune down the number 04016 /// of formulae. This keeps the main solver from taking an extraordinary amount 04017 /// of time in some worst-case scenarios. 04018 void LSRInstance::NarrowSearchSpaceUsingHeuristics() { 04019 NarrowSearchSpaceByDetectingSupersets(); 04020 NarrowSearchSpaceByCollapsingUnrolledCode(); 04021 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 04022 NarrowSearchSpaceByPickingWinnerRegs(); 04023 } 04024 04025 /// SolveRecurse - This is the recursive solver. 04026 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 04027 Cost &SolutionCost, 04028 SmallVectorImpl<const Formula *> &Workspace, 04029 const Cost &CurCost, 04030 const SmallPtrSet<const SCEV *, 16> &CurRegs, 04031 DenseSet<const SCEV *> &VisitedRegs) const { 04032 // Some ideas: 04033 // - prune more: 04034 // - use more aggressive filtering 04035 // - sort the formula so that the most profitable solutions are found first 04036 // - sort the uses too 04037 // - search faster: 04038 // - don't compute a cost, and then compare. compare while computing a cost 04039 // and bail early. 04040 // - track register sets with SmallBitVector 04041 04042 const LSRUse &LU = Uses[Workspace.size()]; 04043 04044 // If this use references any register that's already a part of the 04045 // in-progress solution, consider it a requirement that a formula must 04046 // reference that register in order to be considered. This prunes out 04047 // unprofitable searching. 04048 SmallSetVector<const SCEV *, 4> ReqRegs; 04049 for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(), 04050 E = CurRegs.end(); I != E; ++I) 04051 if (LU.Regs.count(*I)) 04052 ReqRegs.insert(*I); 04053 04054 SmallPtrSet<const SCEV *, 16> NewRegs; 04055 Cost NewCost; 04056 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), 04057 E = LU.Formulae.end(); I != E; ++I) { 04058 const Formula &F = *I; 04059 04060 // Ignore formulae which do not use any of the required registers. 04061 bool SatisfiedReqReg = true; 04062 for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), 04063 JE = ReqRegs.end(); J != JE; ++J) { 04064 const SCEV *Reg = *J; 04065 if ((!F.ScaledReg || F.ScaledReg != Reg) && 04066 std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) == 04067 F.BaseRegs.end()) { 04068 SatisfiedReqReg = false; 04069 break; 04070 } 04071 } 04072 if (!SatisfiedReqReg) { 04073 // If none of the formulae satisfied the required registers, then we could 04074 // clear ReqRegs and try again. Currently, we simply give up in this case. 04075 continue; 04076 } 04077 04078 // Evaluate the cost of the current formula. If it's already worse than 04079 // the current best, prune the search at that point. 04080 NewCost = CurCost; 04081 NewRegs = CurRegs; 04082 NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); 04083 if (NewCost < SolutionCost) { 04084 Workspace.push_back(&F); 04085 if (Workspace.size() != Uses.size()) { 04086 SolveRecurse(Solution, SolutionCost, Workspace, NewCost, 04087 NewRegs, VisitedRegs); 04088 if (F.getNumRegs() == 1 && Workspace.size() == 1) 04089 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]); 04090 } else { 04091 DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); 04092 dbgs() << ".\n Regs:"; 04093 for (SmallPtrSet<const SCEV *, 16>::const_iterator 04094 I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I) 04095 dbgs() << ' ' << **I; 04096 dbgs() << '\n'); 04097 04098 SolutionCost = NewCost; 04099 Solution = Workspace; 04100 } 04101 Workspace.pop_back(); 04102 } 04103 } 04104 } 04105 04106 /// Solve - Choose one formula from each use. Return the results in the given 04107 /// Solution vector. 04108 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { 04109 SmallVector<const Formula *, 8> Workspace; 04110 Cost SolutionCost; 04111 SolutionCost.Loose(); 04112 Cost CurCost; 04113 SmallPtrSet<const SCEV *, 16> CurRegs; 04114 DenseSet<const SCEV *> VisitedRegs; 04115 Workspace.reserve(Uses.size()); 04116 04117 // SolveRecurse does all the work. 04118 SolveRecurse(Solution, SolutionCost, Workspace, CurCost, 04119 CurRegs, VisitedRegs); 04120 if (Solution.empty()) { 04121 DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); 04122 return; 04123 } 04124 04125 // Ok, we've now made all our decisions. 04126 DEBUG(dbgs() << "\n" 04127 "The chosen solution requires "; SolutionCost.print(dbgs()); 04128 dbgs() << ":\n"; 04129 for (size_t i = 0, e = Uses.size(); i != e; ++i) { 04130 dbgs() << " "; 04131 Uses[i].print(dbgs()); 04132 dbgs() << "\n" 04133 " "; 04134 Solution[i]->print(dbgs()); 04135 dbgs() << '\n'; 04136 }); 04137 04138 assert(Solution.size() == Uses.size() && "Malformed solution!"); 04139 } 04140 04141 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up 04142 /// the dominator tree far as we can go while still being dominated by the 04143 /// input positions. This helps canonicalize the insert position, which 04144 /// encourages sharing. 04145 BasicBlock::iterator 04146 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, 04147 const SmallVectorImpl<Instruction *> &Inputs) 04148 const { 04149 for (;;) { 04150 const Loop *IPLoop = LI.getLoopFor(IP->getParent()); 04151 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; 04152 04153 BasicBlock *IDom; 04154 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { 04155 if (!Rung) return IP; 04156 Rung = Rung->getIDom(); 04157 if (!Rung) return IP; 04158 IDom = Rung->getBlock(); 04159 04160 // Don't climb into a loop though. 04161 const Loop *IDomLoop = LI.getLoopFor(IDom); 04162 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; 04163 if (IDomDepth <= IPLoopDepth && 04164 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) 04165 break; 04166 } 04167 04168 bool AllDominate = true; 04169 Instruction *BetterPos = 0; 04170 Instruction *Tentative = IDom->getTerminator(); 04171 for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), 04172 E = Inputs.end(); I != E; ++I) { 04173 Instruction *Inst = *I; 04174 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { 04175 AllDominate = false; 04176 break; 04177 } 04178 // Attempt to find an insert position in the middle of the block, 04179 // instead of at the end, so that it can be used for other expansions. 04180 if (IDom == Inst->getParent() && 04181 (!BetterPos || !DT.dominates(Inst, BetterPos))) 04182 BetterPos = llvm::next(BasicBlock::iterator(Inst)); 04183 } 04184 if (!AllDominate) 04185 break; 04186 if (BetterPos) 04187 IP = BetterPos; 04188 else 04189 IP = Tentative; 04190 } 04191 04192 return IP; 04193 } 04194 04195 /// AdjustInsertPositionForExpand - Determine an input position which will be 04196 /// dominated by the operands and which will dominate the result. 04197 BasicBlock::iterator 04198 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, 04199 const LSRFixup &LF, 04200 const LSRUse &LU, 04201 SCEVExpander &Rewriter) const { 04202 // Collect some instructions which must be dominated by the 04203 // expanding replacement. These must be dominated by any operands that 04204 // will be required in the expansion. 04205 SmallVector<Instruction *, 4> Inputs; 04206 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) 04207 Inputs.push_back(I); 04208 if (LU.Kind == LSRUse::ICmpZero) 04209 if (Instruction *I = 04210 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) 04211 Inputs.push_back(I); 04212 if (LF.PostIncLoops.count(L)) { 04213 if (LF.isUseFullyOutsideLoop(L)) 04214 Inputs.push_back(L->getLoopLatch()->getTerminator()); 04215 else 04216 Inputs.push_back(IVIncInsertPos); 04217 } 04218 // The expansion must also be dominated by the increment positions of any 04219 // loops it for which it is using post-inc mode. 04220 for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), 04221 E = LF.PostIncLoops.end(); I != E; ++I) { 04222 const Loop *PIL = *I; 04223 if (PIL == L) continue; 04224 04225 // Be dominated by the loop exit. 04226 SmallVector<BasicBlock *, 4> ExitingBlocks; 04227 PIL->getExitingBlocks(ExitingBlocks); 04228 if (!ExitingBlocks.empty()) { 04229 BasicBlock *BB = ExitingBlocks[0]; 04230 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) 04231 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); 04232 Inputs.push_back(BB->getTerminator()); 04233 } 04234 } 04235 04236 assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) 04237 && !isa<DbgInfoIntrinsic>(LowestIP) && 04238 "Insertion point must be a normal instruction"); 04239 04240 // Then, climb up the immediate dominator tree as far as we can go while 04241 // still being dominated by the input positions. 04242 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs); 04243 04244 // Don't insert instructions before PHI nodes. 04245 while (isa<PHINode>(IP)) ++IP; 04246 04247 // Ignore landingpad instructions. 04248 while (isa<LandingPadInst>(IP)) ++IP; 04249 04250 // Ignore debug intrinsics. 04251 while (isa<DbgInfoIntrinsic>(IP)) ++IP; 04252 04253 // Set IP below instructions recently inserted by SCEVExpander. This keeps the 04254 // IP consistent across expansions and allows the previously inserted 04255 // instructions to be reused by subsequent expansion. 04256 while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; 04257 04258 return IP; 04259 } 04260 04261 /// Expand - Emit instructions for the leading candidate expression for this 04262 /// LSRUse (this is called "expanding"). 04263 Value *LSRInstance::Expand(const LSRFixup &LF, 04264 const Formula &F, 04265 BasicBlock::iterator IP, 04266 SCEVExpander &Rewriter, 04267 SmallVectorImpl<WeakVH> &DeadInsts) const { 04268 const LSRUse &LU = Uses[LF.LUIdx]; 04269 04270 // Determine an input position which will be dominated by the operands and 04271 // which will dominate the result. 04272 IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); 04273 04274 // Inform the Rewriter if we have a post-increment use, so that it can 04275 // perform an advantageous expansion. 04276 Rewriter.setPostInc(LF.PostIncLoops); 04277 04278 // This is the type that the user actually needs. 04279 Type *OpTy = LF.OperandValToReplace->getType(); 04280 // This will be the type that we'll initially expand to. 04281 Type *Ty = F.getType(); 04282 if (!Ty) 04283 // No type known; just expand directly to the ultimate type. 04284 Ty = OpTy; 04285 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) 04286 // Expand directly to the ultimate type if it's the right size. 04287 Ty = OpTy; 04288 // This is the type to do integer arithmetic in. 04289 Type *IntTy = SE.getEffectiveSCEVType(Ty); 04290 04291 // Build up a list of operands to add together to form the full base. 04292 SmallVector<const SCEV *, 8> Ops; 04293 04294 // Expand the BaseRegs portion. 04295 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 04296 E = F.BaseRegs.end(); I != E; ++I) { 04297 const SCEV *Reg = *I; 04298 assert(!Reg->isZero() && "Zero allocated in a base register!"); 04299 04300 // If we're expanding for a post-inc user, make the post-inc adjustment. 04301 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 04302 Reg = TransformForPostIncUse(Denormalize, Reg, 04303 LF.UserInst, LF.OperandValToReplace, 04304 Loops, SE, DT); 04305 04306 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); 04307 } 04308 04309 // Expand the ScaledReg portion. 04310 Value *ICmpScaledV = 0; 04311 if (F.Scale != 0) { 04312 const SCEV *ScaledS = F.ScaledReg; 04313 04314 // If we're expanding for a post-inc user, make the post-inc adjustment. 04315 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 04316 ScaledS = TransformForPostIncUse(Denormalize, ScaledS, 04317 LF.UserInst, LF.OperandValToReplace, 04318 Loops, SE, DT); 04319 04320 if (LU.Kind == LSRUse::ICmpZero) { 04321 // An interesting way of "folding" with an icmp is to use a negated 04322 // scale, which we'll implement by inserting it into the other operand 04323 // of the icmp. 04324 assert(F.Scale == -1 && 04325 "The only scale supported by ICmpZero uses is -1!"); 04326 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); 04327 } else { 04328 // Otherwise just expand the scaled register and an explicit scale, 04329 // which is expected to be matched as part of the address. 04330 04331 // Flush the operand list to suppress SCEVExpander hoisting address modes. 04332 if (!Ops.empty() && LU.Kind == LSRUse::Address) { 04333 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 04334 Ops.clear(); 04335 Ops.push_back(SE.getUnknown(FullV)); 04336 } 04337 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); 04338 ScaledS = SE.getMulExpr(ScaledS, 04339 SE.getConstant(ScaledS->getType(), F.Scale)); 04340 Ops.push_back(ScaledS); 04341 } 04342 } 04343 04344 // Expand the GV portion. 04345 if (F.BaseGV) { 04346 // Flush the operand list to suppress SCEVExpander hoisting. 04347 if (!Ops.empty()) { 04348 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 04349 Ops.clear(); 04350 Ops.push_back(SE.getUnknown(FullV)); 04351 } 04352 Ops.push_back(SE.getUnknown(F.BaseGV)); 04353 } 04354 04355 // Flush the operand list to suppress SCEVExpander hoisting of both folded and 04356 // unfolded offsets. LSR assumes they both live next to their uses. 04357 if (!Ops.empty()) { 04358 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 04359 Ops.clear(); 04360 Ops.push_back(SE.getUnknown(FullV)); 04361 } 04362 04363 // Expand the immediate portion. 04364 int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; 04365 if (Offset != 0) { 04366 if (LU.Kind == LSRUse::ICmpZero) { 04367 // The other interesting way of "folding" with an ICmpZero is to use a 04368 // negated immediate. 04369 if (!ICmpScaledV) 04370 ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); 04371 else { 04372 Ops.push_back(SE.getUnknown(ICmpScaledV)); 04373 ICmpScaledV = ConstantInt::get(IntTy, Offset); 04374 } 04375 } else { 04376 // Just add the immediate values. These again are expected to be matched 04377 // as part of the address. 04378 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); 04379 } 04380 } 04381 04382 // Expand the unfolded offset portion. 04383 int64_t UnfoldedOffset = F.UnfoldedOffset; 04384 if (UnfoldedOffset != 0) { 04385 // Just add the immediate values. 04386 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, 04387 UnfoldedOffset))); 04388 } 04389 04390 // Emit instructions summing all the operands. 04391 const SCEV *FullS = Ops.empty() ? 04392 SE.getConstant(IntTy, 0) : 04393 SE.getAddExpr(Ops); 04394 Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); 04395 04396 // We're done expanding now, so reset the rewriter. 04397 Rewriter.clearPostInc(); 04398 04399 // An ICmpZero Formula represents an ICmp which we're handling as a 04400 // comparison against zero. Now that we've expanded an expression for that 04401 // form, update the ICmp's other operand. 04402 if (LU.Kind == LSRUse::ICmpZero) { 04403 ICmpInst *CI = cast<ICmpInst>(LF.UserInst); 04404 DeadInsts.push_back(CI->getOperand(1)); 04405 assert(!F.BaseGV && "ICmp does not support folding a global value and " 04406 "a scale at the same time!"); 04407 if (F.Scale == -1) { 04408 if (ICmpScaledV->getType() != OpTy) { 04409 Instruction *Cast = 04410 CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, 04411 OpTy, false), 04412 ICmpScaledV, OpTy, "tmp", CI); 04413 ICmpScaledV = Cast; 04414 } 04415 CI->setOperand(1, ICmpScaledV); 04416 } else { 04417 assert(F.Scale == 0 && 04418 "ICmp does not support folding a global value and " 04419 "a scale at the same time!"); 04420 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), 04421 -(uint64_t)Offset); 04422 if (C->getType() != OpTy) 04423 C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, 04424 OpTy, false), 04425 C, OpTy); 04426 04427 CI->setOperand(1, C); 04428 } 04429 } 04430 04431 return FullV; 04432 } 04433 04434 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use 04435 /// of their operands effectively happens in their predecessor blocks, so the 04436 /// expression may need to be expanded in multiple places. 04437 void LSRInstance::RewriteForPHI(PHINode *PN, 04438 const LSRFixup &LF, 04439 const Formula &F, 04440 SCEVExpander &Rewriter, 04441 SmallVectorImpl<WeakVH> &DeadInsts, 04442 Pass *P) const { 04443 DenseMap<BasicBlock *, Value *> Inserted; 04444 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 04445 if (PN->getIncomingValue(i) == LF.OperandValToReplace) { 04446 BasicBlock *BB = PN->getIncomingBlock(i); 04447 04448 // If this is a critical edge, split the edge so that we do not insert 04449 // the code on all predecessor/successor paths. We do this unless this 04450 // is the canonical backedge for this loop, which complicates post-inc 04451 // users. 04452 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && 04453 !isa<IndirectBrInst>(BB->getTerminator())) { 04454 BasicBlock *Parent = PN->getParent(); 04455 Loop *PNLoop = LI.getLoopFor(Parent); 04456 if (!PNLoop || Parent != PNLoop->getHeader()) { 04457 // Split the critical edge. 04458 BasicBlock *NewBB = 0; 04459 if (!Parent->isLandingPad()) { 04460 NewBB = SplitCriticalEdge(BB, Parent, P, 04461 /*MergeIdenticalEdges=*/true, 04462 /*DontDeleteUselessPhis=*/true); 04463 } else { 04464 SmallVector<BasicBlock*, 2> NewBBs; 04465 SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); 04466 NewBB = NewBBs[0]; 04467 } 04468 // If NewBB==NULL, then SplitCriticalEdge refused to split because all 04469 // phi predecessors are identical. The simple thing to do is skip 04470 // splitting in this case rather than complicate the API. 04471 if (NewBB) { 04472 // If PN is outside of the loop and BB is in the loop, we want to 04473 // move the block to be immediately before the PHI block, not 04474 // immediately after BB. 04475 if (L->contains(BB) && !L->contains(PN)) 04476 NewBB->moveBefore(PN->getParent()); 04477 04478 // Splitting the edge can reduce the number of PHI entries we have. 04479 e = PN->getNumIncomingValues(); 04480 BB = NewBB; 04481 i = PN->getBasicBlockIndex(BB); 04482 } 04483 } 04484 } 04485 04486 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = 04487 Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); 04488 if (!Pair.second) 04489 PN->setIncomingValue(i, Pair.first->second); 04490 else { 04491 Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); 04492 04493 // If this is reuse-by-noop-cast, insert the noop cast. 04494 Type *OpTy = LF.OperandValToReplace->getType(); 04495 if (FullV->getType() != OpTy) 04496 FullV = 04497 CastInst::Create(CastInst::getCastOpcode(FullV, false, 04498 OpTy, false), 04499 FullV, LF.OperandValToReplace->getType(), 04500 "tmp", BB->getTerminator()); 04501 04502 PN->setIncomingValue(i, FullV); 04503 Pair.first->second = FullV; 04504 } 04505 } 04506 } 04507 04508 /// Rewrite - Emit instructions for the leading candidate expression for this 04509 /// LSRUse (this is called "expanding"), and update the UserInst to reference 04510 /// the newly expanded value. 04511 void LSRInstance::Rewrite(const LSRFixup &LF, 04512 const Formula &F, 04513 SCEVExpander &Rewriter, 04514 SmallVectorImpl<WeakVH> &DeadInsts, 04515 Pass *P) const { 04516 // First, find an insertion point that dominates UserInst. For PHI nodes, 04517 // find the nearest block which dominates all the relevant uses. 04518 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { 04519 RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); 04520 } else { 04521 Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); 04522 04523 // If this is reuse-by-noop-cast, insert the noop cast. 04524 Type *OpTy = LF.OperandValToReplace->getType(); 04525 if (FullV->getType() != OpTy) { 04526 Instruction *Cast = 04527 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), 04528 FullV, OpTy, "tmp", LF.UserInst); 04529 FullV = Cast; 04530 } 04531 04532 // Update the user. ICmpZero is handled specially here (for now) because 04533 // Expand may have updated one of the operands of the icmp already, and 04534 // its new value may happen to be equal to LF.OperandValToReplace, in 04535 // which case doing replaceUsesOfWith leads to replacing both operands 04536 // with the same value. TODO: Reorganize this. 04537 if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero) 04538 LF.UserInst->setOperand(0, FullV); 04539 else 04540 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV); 04541 } 04542 04543 DeadInsts.push_back(LF.OperandValToReplace); 04544 } 04545 04546 /// ImplementSolution - Rewrite all the fixup locations with new values, 04547 /// following the chosen solution. 04548 void 04549 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, 04550 Pass *P) { 04551 // Keep track of instructions we may have made dead, so that 04552 // we can remove them after we are done working. 04553 SmallVector<WeakVH, 16> DeadInsts; 04554 04555 SCEVExpander Rewriter(SE, "lsr"); 04556 #ifndef NDEBUG 04557 Rewriter.setDebugType(DEBUG_TYPE); 04558 #endif 04559 Rewriter.disableCanonicalMode(); 04560 Rewriter.enableLSRMode(); 04561 Rewriter.setIVIncInsertPos(L, IVIncInsertPos); 04562 04563 // Mark phi nodes that terminate chains so the expander tries to reuse them. 04564 for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), 04565 ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { 04566 if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) 04567 Rewriter.setChainedPhi(PN); 04568 } 04569 04570 // Expand the new value definitions and update the users. 04571 for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), 04572 E = Fixups.end(); I != E; ++I) { 04573 const LSRFixup &Fixup = *I; 04574 04575 Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); 04576 04577 Changed = true; 04578 } 04579 04580 for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), 04581 ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { 04582 GenerateIVChain(*ChainI, Rewriter, DeadInsts); 04583 Changed = true; 04584 } 04585 // Clean up after ourselves. This must be done before deleting any 04586 // instructions. 04587 Rewriter.clear(); 04588 04589 Changed |= DeleteTriviallyDeadInstructions(DeadInsts); 04590 } 04591 04592 LSRInstance::LSRInstance(Loop *L, Pass *P) 04593 : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), 04594 DT(P->getAnalysis<DominatorTree>()), LI(P->getAnalysis<LoopInfo>()), 04595 TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), 04596 IVIncInsertPos(0) { 04597 // If LoopSimplify form is not available, stay out of trouble. 04598 if (!L->isLoopSimplifyForm()) 04599 return; 04600 04601 // If there's no interesting work to be done, bail early. 04602 if (IU.empty()) return; 04603 04604 // If there's too much analysis to be done, bail early. We won't be able to 04605 // model the problem anyway. 04606 unsigned NumUsers = 0; 04607 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 04608 if (++NumUsers > MaxIVUsers) { 04609 DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L 04610 << "\n"); 04611 return; 04612 } 04613 } 04614 04615 #ifndef NDEBUG 04616 // All dominating loops must have preheaders, or SCEVExpander may not be able 04617 // to materialize an AddRecExpr whose Start is an outer AddRecExpr. 04618 // 04619 // IVUsers analysis should only create users that are dominated by simple loop 04620 // headers. Since this loop should dominate all of its users, its user list 04621 // should be empty if this loop itself is not within a simple loop nest. 04622 for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); 04623 Rung; Rung = Rung->getIDom()) { 04624 BasicBlock *BB = Rung->getBlock(); 04625 const Loop *DomLoop = LI.getLoopFor(BB); 04626 if (DomLoop && DomLoop->getHeader() == BB) { 04627 assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); 04628 } 04629 } 04630 #endif // DEBUG 04631 04632 DEBUG(dbgs() << "\nLSR on loop "; 04633 WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); 04634 dbgs() << ":\n"); 04635 04636 // First, perform some low-level loop optimizations. 04637 OptimizeShadowIV(); 04638 OptimizeLoopTermCond(); 04639 04640 // If loop preparation eliminates all interesting IV users, bail. 04641 if (IU.empty()) return; 04642 04643 // Skip nested loops until we can model them better with formulae. 04644 if (!L->empty()) { 04645 DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); 04646 return; 04647 } 04648 04649 // Start collecting data and preparing for the solver. 04650 CollectChains(); 04651 CollectInterestingTypesAndFactors(); 04652 CollectFixupsAndInitialFormulae(); 04653 CollectLoopInvariantFixupsAndFormulae(); 04654 04655 assert(!Uses.empty() && "IVUsers reported at least one use"); 04656 DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; 04657 print_uses(dbgs())); 04658 04659 // Now use the reuse data to generate a bunch of interesting ways 04660 // to formulate the values needed for the uses. 04661 GenerateAllReuseFormulae(); 04662 04663 FilterOutUndesirableDedicatedRegisters(); 04664 NarrowSearchSpaceUsingHeuristics(); 04665 04666 SmallVector<const Formula *, 8> Solution; 04667 Solve(Solution); 04668 04669 // Release memory that is no longer needed. 04670 Factors.clear(); 04671 Types.clear(); 04672 RegUses.clear(); 04673 04674 if (Solution.empty()) 04675 return; 04676 04677 #ifndef NDEBUG 04678 // Formulae should be legal. 04679 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end(); 04680 I != E; ++I) { 04681 const LSRUse &LU = *I; 04682 for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), 04683 JE = LU.Formulae.end(); 04684 J != JE; ++J) 04685 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 04686 *J) && "Illegal formula generated!"); 04687 }; 04688 #endif 04689 04690 // Now that we've decided what we want, make it so. 04691 ImplementSolution(Solution, P); 04692 } 04693 04694 void LSRInstance::print_factors_and_types(raw_ostream &OS) const { 04695 if (Factors.empty() && Types.empty()) return; 04696 04697 OS << "LSR has identified the following interesting factors and types: "; 04698 bool First = true; 04699 04700 for (SmallSetVector<int64_t, 8>::const_iterator 04701 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 04702 if (!First) OS << ", "; 04703 First = false; 04704 OS << '*' << *I; 04705 } 04706 04707 for (SmallSetVector<Type *, 4>::const_iterator 04708 I = Types.begin(), E = Types.end(); I != E; ++I) { 04709 if (!First) OS << ", "; 04710 First = false; 04711 OS << '(' << **I << ')'; 04712 } 04713 OS << '\n'; 04714 } 04715 04716 void LSRInstance::print_fixups(raw_ostream &OS) const { 04717 OS << "LSR is examining the following fixup sites:\n"; 04718 for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), 04719 E = Fixups.end(); I != E; ++I) { 04720 dbgs() << " "; 04721 I->print(OS); 04722 OS << '\n'; 04723 } 04724 } 04725 04726 void LSRInstance::print_uses(raw_ostream &OS) const { 04727 OS << "LSR is examining the following uses:\n"; 04728 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), 04729 E = Uses.end(); I != E; ++I) { 04730 const LSRUse &LU = *I; 04731 dbgs() << " "; 04732 LU.print(OS); 04733 OS << '\n'; 04734 for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), 04735 JE = LU.Formulae.end(); J != JE; ++J) { 04736 OS << " "; 04737 J->print(OS); 04738 OS << '\n'; 04739 } 04740 } 04741 } 04742 04743 void LSRInstance::print(raw_ostream &OS) const { 04744 print_factors_and_types(OS); 04745 print_fixups(OS); 04746 print_uses(OS); 04747 } 04748 04749 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 04750 void LSRInstance::dump() const { 04751 print(errs()); errs() << '\n'; 04752 } 04753 #endif 04754 04755 namespace { 04756 04757 class LoopStrengthReduce : public LoopPass { 04758 public: 04759 static char ID; // Pass ID, replacement for typeid 04760 LoopStrengthReduce(); 04761 04762 private: 04763 bool runOnLoop(Loop *L, LPPassManager &LPM); 04764 void getAnalysisUsage(AnalysisUsage &AU) const; 04765 }; 04766 04767 } 04768 04769 char LoopStrengthReduce::ID = 0; 04770 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", 04771 "Loop Strength Reduction", false, false) 04772 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 04773 INITIALIZE_PASS_DEPENDENCY(DominatorTree) 04774 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 04775 INITIALIZE_PASS_DEPENDENCY(IVUsers) 04776 INITIALIZE_PASS_DEPENDENCY(LoopInfo) 04777 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 04778 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", 04779 "Loop Strength Reduction", false, false) 04780 04781 04782 Pass *llvm::createLoopStrengthReducePass() { 04783 return new LoopStrengthReduce(); 04784 } 04785 04786 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) { 04787 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); 04788 } 04789 04790 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { 04791 // We split critical edges, so we change the CFG. However, we do update 04792 // many analyses if they are around. 04793 AU.addPreservedID(LoopSimplifyID); 04794 04795 AU.addRequired<LoopInfo>(); 04796 AU.addPreserved<LoopInfo>(); 04797 AU.addRequiredID(LoopSimplifyID); 04798 AU.addRequired<DominatorTree>(); 04799 AU.addPreserved<DominatorTree>(); 04800 AU.addRequired<ScalarEvolution>(); 04801 AU.addPreserved<ScalarEvolution>(); 04802 // Requiring LoopSimplify a second time here prevents IVUsers from running 04803 // twice, since LoopSimplify was invalidated by running ScalarEvolution. 04804 AU.addRequiredID(LoopSimplifyID); 04805 AU.addRequired<IVUsers>(); 04806 AU.addPreserved<IVUsers>(); 04807 AU.addRequired<TargetTransformInfo>(); 04808 } 04809 04810 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { 04811 bool Changed = false; 04812 04813 // Run the main LSR transformation. 04814 Changed |= LSRInstance(L, this).getChanged(); 04815 04816 // Remove any extra phis created by processing inner loops. 04817 Changed |= DeleteDeadPHIs(L->getHeader()); 04818 if (EnablePhiElim && L->isLoopSimplifyForm()) { 04819 SmallVector<WeakVH, 16> DeadInsts; 04820 SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); 04821 #ifndef NDEBUG 04822 Rewriter.setDebugType(DEBUG_TYPE); 04823 #endif 04824 unsigned numFolded = 04825 Rewriter.replaceCongruentIVs(L, &getAnalysis<DominatorTree>(), 04826 DeadInsts, 04827 &getAnalysis<TargetTransformInfo>()); 04828 if (numFolded) { 04829 Changed = true; 04830 DeleteTriviallyDeadInstructions(DeadInsts); 04831 DeleteDeadPHIs(L->getHeader()); 04832 } 04833 } 04834 return Changed; 04835 }