LLVM 4.0.0
LoopStrengthReduce.cpp
1 //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This transformation analyzes and transforms the induction variables (and
11 // computations derived from them) into forms suitable for efficient execution
12 // on the target.
13 //
14 // This pass performs a strength reduction on array references inside loops that
15 // have the loop induction variable as one or more of their components. It
16 // rewrites expressions to take advantage of scaled-index addressing modes
17 // available on the target, and it performs a variety of other optimizations
18 // related to loop induction variables.
19 //
20 // Terminology note: this code has a lot of handling for "post-increment" or
21 // "post-inc" users. This is not talking about post-increment addressing modes;
22 // it is instead talking about code like this:
23 //
24 // %i = phi [ 0, %entry ], [ %i.next, %latch ]
25 // ...
26 // %i.next = add %i, 1
27 // %c = icmp eq %i.next, %n
28 //
29 // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
30 // it's useful to think about these as the same register, with some uses using
31 // the value of the register before the add and some using it after. In this
32 // example, the icmp is a post-increment user, since it uses %i.next, which is
33 // the value of the induction variable after the increment. The other common
34 // case of post-increment users is users outside the loop.
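//
// For example (an illustrative sketch, not code from this file), a value
// computed in the loop's exit block from %i.next:
//
//   %sum = add %i.next, %base
//
// is also a post-increment user, since it sees the induction variable's value
// after the increment.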
35 //
36 // TODO: More sophistication in the way Formulae are generated and filtered.
37 //
38 // TODO: Handle multiple loops at a time.
39 //
40 // TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
41 // of a GlobalValue?
42 //
43 // TODO: When truncation is free, truncate ICmp users' operands to make it a
44 // smaller encoding (on x86 at least).
45 //
46 // TODO: When a negated register is used by an add (such as in a list of
47 // multiple base registers, or as the increment expression in an addrec),
48 // we may not actually need both reg and (-1 * reg) in registers; the
49 // negation can be implemented by using a sub instead of an add. The
50 // lack of support for taking this into consideration when making
51 // register pressure decisions is partly worked around by the "Special"
52 // use kind.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
57 #include "llvm/ADT/APInt.h"
58 #include "llvm/ADT/DenseMap.h"
59 #include "llvm/ADT/DenseSet.h"
60 #include "llvm/ADT/Hashing.h"
61 #include "llvm/ADT/PointerIntPair.h"
62 #include "llvm/ADT/STLExtras.h"
63 #include "llvm/ADT/SetVector.h"
64 #include "llvm/ADT/SmallBitVector.h"
65 #include "llvm/ADT/SmallPtrSet.h"
66 #include "llvm/ADT/SmallSet.h"
67 #include "llvm/ADT/SmallVector.h"
68 #include "llvm/Analysis/IVUsers.h"
69 #include "llvm/Analysis/LoopInfo.h"
70 #include "llvm/Analysis/LoopPass.h"
71 #include "llvm/Analysis/ScalarEvolution.h"
72 #include "llvm/Analysis/ScalarEvolutionExpander.h"
73 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
74 #include "llvm/Analysis/ScalarEvolutionNormalization.h"
75 #include "llvm/Analysis/TargetTransformInfo.h"
76 #include "llvm/IR/BasicBlock.h"
77 #include "llvm/IR/Constant.h"
78 #include "llvm/IR/Constants.h"
79 #include "llvm/IR/DerivedTypes.h"
80 #include "llvm/IR/Dominators.h"
81 #include "llvm/IR/GlobalValue.h"
82 #include "llvm/IR/IRBuilder.h"
83 #include "llvm/IR/Instruction.h"
84 #include "llvm/IR/Instructions.h"
85 #include "llvm/IR/IntrinsicInst.h"
86 #include "llvm/IR/Module.h"
87 #include "llvm/IR/OperandTraits.h"
88 #include "llvm/IR/Operator.h"
89 #include "llvm/IR/Type.h"
90 #include "llvm/IR/Value.h"
91 #include "llvm/IR/ValueHandle.h"
92 #include "llvm/Pass.h"
93 #include "llvm/Support/Casting.h"
94 #include "llvm/Support/CommandLine.h"
95 #include "llvm/Support/Compiler.h"
96 #include "llvm/Support/Debug.h"
97 #include "llvm/Support/ErrorHandling.h"
98 #include "llvm/Support/MathExtras.h"
99 #include "llvm/Support/raw_ostream.h"
100 #include "llvm/Transforms/Scalar.h"
101 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
102 #include "llvm/Transforms/Utils/Local.h"
104 #include <algorithm>
105 #include <cassert>
106 #include <cstddef>
107 #include <cstdint>
108 #include <cstdlib>
109 #include <iterator>
110 #include <map>
111 #include <tuple>
112 #include <utility>
113 
114 using namespace llvm;
115 
116 #define DEBUG_TYPE "loop-reduce"
117 
118 /// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
119 /// bail out. This threshold is far beyond the number of users that LSR can
120 /// conceivably solve, so it should not affect generated code, but catches the
121 /// worst cases before LSR burns too much compile time and stack space.
122 static const unsigned MaxIVUsers = 200;
123 
124 // Temporary flag to cleanup congruent phis after LSR phi expansion.
125 // It's currently disabled until we can determine whether it's truly useful or
126 // not. The flag should be removed after the v3.0 release.
127 // This is now needed for ivchains.
128 static cl::opt<bool> EnablePhiElim(
129  "enable-lsr-phielim", cl::Hidden, cl::init(true),
130  cl::desc("Enable LSR phi elimination"));
131 
132 #ifndef NDEBUG
133 // Stress test IV chain generation.
134 static cl::opt<bool> StressIVChain(
135  "stress-ivchain", cl::Hidden, cl::init(false),
136  cl::desc("Stress test LSR IV chains"));
137 #else
138 static bool StressIVChain = false;
139 #endif
140 
141 namespace {
142 
143 struct MemAccessTy {
144  /// Used in situations where the accessed memory type is unknown.
145  static const unsigned UnknownAddressSpace = ~0u;
146 
147  Type *MemTy;
148  unsigned AddrSpace;
149 
150  MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
151 
152  MemAccessTy(Type *Ty, unsigned AS) :
153  MemTy(Ty), AddrSpace(AS) {}
154 
155  bool operator==(MemAccessTy Other) const {
156  return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
157  }
158 
159  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
160 
161  static MemAccessTy getUnknown(LLVMContext &Ctx,
162  unsigned AS = UnknownAddressSpace) {
163  return MemAccessTy(Type::getVoidTy(Ctx), AS);
164  }
165 };
166 
167 /// This class holds data which is used to order reuse candidates.
168 class RegSortData {
169 public:
170  /// This represents the set of LSRUse indices which reference
171  /// a particular register.
172  SmallBitVector UsedByIndices;
173 
174  void print(raw_ostream &OS) const;
175  void dump() const;
176 };
177 
178 } // end anonymous namespace
179 
180 void RegSortData::print(raw_ostream &OS) const {
181  OS << "[NumUses=" << UsedByIndices.count() << ']';
182 }
183 
184 LLVM_DUMP_METHOD
185 void RegSortData::dump() const {
186  print(errs()); errs() << '\n';
187 }
188 
189 namespace {
190 
191 /// Map register candidates to information about how they are used.
192 class RegUseTracker {
193  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
194 
195  RegUsesTy RegUsesMap;
196  SmallVector<const SCEV *, 16> RegSequence;
197 
198 public:
199  void countRegister(const SCEV *Reg, size_t LUIdx);
200  void dropRegister(const SCEV *Reg, size_t LUIdx);
201  void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
202 
203  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
204 
205  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
206 
207  void clear();
208 
209  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
210  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
211  iterator begin() { return RegSequence.begin(); }
212  iterator end() { return RegSequence.end(); }
213  const_iterator begin() const { return RegSequence.begin(); }
214  const_iterator end() const { return RegSequence.end(); }
215 };
216 
217 } // end anonymous namespace
218 
219 void
220 RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
221  std::pair<RegUsesTy::iterator, bool> Pair =
222  RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
223  RegSortData &RSD = Pair.first->second;
224  if (Pair.second)
225  RegSequence.push_back(Reg);
226  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
227  RSD.UsedByIndices.set(LUIdx);
228 }
229 
230 void
231 RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
232  RegUsesTy::iterator It = RegUsesMap.find(Reg);
233  assert(It != RegUsesMap.end());
234  RegSortData &RSD = It->second;
235  assert(RSD.UsedByIndices.size() > LUIdx);
236  RSD.UsedByIndices.reset(LUIdx);
237 }
238 
239 void
240 RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
241  assert(LUIdx <= LastLUIdx);
242 
243  // Update RegUses. The data structure is not optimized for this purpose;
244  // we must iterate through it and update each of the bit vectors.
245  for (auto &Pair : RegUsesMap) {
246  SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
247  if (LUIdx < UsedByIndices.size())
248  UsedByIndices[LUIdx] =
249  LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
250  UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
251  }
252 }
253 
254 bool
255 RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
256  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
257  if (I == RegUsesMap.end())
258  return false;
259  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
260  int i = UsedByIndices.find_first();
261  if (i == -1) return false;
262  if ((size_t)i != LUIdx) return true;
263  return UsedByIndices.find_next(i) != -1;
264 }
265 
266 const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
267  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
268  assert(I != RegUsesMap.end() && "Unknown register!");
269  return I->second.UsedByIndices;
270 }
271 
272 void RegUseTracker::clear() {
273  RegUsesMap.clear();
274  RegSequence.clear();
275 }
276 
277 namespace {
278 
279 /// This class holds information that describes a formula for computing a value
280 /// that satisfies a use. It may include broken-out immediates and scaled registers.
281 struct Formula {
282  /// Global base address used for complex addressing.
283  GlobalValue *BaseGV;
284 
285  /// Base offset for complex addressing.
286  int64_t BaseOffset;
287 
288  /// Whether any complex addressing has a base register.
289  bool HasBaseReg;
290 
291  /// The scale of any complex addressing.
292  int64_t Scale;
293 
294  /// The list of "base" registers for this use. When this is non-empty, the
295  /// canonical representation of a formula is:
296  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
297  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
298  /// #1 enforces that the scaled register is always used when at least two
299  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
300  /// #2 enforces that 1 * reg is reg.
301  /// This invariant can be temporarily broken while building a formula.
302  /// However, every formula inserted into the LSRInstance must be in canonical
303  /// form.
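  /// For example (illustrative only): the sum reg1 + reg2 is kept canonically
  /// as BaseRegs = {reg1}, ScaledReg = reg2, Scale = 1, while a lone reg1 is
  /// kept as BaseRegs = {reg1}, ScaledReg = null, Scale = 0.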
304  SmallVector<const SCEV *, 4> BaseRegs;
305 
306  /// The 'scaled' register for this use. This should be non-null when Scale is
307  /// not zero.
308  const SCEV *ScaledReg;
309 
310  /// An additional constant offset which is added near the use. This requires a
311  /// temporary register, but the offset itself can live in an add immediate
312  /// field rather than a register.
313  int64_t UnfoldedOffset;
314 
315  Formula()
316  : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
317  ScaledReg(nullptr), UnfoldedOffset(0) {}
318 
319  void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
320 
321  bool isCanonical() const;
322 
323  void canonicalize();
324 
325  bool unscale();
326 
327  size_t getNumRegs() const;
328  Type *getType() const;
329 
330  void deleteBaseReg(const SCEV *&S);
331 
332  bool referencesReg(const SCEV *S) const;
333  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
334  const RegUseTracker &RegUses) const;
335 
336  void print(raw_ostream &OS) const;
337  void dump() const;
338 };
339 
340 } // end anonymous namespace
341 
342 /// Recursion helper for initialMatch.
343 static void DoInitialMatch(const SCEV *S, Loop *L,
344  SmallVectorImpl<const SCEV *> &Good,
345  SmallVectorImpl<const SCEV *> &Bad,
346  ScalarEvolution &SE) {
347  // Collect expressions which properly dominate the loop header.
348  if (SE.properlyDominates(S, L->getHeader())) {
349  Good.push_back(S);
350  return;
351  }
352 
353  // Look at add operands.
354  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
355  for (const SCEV *S : Add->operands())
356  DoInitialMatch(S, L, Good, Bad, SE);
357  return;
358  }
359 
360  // Look at addrec operands.
361  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
362  if (!AR->getStart()->isZero() && AR->isAffine()) {
363  DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
364  DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
365  AR->getStepRecurrence(SE),
366  // FIXME: AR->getNoWrapFlags()
367  AR->getLoop(), SCEV::FlagAnyWrap),
368  L, Good, Bad, SE);
369  return;
370  }
371 
372  // Handle a multiplication by -1 (negation) if it didn't fold.
373  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
374  if (Mul->getOperand(0)->isAllOnesValue()) {
375  SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
376  const SCEV *NewMul = SE.getMulExpr(Ops);
377 
378  SmallVector<const SCEV *, 4> MyGood;
379  SmallVector<const SCEV *, 4> MyBad;
380  DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
381  const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
382  SE.getEffectiveSCEVType(NewMul->getType())));
383  for (const SCEV *S : MyGood)
384  Good.push_back(SE.getMulExpr(NegOne, S));
385  for (const SCEV *S : MyBad)
386  Bad.push_back(SE.getMulExpr(NegOne, S));
387  return;
388  }
389 
390  // Ok, we can't do anything interesting. Just stuff the whole thing into a
391  // register and hope for the best.
392  Bad.push_back(S);
393 }
394 
395 /// Incorporate loop-variant parts of S into this Formula, attempting to keep
396 /// all loop-invariant and loop-computable values in a single base register.
397 void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
398  SmallVector<const SCEV *, 4> Good;
399  SmallVector<const SCEV *, 4> Bad;
400  DoInitialMatch(S, L, Good, Bad, SE);
401  if (!Good.empty()) {
402  const SCEV *Sum = SE.getAddExpr(Good);
403  if (!Sum->isZero())
404  BaseRegs.push_back(Sum);
405  HasBaseReg = true;
406  }
407  if (!Bad.empty()) {
408  const SCEV *Sum = SE.getAddExpr(Bad);
409  if (!Sum->isZero())
410  BaseRegs.push_back(Sum);
411  HasBaseReg = true;
412  }
413  canonicalize();
414 }
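// For example (an illustrative sketch): for S = {%base,+,4}<%L>, DoInitialMatch
// classifies the loop-invariant start %base as "good" and the zero-based addrec
// {0,+,4}<%L> as "bad", so each lands in its own base register here before
// canonicalize() promotes one of them to ScaledReg.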
415 
416 /// \brief Check whether or not this formula satisfies the canonical
417 /// representation.
418 /// \see Formula::BaseRegs.
419 bool Formula::isCanonical() const {
420  if (ScaledReg)
421  return Scale != 1 || !BaseRegs.empty();
422  return BaseRegs.size() <= 1;
423 }
424 
425 /// \brief Helper method to morph a formula into its canonical representation.
426 /// \see Formula::BaseRegs.
427 /// Every formula having more than one base register must use the ScaledReg
428 /// field. Otherwise, we would have to do special cases everywhere in LSR
429 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
430 /// On the other hand, 1*reg should be canonicalized into reg.
431 void Formula::canonicalize() {
432  if (isCanonical())
433  return;
434  // So far we did not need this case. This is easy to implement but it is
435  // useless to maintain dead code. Besides, it could hurt compile time.
436  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
437  // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
438  ScaledReg = BaseRegs.back();
439  BaseRegs.pop_back();
440  Scale = 1;
441  size_t BaseRegsSize = BaseRegs.size();
442  size_t Try = 0;
443  // If ScaledReg is an invariant, try to find a variant expression.
444  while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
445  std::swap(ScaledReg, BaseRegs[Try++]);
446 }
447 
448 /// \brief Get rid of the scale in the formula.
449 /// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
450 /// \return true if it was possible to get rid of the scale, false otherwise.
451 /// \note After this operation the formula may not be in the canonical form.
452 bool Formula::unscale() {
453  if (Scale != 1)
454  return false;
455  Scale = 0;
456  BaseRegs.push_back(ScaledReg);
457  ScaledReg = nullptr;
458  return true;
459 }
460 
461 /// Return the total number of register operands used by this formula. This does
462 /// not include register uses implied by non-constant addrec strides.
463 size_t Formula::getNumRegs() const {
464  return !!ScaledReg + BaseRegs.size();
465 }
466 
467 /// Return the type of this formula, if it has one, or null otherwise. This type
468 /// is meaningless except for the bit size.
469 Type *Formula::getType() const {
470  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
471  ScaledReg ? ScaledReg->getType() :
472  BaseGV ? BaseGV->getType() :
473  nullptr;
474 }
475 
476 /// Delete the given base reg from the BaseRegs list.
477 void Formula::deleteBaseReg(const SCEV *&S) {
478  if (&S != &BaseRegs.back())
479  std::swap(S, BaseRegs.back());
480  BaseRegs.pop_back();
481 }
482 
483 /// Test if this formula references the given register.
484 bool Formula::referencesReg(const SCEV *S) const {
485  return S == ScaledReg || is_contained(BaseRegs, S);
486 }
487 
488 /// Test whether this formula uses registers which are used by uses other than
489 /// the use with the given index.
490 bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
491  const RegUseTracker &RegUses) const {
492  if (ScaledReg)
493  if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
494  return true;
495  for (const SCEV *BaseReg : BaseRegs)
496  if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
497  return true;
498  return false;
499 }
500 
501 void Formula::print(raw_ostream &OS) const {
502  bool First = true;
503  if (BaseGV) {
504  if (!First) OS << " + "; else First = false;
505  BaseGV->printAsOperand(OS, /*PrintType=*/false);
506  }
507  if (BaseOffset != 0) {
508  if (!First) OS << " + "; else First = false;
509  OS << BaseOffset;
510  }
511  for (const SCEV *BaseReg : BaseRegs) {
512  if (!First) OS << " + "; else First = false;
513  OS << "reg(" << *BaseReg << ')';
514  }
515  if (HasBaseReg && BaseRegs.empty()) {
516  if (!First) OS << " + "; else First = false;
517  OS << "**error: HasBaseReg**";
518  } else if (!HasBaseReg && !BaseRegs.empty()) {
519  if (!First) OS << " + "; else First = false;
520  OS << "**error: !HasBaseReg**";
521  }
522  if (Scale != 0) {
523  if (!First) OS << " + "; else First = false;
524  OS << Scale << "*reg(";
525  if (ScaledReg)
526  OS << *ScaledReg;
527  else
528  OS << "<unknown>";
529  OS << ')';
530  }
531  if (UnfoldedOffset != 0) {
532  if (!First) OS << " + ";
533  OS << "imm(" << UnfoldedOffset << ')';
534  }
535 }
536 
537 LLVM_DUMP_METHOD
538 void Formula::dump() const {
539  print(errs()); errs() << '\n';
540 }
541 
542 /// Return true if the given addrec can be sign-extended without changing its
543 /// value.
544 static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
545  Type *WideTy =
546  IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
547  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
548 }
549 
550 /// Return true if the given add can be sign-extended without changing its
551 /// value.
552 static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
553  Type *WideTy =
554  IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
555  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
556 }
557 
558 /// Return true if the given mul can be sign-extended without changing its
559 /// value.
560 static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
561  Type *WideTy =
562  IntegerType::get(SE.getContext(),
563  SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
564  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
565 }
566 
567 /// Return an expression for LHS /s RHS, if it can be determined and if the
568 /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
569 /// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that
570 /// the multiplication may overflow, which is useful when the result will be
571 /// used in a context where the most significant bits are ignored.
572 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
573  ScalarEvolution &SE,
574  bool IgnoreSignificantBits = false) {
575  // Handle the trivial case, which works for any SCEV type.
576  if (LHS == RHS)
577  return SE.getConstant(LHS->getType(), 1);
578 
579  // Handle a few RHS special cases.
580  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
581  if (RC) {
582  const APInt &RA = RC->getAPInt();
583  // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
584  // some folding.
585  if (RA.isAllOnesValue())
586  return SE.getMulExpr(LHS, RC);
587  // Handle x /s 1 as x.
588  if (RA == 1)
589  return LHS;
590  }
591 
592  // Check for a division of a constant by a constant.
593  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
594  if (!RC)
595  return nullptr;
596  const APInt &LA = C->getAPInt();
597  const APInt &RA = RC->getAPInt();
598  if (LA.srem(RA) != 0)
599  return nullptr;
600  return SE.getConstant(LA.sdiv(RA));
601  }
602 
603  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
604  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
605  if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
606  const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
607  IgnoreSignificantBits);
608  if (!Step) return nullptr;
609  const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
610  IgnoreSignificantBits);
611  if (!Start) return nullptr;
612  // FlagNW is independent of the start value, step direction, and is
613  // preserved with smaller magnitude steps.
614  // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
615  return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
616  }
617  return nullptr;
618  }
619 
620  // Distribute the sdiv over add operands, if the add doesn't overflow.
621  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
622  if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
623  SmallVector<const SCEV *, 8> Ops;
624  for (const SCEV *S : Add->operands()) {
625  const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
626  if (!Op) return nullptr;
627  Ops.push_back(Op);
628  }
629  return SE.getAddExpr(Ops);
630  }
631  return nullptr;
632  }
633 
634  // Check for a multiply operand that we can pull RHS out of.
635  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
636  if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
638  bool Found = false;
639  for (const SCEV *S : Mul->operands()) {
640  if (!Found)
641  if (const SCEV *Q = getExactSDiv(S, RHS, SE,
642  IgnoreSignificantBits)) {
643  S = Q;
644  Found = true;
645  }
646  Ops.push_back(S);
647  }
648  return Found ? SE.getMulExpr(Ops) : nullptr;
649  }
650  return nullptr;
651  }
652 
653  // Otherwise we don't know.
654  return nullptr;
655 }
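// For example (illustrative): getExactSDiv({0,+,4}<%L>, 4, SE) distributes the
// division over the addrec and yields {0,+,1}<%L>, whereas dividing an opaque
// value such as %n by 4 yields null because the remainder cannot be shown to be
// zero.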
656 
657 /// If S involves the addition of a constant integer value, return that integer
658 /// value, and mutate S to point to a new SCEV with that value excluded.
659 static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
660  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
661  if (C->getAPInt().getMinSignedBits() <= 64) {
662  S = SE.getConstant(C->getType(), 0);
663  return C->getValue()->getSExtValue();
664  }
665  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
666  SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
667  int64_t Result = ExtractImmediate(NewOps.front(), SE);
668  if (Result != 0)
669  S = SE.getAddExpr(NewOps);
670  return Result;
671  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
672  SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
673  int64_t Result = ExtractImmediate(NewOps.front(), SE);
674  if (Result != 0)
675  S = SE.getAddRecExpr(NewOps, AR->getLoop(),
676  // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
677  SCEV::FlagAnyWrap);
678  return Result;
679  }
680  return 0;
681 }
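// For example (illustrative): given S = (16 + %base), ExtractImmediate returns
// 16 and rewrites S to %base; for an expression with no constant addend it
// returns 0 and leaves S unchanged.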
682 
683 /// If S involves the addition of a GlobalValue address, return that symbol, and
684 /// mutate S to point to a new SCEV with that value excluded.
685 static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
686  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
687  if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
688  S = SE.getConstant(GV->getType(), 0);
689  return GV;
690  }
691  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
692  SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
693  GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
694  if (Result)
695  S = SE.getAddExpr(NewOps);
696  return Result;
697  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
698  SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
699  GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
700  if (Result)
701  S = SE.getAddRecExpr(NewOps, AR->getLoop(),
702  // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
703  SCEV::FlagAnyWrap);
704  return Result;
705  }
706  return nullptr;
707 }
708 
709 /// Returns true if the specified instruction is using the specified value as an
710 /// address.
711 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
712  bool isAddress = isa<LoadInst>(Inst);
713  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
714  if (SI->getOperand(1) == OperandVal)
715  isAddress = true;
716  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
717  // Addressing modes can also be folded into prefetches and a variety
718  // of intrinsics.
719  switch (II->getIntrinsicID()) {
720  default: break;
721  case Intrinsic::prefetch:
722  if (II->getArgOperand(0) == OperandVal)
723  isAddress = true;
724  break;
725  }
726  }
727  return isAddress;
728 }
729 
730 /// Return the type of the memory being accessed.
731 static MemAccessTy getAccessType(const Instruction *Inst) {
732  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
733  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
734  AccessTy.MemTy = SI->getOperand(0)->getType();
735  AccessTy.AddrSpace = SI->getPointerAddressSpace();
736  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
737  AccessTy.AddrSpace = LI->getPointerAddressSpace();
738  }
739 
740  // All pointers have the same requirements, so canonicalize them to an
741  // arbitrary pointer type to minimize variation.
742  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
743  AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
744  PTy->getAddressSpace());
745 
746  return AccessTy;
747 }
748 
749 /// Return true if this AddRec is already a phi in its loop.
750 static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
751  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
752  PHINode *PN = dyn_cast<PHINode>(I); ++I) {
753  if (SE.isSCEVable(PN->getType()) &&
754  (SE.getEffectiveSCEVType(PN->getType()) ==
755  SE.getEffectiveSCEVType(AR->getType())) &&
756  SE.getSCEV(PN) == AR)
757  return true;
758  }
759  return false;
760 }
761 
762 /// Check if expanding this expression is likely to incur significant cost. This
763 /// is tricky because SCEV doesn't track which expressions are actually computed
764 /// by the current IR.
765 ///
766 /// We currently allow expansion of IV increments that involve adds,
767 /// multiplication by constants, and AddRecs from existing phis.
768 ///
769 /// TODO: Allow UDivExpr if we can find an existing IV increment that is an
770 /// obvious multiple of the UDivExpr.
771 static bool isHighCostExpansion(const SCEV *S,
772  SmallPtrSetImpl<const SCEV*> &Processed,
773  ScalarEvolution &SE) {
774  // Zero/One operand expressions
775  switch (S->getSCEVType()) {
776  case scUnknown:
777  case scConstant:
778  return false;
779  case scTruncate:
780  return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
781  Processed, SE);
782  case scZeroExtend:
783  return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
784  Processed, SE);
785  case scSignExtend:
786  return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
787  Processed, SE);
788  }
789 
790  if (!Processed.insert(S).second)
791  return false;
792 
793  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
794  for (const SCEV *S : Add->operands()) {
795  if (isHighCostExpansion(S, Processed, SE))
796  return true;
797  }
798  return false;
799  }
800 
801  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
802  if (Mul->getNumOperands() == 2) {
803  // Multiplication by a constant is ok
804  if (isa<SCEVConstant>(Mul->getOperand(0)))
805  return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
806 
807  // If we have the value of one operand, check if an existing
808  // multiplication already generates this expression.
809  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
810  Value *UVal = U->getValue();
811  for (User *UR : UVal->users()) {
812  // If U is a constant, it may be used by a ConstantExpr.
813  Instruction *UI = dyn_cast<Instruction>(UR);
814  if (UI && UI->getOpcode() == Instruction::Mul &&
815  SE.isSCEVable(UI->getType())) {
816  return SE.getSCEV(UI) == Mul;
817  }
818  }
819  }
820  }
821  }
822 
823  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
824  if (isExistingPhi(AR, SE))
825  return false;
826  }
827 
828  // For now, consider any other type of expression (div/mul/min/max) high cost.
829  return true;
830 }
831 
832 /// If any of the instructions in the specified set are trivially dead, delete
833 /// them and see if this makes any of their operands subsequently dead.
834 static bool
835 DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
836  bool Changed = false;
837 
838  while (!DeadInsts.empty()) {
839  Value *V = DeadInsts.pop_back_val();
840  Instruction *I = dyn_cast_or_null<Instruction>(V);
841 
842  if (!I || !isInstructionTriviallyDead(I))
843  continue;
844 
845  for (Use &O : I->operands())
846  if (Instruction *U = dyn_cast<Instruction>(O)) {
847  O = nullptr;
848  if (U->use_empty())
849  DeadInsts.emplace_back(U);
850  }
851 
852  I->eraseFromParent();
853  Changed = true;
854  }
855 
856  return Changed;
857 }
858 
859 namespace {
860 
861 class LSRUse;
862 
863 } // end anonymous namespace
864 
865 /// \brief Check if the addressing mode defined by \p F is completely
866 /// folded in \p LU at isel time.
867 /// This includes address-mode folding and special icmp tricks.
868 /// This function returns true if \p LU can accommodate what \p F
869 /// defines and up to 1 base + 1 scaled + offset.
870 /// In other words, if \p F has several base registers, this function may
871 /// still return true. Therefore, users still need to account for
872 /// additional base registers and/or unfolded offsets to derive an
873 /// accurate cost model.
874 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
875  const LSRUse &LU, const Formula &F);
876 // Get the cost of the scaling factor used in F for LU.
877 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
878  const LSRUse &LU, const Formula &F);
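// For example (illustrative): on a target with reg + scale*reg + imm addressing
// (such as x86), a formula like reg1 + 4*reg2 + 8 can be folded entirely into
// the memory operand of a load or store, so it requires no separate add
// instructions inside the loop.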
879 
880 namespace {
881 
882 /// This class is used to measure and compare candidate formulae.
883 class Cost {
884  /// TODO: Some of these could be merged. Also, a lexical ordering
885  /// isn't always optimal.
886  unsigned NumRegs;
887  unsigned AddRecCost;
888  unsigned NumIVMuls;
889  unsigned NumBaseAdds;
890  unsigned ImmCost;
891  unsigned SetupCost;
892  unsigned ScaleCost;
893 
894 public:
895  Cost()
896  : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
897  SetupCost(0), ScaleCost(0) {}
898 
899  bool operator<(const Cost &Other) const;
900 
901  void Lose();
902 
903 #ifndef NDEBUG
904  // Once any of the metrics loses, they must all remain losers.
905  bool isValid() {
906  return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
907  | ImmCost | SetupCost | ScaleCost) != ~0u)
908  || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
909  & ImmCost & SetupCost & ScaleCost) == ~0u);
910  }
911 #endif
912 
913  bool isLoser() {
914  assert(isValid() && "invalid cost");
915  return NumRegs == ~0u;
916  }
917 
918  void RateFormula(const TargetTransformInfo &TTI,
919  const Formula &F,
920  SmallPtrSet<const SCEV *, 16> &Regs,
921  const DenseSet<const SCEV *> &VisitedRegs,
922  const Loop *L,
923  ScalarEvolution &SE, DominatorTree &DT,
924  const LSRUse &LU,
925  SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
926 
927  void print(raw_ostream &OS) const;
928  void dump() const;
929 
930 private:
931  void RateRegister(const SCEV *Reg,
932  SmallPtrSetImpl<const SCEV *> &Regs,
933  const Loop *L,
934  ScalarEvolution &SE, DominatorTree &DT);
935  void RatePrimaryRegister(const SCEV *Reg,
936  SmallPtrSetImpl<const SCEV *> &Regs,
937  const Loop *L,
938  ScalarEvolution &SE, DominatorTree &DT,
939  SmallPtrSetImpl<const SCEV *> *LoserRegs);
940 };
941 
942 /// An operand value in an instruction which is to be replaced with some
943 /// equivalent, possibly strength-reduced, replacement.
944 struct LSRFixup {
945  /// The instruction which will be updated.
946  Instruction *UserInst;
947 
948  /// The operand of the instruction which will be replaced. The operand may be
949  /// used more than once; every instance will be replaced.
950  Value *OperandValToReplace;
951 
952  /// If this user is to use the post-incremented value of an induction
953  /// variable, this set is non-empty and holds the loops associated with the
954  /// induction variable.
955  PostIncLoopSet PostIncLoops;
956 
957  /// A constant offset to be added to the LSRUse expression. This allows
958  /// multiple fixups to share the same LSRUse with different offsets, for
959  /// example in an unrolled loop.
960  int64_t Offset;
961 
962  bool isUseFullyOutsideLoop(const Loop *L) const;
963 
964  LSRFixup();
965 
966  void print(raw_ostream &OS) const;
967  void dump() const;
968 };
969 
970 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
971 /// SmallVectors of const SCEV*.
972 struct UniquifierDenseMapInfo {
973  static SmallVector<const SCEV *, 4> getEmptyKey() {
974  SmallVector<const SCEV *, 4> V;
975  V.push_back(reinterpret_cast<const SCEV *>(-1));
976  return V;
977  }
978 
979  static SmallVector<const SCEV *, 4> getTombstoneKey() {
980  SmallVector<const SCEV *, 4> V;
981  V.push_back(reinterpret_cast<const SCEV *>(-2));
982  return V;
983  }
984 
985  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
986  return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
987  }
988 
989  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
990  const SmallVector<const SCEV *, 4> &RHS) {
991  return LHS == RHS;
992  }
993 };
994 
995 /// This class holds the state that LSR keeps for each use in IVUsers, as well
996 /// as uses invented by LSR itself. It includes information about what kinds of
997 /// things can be folded into the user, information about the user itself, and
998 /// information about how the use may be satisfied. TODO: Represent multiple
999 /// users of the same expression in common?
1000 class LSRUse {
1001  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1002 
1003 public:
1004  /// An enum for a kind of use, indicating what types of scaled and immediate
1005  /// operands it might support.
1006  enum KindType {
1007  Basic, ///< A normal use, with no folding.
1008  Special, ///< A special case of basic, allowing -1 scales.
1009  Address, ///< An address use; folding according to TargetLowering
1010  ICmpZero ///< An equality icmp with both operands folded into one.
1011  // TODO: Add a generic icmp too?
1012  };
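  // For example (illustrative): a loop exit test such as "icmp eq %i.next, %n"
  // is an ICmpZero use of the expression (%i.next - %n); folding both operands
  // into a single expression lets LSR strength-reduce the comparison itself.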
1013 
1014  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
1015 
1016  KindType Kind;
1017  MemAccessTy AccessTy;
1018 
1019  /// The list of operands which are to be replaced.
1020  SmallVector<LSRFixup, 8> Fixups;
1021 
1022  /// Keep track of the min and max offsets of the fixups.
1023  int64_t MinOffset;
1024  int64_t MaxOffset;
1025 
1026  /// This records whether all of the fixups using this LSRUse are outside of
1027  /// the loop, in which case some special-case heuristics may be used.
1028  bool AllFixupsOutsideLoop;
1029 
1030  /// RigidFormula is set to true to guarantee that this use will be associated
1031  /// with a single formula--the one that initially matched. Some SCEV
1032  /// expressions cannot be expanded. This allows LSR to consider the registers
1033  /// used by those expressions without the need to expand them later after
1034  /// changing the formula.
1035  bool RigidFormula;
1036 
1037  /// This records the widest use type for any fixup using this
1038  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1039  /// fixup widths to be equivalent, because the narrower one may be relying on
1040  /// the implicit truncation to truncate away bogus bits.
1041  Type *WidestFixupType;
1042 
1043  /// A list of ways to build a value that can satisfy this user. After the
1044  /// list is populated, one of these is selected heuristically and used to
1045  /// formulate a replacement for OperandValToReplace in UserInst.
1046  SmallVector<Formula, 12> Formulae;
1047 
1048  /// The set of register candidates used by all formulae in this LSRUse.
1049  SmallPtrSet<const SCEV *, 4> Regs;
1050 
1051  LSRUse(KindType K, MemAccessTy AT)
1052  : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
1053  AllFixupsOutsideLoop(true), RigidFormula(false),
1054  WidestFixupType(nullptr) {}
1055 
1056  LSRFixup &getNewFixup() {
1057  Fixups.push_back(LSRFixup());
1058  return Fixups.back();
1059  }
1060 
1061  void pushFixup(LSRFixup &f) {
1062  Fixups.push_back(f);
1063  if (f.Offset > MaxOffset)
1064  MaxOffset = f.Offset;
1065  if (f.Offset < MinOffset)
1066  MinOffset = f.Offset;
1067  }
1068 
1069  bool HasFormulaWithSameRegs(const Formula &F) const;
1070  bool InsertFormula(const Formula &F);
1071  void DeleteFormula(Formula &F);
1072  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1073 
1074  void print(raw_ostream &OS) const;
1075  void dump() const;
1076 };
1077 
1078 } // end anonymous namespace
1079 
1080 /// Tally up interesting quantities from the given register.
1081 void Cost::RateRegister(const SCEV *Reg,
1082  SmallPtrSetImpl<const SCEV *> &Regs,
1083  const Loop *L,
1084  ScalarEvolution &SE, DominatorTree &DT) {
1085  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1086  // If this is an addrec for another loop, don't second-guess its addrec phi
1087  // nodes. LSR isn't currently smart enough to reason about more than one
1088  // loop at a time. LSR has already run on inner loops, will not run on outer
1089  // loops, and cannot be expected to change sibling loops.
1090  if (AR->getLoop() != L) {
1091  // If the AddRec exists, consider its register free and leave it alone.
1092  if (isExistingPhi(AR, SE))
1093  return;
1094 
1095  // Otherwise, do not consider this formula at all.
1096  Lose();
1097  return;
1098  }
1099  AddRecCost += 1; /// TODO: This should be a function of the stride.
1100 
1101  // Add the step value register, if it needs one.
1102  // TODO: The non-affine case isn't precisely modeled here.
1103  if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1104  if (!Regs.count(AR->getOperand(1))) {
1105  RateRegister(AR->getOperand(1), Regs, L, SE, DT);
1106  if (isLoser())
1107  return;
1108  }
1109  }
1110  }
1111  ++NumRegs;
1112 
1113  // Rough heuristic; favor registers which don't require extra setup
1114  // instructions in the preheader.
1115  if (!isa<SCEVUnknown>(Reg) &&
1116  !isa<SCEVConstant>(Reg) &&
1117  !(isa<SCEVAddRecExpr>(Reg) &&
1118  (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
1119  isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
1120  ++SetupCost;
1121 
1122  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1123  SE.hasComputableLoopEvolution(Reg, L);
1124 }
1125 
1126 /// Record this register in the set. If we haven't seen it before, rate
1127 /// it. Optional LoserRegs provides a way to declare any formula that refers to
1128 /// one of those regs an instant loser.
1129 void Cost::RatePrimaryRegister(const SCEV *Reg,
1130  SmallPtrSetImpl<const SCEV *> &Regs,
1131  const Loop *L,
1132  ScalarEvolution &SE, DominatorTree &DT,
1133  SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1134  if (LoserRegs && LoserRegs->count(Reg)) {
1135  Lose();
1136  return;
1137  }
1138  if (Regs.insert(Reg).second) {
1139  RateRegister(Reg, Regs, L, SE, DT);
1140  if (LoserRegs && isLoser())
1141  LoserRegs->insert(Reg);
1142  }
1143 }
1144 
1145 void Cost::RateFormula(const TargetTransformInfo &TTI,
1146  const Formula &F,
1147  SmallPtrSet<const SCEV *, 16> &Regs,
1148  const DenseSet<const SCEV *> &VisitedRegs,
1149  const Loop *L,
1150  ScalarEvolution &SE, DominatorTree &DT,
1151  const LSRUse &LU,
1152  SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1153  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
1154  // Tally up the registers.
1155  if (const SCEV *ScaledReg = F.ScaledReg) {
1156  if (VisitedRegs.count(ScaledReg)) {
1157  Lose();
1158  return;
1159  }
1160  RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
1161  if (isLoser())
1162  return;
1163  }
1164  for (const SCEV *BaseReg : F.BaseRegs) {
1165  if (VisitedRegs.count(BaseReg)) {
1166  Lose();
1167  return;
1168  }
1169  RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
1170  if (isLoser())
1171  return;
1172  }
1173 
1174  // Determine how many (unfolded) adds we'll need inside the loop.
1175  size_t NumBaseParts = F.getNumRegs();
1176  if (NumBaseParts > 1)
1177  // Do not count the base and a possible second register if the target
1178  // allows folding two registers.
1179  NumBaseAdds +=
1180  NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
1181  NumBaseAdds += (F.UnfoldedOffset != 0);
1182 
1183  // Accumulate non-free scaling amounts.
1184  ScaleCost += getScalingFactorCost(TTI, LU, F);
1185 
1186  // Tally up the non-zero immediates.
1187  for (const LSRFixup &Fixup : LU.Fixups) {
1188  int64_t O = Fixup.Offset;
1189  int64_t Offset = (uint64_t)O + F.BaseOffset;
1190  if (F.BaseGV)
1191  ImmCost += 64; // Handle symbolic values conservatively.
1192  // TODO: This should probably be the pointer size.
1193  else if (Offset != 0)
1194  ImmCost += APInt(64, Offset, true).getMinSignedBits();
1195 
1196  // Check with target if this offset with this instruction is
1197  // specifically not supported.
1198  if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
1199  !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
1200  NumBaseAdds++;
1201  }
1202  assert(isValid() && "invalid cost");
1203 }
1204 
1205 /// Set this cost to a losing value.
1206 void Cost::Lose() {
1207  NumRegs = ~0u;
1208  AddRecCost = ~0u;
1209  NumIVMuls = ~0u;
1210  NumBaseAdds = ~0u;
1211  ImmCost = ~0u;
1212  SetupCost = ~0u;
1213  ScaleCost = ~0u;
1214 }
1215 
1216 /// Choose the lower cost.
1217 bool Cost::operator<(const Cost &Other) const {
1218  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
1219  ImmCost, SetupCost) <
1220  std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
1221  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
1222  Other.SetupCost);
1223 }
1224 
1225 void Cost::print(raw_ostream &OS) const {
1226  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
1227  if (AddRecCost != 0)
1228  OS << ", with addrec cost " << AddRecCost;
1229  if (NumIVMuls != 0)
1230  OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
1231  if (NumBaseAdds != 0)
1232  OS << ", plus " << NumBaseAdds << " base add"
1233  << (NumBaseAdds == 1 ? "" : "s");
1234  if (ScaleCost != 0)
1235  OS << ", plus " << ScaleCost << " scale cost";
1236  if (ImmCost != 0)
1237  OS << ", plus " << ImmCost << " imm cost";
1238  if (SetupCost != 0)
1239  OS << ", plus " << SetupCost << " setup cost";
1240 }
1241 
1242 LLVM_DUMP_METHOD
1243 void Cost::dump() const {
1244  print(errs()); errs() << '\n';
1245 }
1246 
1247 LSRFixup::LSRFixup()
1248  : UserInst(nullptr), OperandValToReplace(nullptr),
1249  Offset(0) {}
1250 
1251 /// Test whether this fixup always uses its value outside of the given loop.
1252 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1253  // PHI nodes use their value in their incoming blocks.
1254  if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1255  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1256  if (PN->getIncomingValue(i) == OperandValToReplace &&
1257  L->contains(PN->getIncomingBlock(i)))
1258  return false;
1259  return true;
1260  }
1261 
1262  return !L->contains(UserInst);
1263 }
1264 
1265 void LSRFixup::print(raw_ostream &OS) const {
1266  OS << "UserInst=";
1267  // Store is common and interesting enough to be worth special-casing.
1268  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1269  OS << "store ";
1270  Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1271  } else if (UserInst->getType()->isVoidTy())
1272  OS << UserInst->getOpcodeName();
1273  else
1274  UserInst->printAsOperand(OS, /*PrintType=*/false);
1275 
1276  OS << ", OperandValToReplace=";
1277  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1278 
1279  for (const Loop *PIL : PostIncLoops) {
1280  OS << ", PostIncLoop=";
1281  PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1282  }
1283 
1284  if (Offset != 0)
1285  OS << ", Offset=" << Offset;
1286 }
1287 
1288 LLVM_DUMP_METHOD
1289 void LSRFixup::dump() const {
1290  print(errs()); errs() << '\n';
1291 }
1292 
1293 /// Test whether this use has a formula which has the same registers as the given
1294 /// formula.
1295 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1296  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1297  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1298  // Unstable sort by host order ok, because this is only used for uniquifying.
1299  std::sort(Key.begin(), Key.end());
1300  return Uniquifier.count(Key);
1301 }
1302 
1303 /// If the given formula has not yet been inserted, add it to the list, and
1304 /// return true. Return false otherwise. The formula must be in canonical form.
1305 bool LSRUse::InsertFormula(const Formula &F) {
1306  assert(F.isCanonical() && "Invalid canonical representation");
1307 
1308  if (!Formulae.empty() && RigidFormula)
1309  return false;
1310 
1311  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1312  if (F.ScaledReg) Key.push_back(F.ScaledReg);
1313  // Unstable sort by host order ok, because this is only used for uniquifying.
1314  std::sort(Key.begin(), Key.end());
1315 
1316  if (!Uniquifier.insert(Key).second)
1317  return false;
1318 
1319  // Using a register to hold the value of 0 is not profitable.
1320  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1321  "Zero allocated in a scaled register!");
1322 #ifndef NDEBUG
1323  for (const SCEV *BaseReg : F.BaseRegs)
1324  assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1325 #endif
1326 
1327  // Add the formula to the list.
1328  Formulae.push_back(F);
1329 
1330  // Record registers now being used by this use.
1331  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1332  if (F.ScaledReg)
1333  Regs.insert(F.ScaledReg);
1334 
1335  return true;
1336 }
1337 
1338 /// Remove the given formula from this use's list.
1339 void LSRUse::DeleteFormula(Formula &F) {
1340  if (&F != &Formulae.back())
1341  std::swap(F, Formulae.back());
1342  Formulae.pop_back();
1343 }
1344 
1345 /// Recompute the Regs field, and update RegUses.
1346 void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1347  // Now that we've filtered out some formulae, recompute the Regs set.
1348  SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1349  Regs.clear();
1350  for (const Formula &F : Formulae) {
1351  if (F.ScaledReg) Regs.insert(F.ScaledReg);
1352  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1353  }
1354 
1355  // Update the RegTracker.
1356  for (const SCEV *S : OldRegs)
1357  if (!Regs.count(S))
1358  RegUses.dropRegister(S, LUIdx);
1359 }
1360 
1361 void LSRUse::print(raw_ostream &OS) const {
1362  OS << "LSR Use: Kind=";
1363  switch (Kind) {
1364  case Basic: OS << "Basic"; break;
1365  case Special: OS << "Special"; break;
1366  case ICmpZero: OS << "ICmpZero"; break;
1367  case Address:
1368  OS << "Address of ";
1369  if (AccessTy.MemTy->isPointerTy())
1370  OS << "pointer"; // the full pointer type could be really verbose
1371  else {
1372  OS << *AccessTy.MemTy;
1373  }
1374 
1375  OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1376  }
1377 
1378  OS << ", Offsets={";
1379  bool NeedComma = false;
1380  for (const LSRFixup &Fixup : Fixups) {
1381  if (NeedComma) OS << ',';
1382  OS << Fixup.Offset;
1383  NeedComma = true;
1384  }
1385  OS << '}';
1386 
1387  if (AllFixupsOutsideLoop)
1388  OS << ", all-fixups-outside-loop";
1389 
1390  if (WidestFixupType)
1391  OS << ", widest fixup type: " << *WidestFixupType;
1392 }
1393 
1394 LLVM_DUMP_METHOD
1395 void LSRUse::dump() const {
1396  print(errs()); errs() << '\n';
1397 }
1398 
1399 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1400  LSRUse::KindType Kind, MemAccessTy AccessTy,
1401  GlobalValue *BaseGV, int64_t BaseOffset,
1402  bool HasBaseReg, int64_t Scale) {
1403  switch (Kind) {
1404  case LSRUse::Address:
1405  return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
1406  HasBaseReg, Scale, AccessTy.AddrSpace);
1407 
1408  case LSRUse::ICmpZero:
1409  // There's not even a target hook for querying whether it would be legal to
1410  // fold a GV into an ICmp.
1411  if (BaseGV)
1412  return false;
1413 
1414  // ICmp only has two operands; don't allow more than two non-trivial parts.
1415  if (Scale != 0 && HasBaseReg && BaseOffset != 0)
1416  return false;
1417 
1418  // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1419  // putting the scaled register in the other operand of the icmp.
1420  if (Scale != 0 && Scale != -1)
1421  return false;
1422 
1423  // If we have low-level target information, ask the target if it can fold an
1424  // integer immediate on an icmp.
1425  if (BaseOffset != 0) {
1426  // We have one of:
1427  // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1428  // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1429  // Offs is the ICmp immediate.
1430  if (Scale == 0)
1431  // The cast does the right thing with INT64_MIN.
1432  BaseOffset = -(uint64_t)BaseOffset;
1433  return TTI.isLegalICmpImmediate(BaseOffset);
1434  }
1435 
1436  // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1437  return true;
1438 
1439  case LSRUse::Basic:
1440  // Only handle single-register values.
1441  return !BaseGV && Scale == 0 && BaseOffset == 0;
1442 
1443  case LSRUse::Special:
1444  // Special case Basic to handle -1 scales.
1445  return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
1446  }
1447 
1448  llvm_unreachable("Invalid LSRUse Kind!");
1449 }
1450 
1451 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1452  int64_t MinOffset, int64_t MaxOffset,
1453  LSRUse::KindType Kind, MemAccessTy AccessTy,
1454  GlobalValue *BaseGV, int64_t BaseOffset,
1455  bool HasBaseReg, int64_t Scale) {
1456  // Check for overflow.
1457  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
1458  (MinOffset > 0))
1459  return false;
1460  MinOffset = (uint64_t)BaseOffset + MinOffset;
1461  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
1462  (MaxOffset > 0))
1463  return false;
1464  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
1465 
1466  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1467  HasBaseReg, Scale) &&
1468  isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1469  HasBaseReg, Scale);
1470 }
1471 
1472 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1473  int64_t MinOffset, int64_t MaxOffset,
1474  LSRUse::KindType Kind, MemAccessTy AccessTy,
1475  const Formula &F) {
1476  // For the purpose of isAMCompletelyFolded either having a canonical formula
1477  // or a scale not equal to zero is correct.
1478  // Problems may arise from non-canonical formulae having a scale == 0.
1479  // Strictly speaking it would be best to just rely on canonical formulae.
1480  // However, when we generate the scaled formulae, we first check that the
1481  // scaling factor is profitable before computing the actual ScaledReg for
1482  // compile time sake.
1483  assert((F.isCanonical() || F.Scale != 0));
1484  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1485  F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1486 }
1487 
1488 /// Test whether we know how to expand the current formula.
1489 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1490  int64_t MaxOffset, LSRUse::KindType Kind,
1491  MemAccessTy AccessTy, GlobalValue *BaseGV,
1492  int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
1493  // We know how to expand completely foldable formulae.
1494  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1495  BaseOffset, HasBaseReg, Scale) ||
1496  // Or formulae that use a base register produced by a sum of base
1497  // registers.
1498  (Scale == 1 &&
1499  isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1500  BaseGV, BaseOffset, true, 0));
1501 }
1502 
1503 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
1504  int64_t MaxOffset, LSRUse::KindType Kind,
1505  MemAccessTy AccessTy, const Formula &F) {
1506  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1507  F.BaseOffset, F.HasBaseReg, F.Scale);
1508 }
1509 
1510 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1511  const LSRUse &LU, const Formula &F) {
1512  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1513  LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1514  F.Scale);
1515 }
1516 
1517 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
1518  const LSRUse &LU, const Formula &F) {
1519  if (!F.Scale)
1520  return 0;
1521 
1522  // If the use is not completely folded in that instruction, we will have to
1523  // pay an extra cost only for scale != 1.
1524  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1525  LU.AccessTy, F))
1526  return F.Scale != 1;
1527 
1528  switch (LU.Kind) {
1529  case LSRUse::Address: {
1530  // Check the scaling factor cost with both the min and max offsets.
1531  int ScaleCostMinOffset = TTI.getScalingFactorCost(
1532  LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
1533  F.Scale, LU.AccessTy.AddrSpace);
1534  int ScaleCostMaxOffset = TTI.getScalingFactorCost(
1535  LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
1536  F.Scale, LU.AccessTy.AddrSpace);
1537 
1538  assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
1539  "Legal addressing mode has an illegal cost!");
1540  return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
1541  }
1542  case LSRUse::ICmpZero:
1543  case LSRUse::Basic:
1544  case LSRUse::Special:
1545  // The use is completely folded, i.e., everything is folded into the
1546  // instruction.
1547  return 0;
1548  }
1549 
1550  llvm_unreachable("Invalid LSRUse Kind!");
1551 }
1552 
1553 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1554  LSRUse::KindType Kind, MemAccessTy AccessTy,
1555  GlobalValue *BaseGV, int64_t BaseOffset,
1556  bool HasBaseReg) {
1557  // Fast-path: zero is always foldable.
1558  if (BaseOffset == 0 && !BaseGV) return true;
1559 
1560  // Conservatively, create an address with an immediate and a
1561  // base and a scale.
1562  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1563 
1564  // Canonicalize a scale of 1 to a base register if the formula doesn't
1565  // already have a base register.
1566  if (!HasBaseReg && Scale == 1) {
1567  Scale = 0;
1568  HasBaseReg = true;
1569  }
1570 
1571  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
1572  HasBaseReg, Scale);
1573 }
1574 
1575 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
1576  ScalarEvolution &SE, int64_t MinOffset,
1577  int64_t MaxOffset, LSRUse::KindType Kind,
1578  MemAccessTy AccessTy, const SCEV *S,
1579  bool HasBaseReg) {
1580  // Fast-path: zero is always foldable.
1581  if (S->isZero()) return true;
1582 
1583  // Conservatively, create an address with an immediate and a
1584  // base and a scale.
1585  int64_t BaseOffset = ExtractImmediate(S, SE);
1586  GlobalValue *BaseGV = ExtractSymbol(S, SE);
1587 
1588  // If there's anything else involved, it's not foldable.
1589  if (!S->isZero()) return false;
1590 
1591  // Fast-path: zero is always foldable.
1592  if (BaseOffset == 0 && !BaseGV) return true;
1593 
1594  // Conservatively, create an address with an immediate and a
1595  // base and a scale.
1596  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
1597 
1598  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1599  BaseOffset, HasBaseReg, Scale);
1600 }
1601 
1602 namespace {
1603 
1604 /// An individual increment in a Chain of IV increments. Relate an IV user to
1605 /// an expression that computes the IV it uses from the IV used by the previous
1606 /// link in the Chain.
1607 ///
1608 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the
1609 /// original IVOperand. The head of the chain's IVOperand is only valid during
1610 /// chain collection, before LSR replaces IV users. During chain generation,
1611 /// IncExpr can be used to find the new IVOperand that computes the same
1612 /// expression.
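/// For example (an illustrative sketch), a pointer chain such as
///   %p1 = getelementptr i8, i8* %p0, i64 4
///   %p2 = getelementptr i8, i8* %p1, i64 4
/// would be recorded as IVIncs whose IncExpr is the constant 4, each link
/// computed from the previous one.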
1613 struct IVInc {
1614  Instruction *UserInst;
1615  Value* IVOperand;
1616  const SCEV *IncExpr;
1617 
1618  IVInc(Instruction *U, Value *O, const SCEV *E):
1619  UserInst(U), IVOperand(O), IncExpr(E) {}
1620 };
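// Illustrative sketch (added for exposition; not part of the original source):
// for a loop that loads p[i], p[i+1] and p[i+2] through the same IV, a chain
// could record, in program order,
//   head: UserInst = load of p[i],   IncExpr = the absolute SCEV of its address
//   link: UserInst = load of p[i+1], IncExpr = +1 element relative to the head
//   link: UserInst = load of p[i+2], IncExpr = +1 element relative to p[i+1]
// so each non-head IncExpr is the step from the previous link's IV value.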
1621 
1622 // The list of IV increments in program order. We typically add the head of a
1623 // chain without finding subsequent links.
1624 struct IVChain {
1625  SmallVector<IVInc,1> Incs;
1626  const SCEV *ExprBase;
1627 
1628  IVChain() : ExprBase(nullptr) {}
1629 
1630  IVChain(const IVInc &Head, const SCEV *Base)
1631  : Incs(1, Head), ExprBase(Base) {}
1632 
1633  typedef SmallVectorImpl<IVInc>::const_iterator const_iterator;
1634 
1635  // Return the first increment in the chain.
1636  const_iterator begin() const {
1637  assert(!Incs.empty());
1638  return std::next(Incs.begin());
1639  }
1640  const_iterator end() const {
1641  return Incs.end();
1642  }
1643 
1644  // Returns true if this chain contains any increments.
1645  bool hasIncs() const { return Incs.size() >= 2; }
1646 
1647  // Add an IVInc to the end of this chain.
1648  void add(const IVInc &X) { Incs.push_back(X); }
1649 
1650  // Returns the last UserInst in the chain.
1651  Instruction *tailUserInst() const { return Incs.back().UserInst; }
1652 
1653  // Returns true if IncExpr can be profitably added to this chain.
1654  bool isProfitableIncrement(const SCEV *OperExpr,
1655  const SCEV *IncExpr,
1656  ScalarEvolution&);
1657 };
1658 
1659 /// Helper for CollectChains to track multiple IV increment uses. Distinguish
1660 /// between FarUsers that definitely cross IV increments and NearUsers that may
1661 /// be used between IV increments.
1662 struct ChainUsers {
1663  SmallPtrSet<Instruction*, 4> FarUsers;
1664  SmallPtrSet<Instruction*, 4> NearUsers;
1665 };
1666 
1667 /// This class holds state for the main loop strength reduction logic.
1668 class LSRInstance {
1669  IVUsers &IU;
1670  ScalarEvolution &SE;
1671  DominatorTree &DT;
1672  LoopInfo &LI;
1673  const TargetTransformInfo &TTI;
1674  Loop *const L;
1675  bool Changed;
1676 
1677  /// This is the insert position at which the current loop's induction variable
1678  /// increment should be placed. In simple loops, this is the latch block's
1679  /// terminator. But in more complicated cases, this is a position which will
1680  /// dominate all the in-loop post-increment users.
1681  Instruction *IVIncInsertPos;
1682 
1683  /// Interesting factors between use strides.
1684  ///
1685  /// We explicitly use a SetVector which contains a SmallSet, instead of the
1686  /// default, a SmallDenseSet, because we need to use the full range of
1687  /// int64_ts, and there's currently no good way of doing that with
1688  /// SmallDenseSet.
1689  SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
1690 
1691  /// Interesting use types, to facilitate truncation reuse.
1692  SmallSetVector<Type *, 4> Types;
1693 
1694  /// The list of interesting uses.
1695  SmallVector<LSRUse, 16> Uses;
1696 
1697  /// Track which uses use which register candidates.
1698  RegUseTracker RegUses;
1699 
1700  // Limit the number of chains to avoid quadratic behavior. We don't expect to
1701  // have more than a few IV increment chains in a loop. Missing a Chain falls
1702  // back to normal LSR behavior for those uses.
1703  static const unsigned MaxChains = 8;
1704 
1705  /// IV users can form a chain of IV increments.
1706  SmallVector<IVChain, 8> IVChainVec;
1707 
1708  /// IV users that belong to profitable IVChains.
1709  SmallPtrSet<Use*, MaxChains> IVIncSet;
1710 
1711  void OptimizeShadowIV();
1712  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
1713  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
1714  void OptimizeLoopTermCond();
1715 
1716  void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
1717  SmallVectorImpl<ChainUsers> &ChainUsersVec);
1718  void FinalizeChain(IVChain &Chain);
1719  void CollectChains();
1720  void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
1721  SmallVectorImpl<WeakVH> &DeadInsts);
1722 
1723  void CollectInterestingTypesAndFactors();
1724  void CollectFixupsAndInitialFormulae();
1725 
1726  // Support for sharing of LSRUses between LSRFixups.
1727  typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
1728  UseMapTy UseMap;
1729 
1730  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
1731  LSRUse::KindType Kind, MemAccessTy AccessTy);
1732 
1733  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
1734  MemAccessTy AccessTy);
1735 
1736  void DeleteUse(LSRUse &LU, size_t LUIdx);
1737 
1738  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
1739 
1740  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1741  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
1742  void CountRegisters(const Formula &F, size_t LUIdx);
1743  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
1744 
1745  void CollectLoopInvariantFixupsAndFormulae();
1746 
1747  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
1748  unsigned Depth = 0);
1749 
1750  void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
1751  const Formula &Base, unsigned Depth,
1752  size_t Idx, bool IsScaledReg = false);
1753  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
1754  void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1755  const Formula &Base, size_t Idx,
1756  bool IsScaledReg = false);
1757  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1758  void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
1759  const Formula &Base,
1760  const SmallVectorImpl<int64_t> &Worklist,
1761  size_t Idx, bool IsScaledReg = false);
1762  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
1763  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1764  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
1765  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
1766  void GenerateCrossUseConstantOffsets();
1767  void GenerateAllReuseFormulae();
1768 
1769  void FilterOutUndesirableDedicatedRegisters();
1770 
1771  size_t EstimateSearchSpaceComplexity() const;
1772  void NarrowSearchSpaceByDetectingSupersets();
1773  void NarrowSearchSpaceByCollapsingUnrolledCode();
1774  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
1775  void NarrowSearchSpaceByPickingWinnerRegs();
1776  void NarrowSearchSpaceUsingHeuristics();
1777 
1778  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
1779  Cost &SolutionCost,
1780  SmallVectorImpl<const Formula *> &Workspace,
1781  const Cost &CurCost,
1782  const SmallPtrSet<const SCEV *, 16> &CurRegs,
1783  DenseSet<const SCEV *> &VisitedRegs) const;
1784  void Solve(SmallVectorImpl<const Formula *> &Solution) const;
1785 
1786  BasicBlock::iterator
1787  HoistInsertPosition(BasicBlock::iterator IP,
1788  const SmallVectorImpl<Instruction *> &Inputs) const;
1789  BasicBlock::iterator
1790  AdjustInsertPositionForExpand(BasicBlock::iterator IP,
1791  const LSRFixup &LF,
1792  const LSRUse &LU,
1793  SCEVExpander &Rewriter) const;
1794 
1795  Value *Expand(const LSRUse &LU, const LSRFixup &LF,
1796  const Formula &F,
1797  BasicBlock::iterator IP,
1798  SCEVExpander &Rewriter,
1799  SmallVectorImpl<WeakVH> &DeadInsts) const;
1800  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
1801  const Formula &F,
1802  SCEVExpander &Rewriter,
1803  SmallVectorImpl<WeakVH> &DeadInsts) const;
1804  void Rewrite(const LSRUse &LU, const LSRFixup &LF,
1805  const Formula &F,
1806  SCEVExpander &Rewriter,
1807  SmallVectorImpl<WeakVH> &DeadInsts) const;
1808  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
1809 
1810 public:
1811  LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
1812  LoopInfo &LI, const TargetTransformInfo &TTI);
1813 
1814  bool getChanged() const { return Changed; }
1815 
1816  void print_factors_and_types(raw_ostream &OS) const;
1817  void print_fixups(raw_ostream &OS) const;
1818  void print_uses(raw_ostream &OS) const;
1819  void print(raw_ostream &OS) const;
1820  void dump() const;
1821 };
1822 
1823 } // end anonymous namespace
1824 
1825 /// If IV is used in an int-to-float cast inside the loop then try to eliminate
1826 /// the cast operation.
1827 void LSRInstance::OptimizeShadowIV() {
1828  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
1829  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
1830  return;
1831 
1832  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
1833  UI != E; /* empty */) {
1834  IVUsers::const_iterator CandidateUI = UI;
1835  ++UI;
1836  Instruction *ShadowUse = CandidateUI->getUser();
1837  Type *DestTy = nullptr;
1838  bool IsSigned = false;
1839 
1840  /* If the shadow use is an int->float cast then insert a second IV
1841  to eliminate this cast.
1842 
1843  for (unsigned i = 0; i < n; ++i)
1844  foo((double)i);
1845 
1846  is transformed into
1847 
1848  double d = 0.0;
1849  for (unsigned i = 0; i < n; ++i, ++d)
1850  foo(d);
1851  */
1852  if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
1853  IsSigned = false;
1854  DestTy = UCast->getDestTy();
1855  }
1856  else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
1857  IsSigned = true;
1858  DestTy = SCast->getDestTy();
1859  }
1860  if (!DestTy) continue;
1861 
1862  // If target does not support DestTy natively then do not apply
1863  // this transformation.
1864  if (!TTI.isTypeLegal(DestTy)) continue;
1865 
1866  PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
1867  if (!PH) continue;
1868  if (PH->getNumIncomingValues() != 2) continue;
1869 
1870  Type *SrcTy = PH->getType();
1871  int Mantissa = DestTy->getFPMantissaWidth();
1872  if (Mantissa == -1) continue;
1873  if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
1874  continue;
1875 
1876  unsigned Entry, Latch;
1877  if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
1878  Entry = 0;
1879  Latch = 1;
1880  } else {
1881  Entry = 1;
1882  Latch = 0;
1883  }
1884 
1885  ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
1886  if (!Init) continue;
1887  Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
1888  (double)Init->getSExtValue() :
1889  (double)Init->getZExtValue());
1890 
1891  BinaryOperator *Incr =
1892  dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
1893  if (!Incr) continue;
1894  if (Incr->getOpcode() != Instruction::Add
1895  && Incr->getOpcode() != Instruction::Sub)
1896  continue;
1897 
1898  /* Initialize new IV, double d = 0.0 in above example. */
1899  ConstantInt *C = nullptr;
1900  if (Incr->getOperand(0) == PH)
1901  C = dyn_cast<ConstantInt>(Incr->getOperand(1));
1902  else if (Incr->getOperand(1) == PH)
1903  C = dyn_cast<ConstantInt>(Incr->getOperand(0));
1904  else
1905  continue;
1906 
1907  if (!C) continue;
1908 
1909  // Ignore negative constants, as the code below doesn't handle them
1910  // correctly. TODO: Remove this restriction.
1911  if (!C->getValue().isStrictlyPositive()) continue;
1912 
1913  /* Add new PHINode. */
1914  PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
1915 
1916  /* create new increment. '++d' in above example. */
1917  Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
1918  BinaryOperator *NewIncr =
1919  BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
1920  Instruction::FAdd : Instruction::FSub,
1921  NewPH, CFP, "IV.S.next.", Incr);
1922 
1923  NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
1924  NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
1925 
1926  /* Remove cast operation */
1927  ShadowUse->replaceAllUsesWith(NewPH);
1928  ShadowUse->eraseFromParent();
1929  Changed = true;
1930  break;
1931  }
1932 }
1933 
1934 /// If Cond has an operand that is an expression of an IV, set the IV user and
1935 /// stride information and return true, otherwise return false.
1936 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
1937  for (IVStrideUse &U : IU)
1938  if (U.getUser() == Cond) {
1939  // NOTE: we could handle setcc instructions with multiple uses here, but
1940  // InstCombine does it as well for simple uses, it's not clear that it
1941  // occurs enough in real life to handle.
1942  CondUse = &U;
1943  return true;
1944  }
1945  return false;
1946 }
1947 
1948 /// Rewrite the loop's terminating condition if it uses a max computation.
1949 ///
1950 /// This is a narrow solution to a specific, but acute, problem. For loops
1951 /// like this:
1952 ///
1953 /// i = 0;
1954 /// do {
1955 /// p[i] = 0.0;
1956 /// } while (++i < n);
1957 ///
1958 /// the trip count isn't just 'n', because 'n' might not be positive. And
1959 /// unfortunately this can come up even for loops where the user didn't use
1960 /// a C do-while loop. For example, seemingly well-behaved top-test loops
1961 /// will commonly be lowered like this:
1962 ///
1963 /// if (n > 0) {
1964 /// i = 0;
1965 /// do {
1966 /// p[i] = 0.0;
1967 /// } while (++i < n);
1968 /// }
1969 ///
1970 /// and then it's possible for subsequent optimization to obscure the if
1971 /// test in such a way that indvars can't find it.
1972 ///
1973 /// When indvars can't find the if test in loops like this, it creates a
1974 /// max expression, which allows it to give the loop a canonical
1975 /// induction variable:
1976 ///
1977 /// i = 0;
1978 /// max = n < 1 ? 1 : n;
1979 /// do {
1980 /// p[i] = 0.0;
1981 /// } while (++i != max);
1982 ///
1983 /// Canonical induction variables are necessary because the loop passes
1984 /// are designed around them. The most obvious example of this is the
1985 /// LoopInfo analysis, which doesn't remember trip count values. It
1986 /// expects to be able to rediscover the trip count each time it is
1987 /// needed, and it does this using a simple analysis that only succeeds if
1988 /// the loop has a canonical induction variable.
1989 ///
1990 /// However, when it comes time to generate code, the maximum operation
1991 /// can be quite costly, especially if it's inside of an outer loop.
1992 ///
1993 /// This function solves this problem by detecting this type of loop and
1994 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
1995 /// the instructions for the maximum computation.
1996 ///
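// Illustrative IR sketch (added for exposition; value names are made up): the
// rewrite performed below effectively turns
//   %max  = select i1 %guard, i64 1, i64 %n   ; max = n < 1 ? 1 : n
//   %exit = icmp ne i64 %iv, %max
// into
//   %exit = icmp slt i64 %iv, %n
// once the induction variable is confirmed to match the {1,+,1} pattern checked
// in the code, and the now-dead select and its feeding compare are erased.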
1997 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
1998  // Check that the loop matches the pattern we're looking for.
1999  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2000  Cond->getPredicate() != CmpInst::ICMP_NE)
2001  return Cond;
2002 
2003  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2004  if (!Sel || !Sel->hasOneUse()) return Cond;
2005 
2006  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2007  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2008  return Cond;
2009  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2010 
2011  // Add one to the backedge-taken count to get the trip count.
2012  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2013  if (IterationCount != SE.getSCEV(Sel)) return Cond;
2014 
2015  // Check for a max calculation that matches the pattern. There's no check
2016  // for ICMP_ULE here because the comparison would be with zero, which
2017  // isn't interesting.
2018  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2019  const SCEVNAryExpr *Max = nullptr;
2020  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2021  Pred = ICmpInst::ICMP_SLE;
2022  Max = S;
2023  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2024  Pred = ICmpInst::ICMP_SLT;
2025  Max = S;
2026  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2027  Pred = ICmpInst::ICMP_ULT;
2028  Max = U;
2029  } else {
2030  // No match; bail.
2031  return Cond;
2032  }
2033 
2034  // To handle a max with more than two operands, this optimization would
2035  // require additional checking and setup.
2036  if (Max->getNumOperands() != 2)
2037  return Cond;
2038 
2039  const SCEV *MaxLHS = Max->getOperand(0);
2040  const SCEV *MaxRHS = Max->getOperand(1);
2041 
2042  // ScalarEvolution canonicalizes constants to the left. For < and >, look
2043  // for a comparison with 1. For <= and >=, a comparison with zero.
2044  if (!MaxLHS ||
2045  (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2046  return Cond;
2047 
2048  // Check the relevant induction variable for conformance to
2049  // the pattern.
2050  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2051  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2052  if (!AR || !AR->isAffine() ||
2053  AR->getStart() != One ||
2054  AR->getStepRecurrence(SE) != One)
2055  return Cond;
2056 
2057  assert(AR->getLoop() == L &&
2058  "Loop condition operand is an addrec in a different loop!");
2059 
2060  // Check the right operand of the select, and remember it, as it will
2061  // be used in the new comparison instruction.
2062  Value *NewRHS = nullptr;
2063  if (ICmpInst::isTrueWhenEqual(Pred)) {
2064  // Look for n+1, and grab n.
2065  if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2066  if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2067  if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2068  NewRHS = BO->getOperand(0);
2069  if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2070  if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2071  if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2072  NewRHS = BO->getOperand(0);
2073  if (!NewRHS)
2074  return Cond;
2075  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2076  NewRHS = Sel->getOperand(1);
2077  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2078  NewRHS = Sel->getOperand(2);
2079  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2080  NewRHS = SU->getValue();
2081  else
2082  // Max doesn't match expected pattern.
2083  return Cond;
2084 
2085  // Determine the new comparison opcode. It may be signed or unsigned,
2086  // and the original comparison may be either equality or inequality.
2087  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2088  Pred = CmpInst::getInversePredicate(Pred);
2089 
2090  // Ok, everything looks ok to change the condition into an SLT or SGE and
2091  // delete the max calculation.
2092  ICmpInst *NewCond =
2093  new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
2094 
2095  // Delete the max calculation instructions.
2096  Cond->replaceAllUsesWith(NewCond);
2097  CondUse->setUser(NewCond);
2098  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2099  Cond->eraseFromParent();
2100  Sel->eraseFromParent();
2101  if (Cmp->use_empty())
2102  Cmp->eraseFromParent();
2103  return NewCond;
2104 }
2105 
2106 /// Change loop terminating condition to use the postinc iv when possible.
2107 void
2108 LSRInstance::OptimizeLoopTermCond() {
2109  SmallPtrSet<Instruction *, 4> PostIncs;
2110 
2111  // We need a different set of heuristics for rotated and non-rotated loops.
2112  // If a loop is rotated then the latch is also the backedge, so inserting
2113  // post-inc expressions just before the latch is ideal. To reduce live ranges
2114  // it also makes sense to rewrite terminating conditions to use post-inc
2115  // expressions.
2116  //
2117  // If the loop is not rotated then the latch is not a backedge; the latch
2118  // check is done in the loop head. Adding post-inc expressions before the
2119  // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2120  // in the loop body. In this case we do *not* want to use post-inc expressions
2121  // in the latch check, and we want to insert post-inc expressions before
2122  // the backedge.
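// Illustrative sketch (added for exposition; not from the original source): in
// a rotated loop the latch block both increments and tests the IV, e.g.
//   latch:
//     %i.next = add i64 %i, 1
//     %cmp    = icmp slt i64 %i, %n      ; pre-incremented IV
//     br i1 %cmp, label %header, label %exit
// and rewriting the compare in terms of the post-incremented value means only
// one IV value needs to stay live across the backedge.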
2123  BasicBlock *LatchBlock = L->getLoopLatch();
2124  SmallVector<BasicBlock*, 8> ExitingBlocks;
2125  L->getExitingBlocks(ExitingBlocks);
2126  if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
2127  return LatchBlock != BB;
2128  })) {
2129  // The backedge doesn't exit the loop; treat this as a head-tested loop.
2130  IVIncInsertPos = LatchBlock->getTerminator();
2131  return;
2132  }
2133 
2134  // Otherwise treat this as a rotated loop.
2135  for (BasicBlock *ExitingBlock : ExitingBlocks) {
2136 
2137  // Get the terminating condition for the loop if possible. If we
2138  // can, we want to change it to use a post-incremented version of its
2139  // induction variable, to allow coalescing the live ranges for the IV into
2140  // one register value.
2141 
2142  BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2143  if (!TermBr)
2144  continue;
2145  // FIXME: Overly conservative, termination condition could be an 'or' etc..
2146  if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2147  continue;
2148 
2149  // Search IVUsesByStride to find Cond's IVUse if there is one.
2150  IVStrideUse *CondUse = nullptr;
2151  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2152  if (!FindIVUserForCond(Cond, CondUse))
2153  continue;
2154 
2155  // If the trip count is computed in terms of a max (due to ScalarEvolution
2156  // being unable to find a sufficient guard, for example), change the loop
2157  // comparison to use SLT or ULT instead of NE.
2158  // One consequence of doing this now is that it disrupts the count-down
2159  // optimization. That's not always a bad thing though, because in such
2160  // cases it may still be worthwhile to avoid a max.
2161  Cond = OptimizeMax(Cond, CondUse);
2162 
2163  // If this exiting block dominates the latch block, it may also use
2164  // the post-inc value if it won't be shared with other uses.
2165  // Check for dominance.
2166  if (!DT.dominates(ExitingBlock, LatchBlock))
2167  continue;
2168 
2169  // Conservatively avoid trying to use the post-inc value in non-latch
2170  // exits if there may be pre-inc users in intervening blocks.
2171  if (LatchBlock != ExitingBlock)
2172  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2173  // Test if the use is reachable from the exiting block. This dominator
2174  // query is a conservative approximation of reachability.
2175  if (&*UI != CondUse &&
2176  !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2177  // Conservatively assume there may be reuse if the quotient of their
2178  // strides could be a legal scale.
2179  const SCEV *A = IU.getStride(*CondUse, L);
2180  const SCEV *B = IU.getStride(*UI, L);
2181  if (!A || !B) continue;
2182  if (SE.getTypeSizeInBits(A->getType()) !=
2183  SE.getTypeSizeInBits(B->getType())) {
2184  if (SE.getTypeSizeInBits(A->getType()) >
2185  SE.getTypeSizeInBits(B->getType()))
2186  B = SE.getSignExtendExpr(B, A->getType());
2187  else
2188  A = SE.getSignExtendExpr(A, B->getType());
2189  }
2190  if (const SCEVConstant *D =
2191  dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2192  const ConstantInt *C = D->getValue();
2193  // Stride of one or negative one can have reuse with non-addresses.
2194  if (C->isOne() || C->isAllOnesValue())
2195  goto decline_post_inc;
2196  // Avoid weird situations.
2197  if (C->getValue().getMinSignedBits() >= 64 ||
2198  C->getValue().isMinSignedValue())
2199  goto decline_post_inc;
2200  // Check for possible scaled-address reuse.
2201  MemAccessTy AccessTy = getAccessType(UI->getUser());
2202  int64_t Scale = C->getSExtValue();
2203  if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2204  /*BaseOffset=*/0,
2205  /*HasBaseReg=*/false, Scale,
2206  AccessTy.AddrSpace))
2207  goto decline_post_inc;
2208  Scale = -Scale;
2209  if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2210  /*BaseOffset=*/0,
2211  /*HasBaseReg=*/false, Scale,
2212  AccessTy.AddrSpace))
2213  goto decline_post_inc;
2214  }
2215  }
2216 
2217  DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2218  << *Cond << '\n');
2219 
2220  // It's possible for the setcc instruction to be anywhere in the loop, and
2221  // possible for it to have multiple users. If it is not immediately before
2222  // the exiting block branch, move it.
2223  if (&*++BasicBlock::iterator(Cond) != TermBr) {
2224  if (Cond->hasOneUse()) {
2225  Cond->moveBefore(TermBr);
2226  } else {
2227  // Clone the terminating condition and insert into the loopend.
2228  ICmpInst *OldCond = Cond;
2229  Cond = cast<ICmpInst>(Cond->clone());
2230  Cond->setName(L->getHeader()->getName() + ".termcond");
2231  ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
2232 
2233  // Clone the IVUse, as the old use still exists!
2234  CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2235  TermBr->replaceUsesOfWith(OldCond, Cond);
2236  }
2237  }
2238 
2239  // If we get to here, we know that we can transform the setcc instruction to
2240  // use the post-incremented version of the IV, allowing us to coalesce the
2241  // live ranges for the IV correctly.
2242  CondUse->transformToPostInc(L);
2243  Changed = true;
2244 
2245  PostIncs.insert(Cond);
2246  decline_post_inc:;
2247  }
2248 
2249  // Determine an insertion point for the loop induction variable increment. It
2250  // must dominate all the post-inc comparisons we just set up, and it must
2251  // dominate the loop latch edge.
2252  IVIncInsertPos = L->getLoopLatch()->getTerminator();
2253  for (Instruction *Inst : PostIncs) {
2254  BasicBlock *BB =
2255  DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
2256  Inst->getParent());
2257  if (BB == Inst->getParent())
2258  IVIncInsertPos = Inst;
2259  else if (BB != IVIncInsertPos->getParent())
2260  IVIncInsertPos = BB->getTerminator();
2261  }
2262 }
2263 
2264 /// Determine if the given use can accommodate a fixup at the given offset and
2265 /// other details. If so, update the use and return true.
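// Worked example (added for exposition): if LU currently spans offsets
// [-8, 24] and a new fixup needs offset 40, the code below asks whether an
// address with immediate 40 - (-8) = 48 would still be trivially foldable for
// this use kind; if so MaxOffset is widened to 40, otherwise the caller ends up
// creating a separate LSRUse for the new fixup.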
2266 bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
2267  bool HasBaseReg, LSRUse::KindType Kind,
2268  MemAccessTy AccessTy) {
2269  int64_t NewMinOffset = LU.MinOffset;
2270  int64_t NewMaxOffset = LU.MaxOffset;
2271  MemAccessTy NewAccessTy = AccessTy;
2272 
2273  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2274  // something conservative, however this can pessimize in the case that one of
2275  // the uses will have all its uses outside the loop, for example.
2276  if (LU.Kind != Kind)
2277  return false;
2278 
2279  // Check for a mismatched access type, and fall back conservatively as needed.
2280  // TODO: Be less conservative when the type is similar and can use the same
2281  // addressing modes.
2282  if (Kind == LSRUse::Address) {
2283  if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2284  NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2285  AccessTy.AddrSpace);
2286  }
2287  }
2288 
2289  // Conservatively assume HasBaseReg is true for now.
2290  if (NewOffset < LU.MinOffset) {
2291  if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2292  LU.MaxOffset - NewOffset, HasBaseReg))
2293  return false;
2294  NewMinOffset = NewOffset;
2295  } else if (NewOffset > LU.MaxOffset) {
2296  if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2297  NewOffset - LU.MinOffset, HasBaseReg))
2298  return false;
2299  NewMaxOffset = NewOffset;
2300  }
2301 
2302  // Update the use.
2303  LU.MinOffset = NewMinOffset;
2304  LU.MaxOffset = NewMaxOffset;
2305  LU.AccessTy = NewAccessTy;
2306  return true;
2307 }
2308 
2309 /// Return an LSRUse index and an offset value for a fixup which needs the given
2310 /// expression, with the given kind and optional access type. Either reuse an
2311 /// existing use or create a new one, as needed.
2312 std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
2313  LSRUse::KindType Kind,
2314  MemAccessTy AccessTy) {
2315  const SCEV *Copy = Expr;
2316  int64_t Offset = ExtractImmediate(Expr, SE);
2317 
2318  // Basic uses can't accept any offset, for example.
2319  if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2320  Offset, /*HasBaseReg=*/ true)) {
2321  Expr = Copy;
2322  Offset = 0;
2323  }
2324 
2325  std::pair<UseMapTy::iterator, bool> P =
2326  UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2327  if (!P.second) {
2328  // A use already existed with this base.
2329  size_t LUIdx = P.first->second;
2330  LSRUse &LU = Uses[LUIdx];
2331  if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2332  // Reuse this use.
2333  return std::make_pair(LUIdx, Offset);
2334  }
2335 
2336  // Create a new use.
2337  size_t LUIdx = Uses.size();
2338  P.first->second = LUIdx;
2339  Uses.push_back(LSRUse(Kind, AccessTy));
2340  LSRUse &LU = Uses[LUIdx];
2341 
2342  LU.MinOffset = Offset;
2343  LU.MaxOffset = Offset;
2344  return std::make_pair(LUIdx, Offset);
2345 }
2346 
2347 /// Delete the given use from the Uses list.
2348 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2349  if (&LU != &Uses.back())
2350  std::swap(LU, Uses.back());
2351  Uses.pop_back();
2352 
2353  // Update RegUses.
2354  RegUses.swapAndDropUse(LUIdx, Uses.size());
2355 }
2356 
2357 /// Look for a use distinct from OrigLU which has a formula with the same
2358 /// registers as the given formula.
2359 LSRUse *
2360 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2361  const LSRUse &OrigLU) {
2362  // Search all uses for the formula. This could be more clever.
2363  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
2364  LSRUse &LU = Uses[LUIdx];
2365  // Check whether this use is close enough to OrigLU, to see whether it's
2366  // worthwhile looking through its formulae.
2367  // Ignore ICmpZero uses because they may contain formulae generated by
2368  // GenerateICmpZeroScales, in which case adding fixup offsets may
2369  // be invalid.
2370  if (&LU != &OrigLU &&
2371  LU.Kind != LSRUse::ICmpZero &&
2372  LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2373  LU.WidestFixupType == OrigLU.WidestFixupType &&
2374  LU.HasFormulaWithSameRegs(OrigF)) {
2375  // Scan through this use's formulae.
2376  for (const Formula &F : LU.Formulae) {
2377  // Check to see if this formula has the same registers and symbols
2378  // as OrigF.
2379  if (F.BaseRegs == OrigF.BaseRegs &&
2380  F.ScaledReg == OrigF.ScaledReg &&
2381  F.BaseGV == OrigF.BaseGV &&
2382  F.Scale == OrigF.Scale &&
2383  F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2384  if (F.BaseOffset == 0)
2385  return &LU;
2386  // This is the formula where all the registers and symbols matched;
2387  // there aren't going to be any others. Since we declined it, we
2388  // can skip the rest of the formulae and proceed to the next LSRUse.
2389  break;
2390  }
2391  }
2392  }
2393  }
2394 
2395  // Nothing looked good.
2396  return nullptr;
2397 }
2398 
2399 void LSRInstance::CollectInterestingTypesAndFactors() {
2400  SmallSetVector<const SCEV *, 4> Strides;
2401 
2402  // Collect interesting types and strides.
2403  SmallVector<const SCEV *, 4> Worklist;
2404  for (const IVStrideUse &U : IU) {
2405  const SCEV *Expr = IU.getExpr(U);
2406 
2407  // Collect interesting types.
2408  Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2409 
2410  // Add strides for mentioned loops.
2411  Worklist.push_back(Expr);
2412  do {
2413  const SCEV *S = Worklist.pop_back_val();
2414  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2415  if (AR->getLoop() == L)
2416  Strides.insert(AR->getStepRecurrence(SE));
2417  Worklist.push_back(AR->getStart());
2418  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2419  Worklist.append(Add->op_begin(), Add->op_end());
2420  }
2421  } while (!Worklist.empty());
2422  }
2423 
2424  // Compute interesting factors from the set of interesting strides.
2425  for (SmallSetVector<const SCEV *, 4>::const_iterator
2426  I = Strides.begin(), E = Strides.end(); I != E; ++I)
2427  for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2428  std::next(I); NewStrideIter != E; ++NewStrideIter) {
2429  const SCEV *OldStride = *I;
2430  const SCEV *NewStride = *NewStrideIter;
2431 
2432  if (SE.getTypeSizeInBits(OldStride->getType()) !=
2433  SE.getTypeSizeInBits(NewStride->getType())) {
2434  if (SE.getTypeSizeInBits(OldStride->getType()) >
2435  SE.getTypeSizeInBits(NewStride->getType()))
2436  NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2437  else
2438  OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2439  }
2440  if (const SCEVConstant *Factor =
2441  dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2442  SE, true))) {
2443  if (Factor->getAPInt().getMinSignedBits() <= 64)
2444  Factors.insert(Factor->getAPInt().getSExtValue());
2445  } else if (const SCEVConstant *Factor =
2446  dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2447  NewStride,
2448  SE, true))) {
2449  if (Factor->getAPInt().getMinSignedBits() <= 64)
2450  Factors.insert(Factor->getAPInt().getSExtValue());
2451  }
2452  }
2453 
2454  // If all uses use the same type, don't bother looking for truncation-based
2455  // reuse.
2456  if (Types.size() == 1)
2457  Types.clear();
2458 
2459  DEBUG(print_factors_and_types(dbgs()));
2460 }
2461 
2462 /// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2463 /// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2464 /// IVStrideUses, we could partially skip this.
2465 static User::op_iterator
2466 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2467  Loop *L, ScalarEvolution &SE) {
2468  for(; OI != OE; ++OI) {
2469  if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2470  if (!SE.isSCEVable(Oper->getType()))
2471  continue;
2472 
2473  if (const SCEVAddRecExpr *AR =
2474  dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2475  if (AR->getLoop() == L)
2476  break;
2477  }
2478  }
2479  }
2480  return OI;
2481 }
2482 
2483 /// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2484 /// a convenient helper.
2485 static Value *getWideOperand(Value *Oper) {
2486  if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2487  return Trunc->getOperand(0);
2488  return Oper;
2489 }
2490 
2491 /// Return true if we allow an IV chain to include both types.
2492 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
2493  Type *LType = LVal->getType();
2494  Type *RType = RVal->getType();
2495  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
2496 }
2497 
2498 /// Return an approximation of this SCEV expression's "base", or NULL for any
2499 /// constant. Returning the expression itself is conservative. Returning a
2500 /// deeper subexpression is more precise and valid as long as it isn't less
2501 /// complex than another subexpression. For expressions involving multiple
2502 /// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2503 /// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2504 /// IVInc==b-a.
2505 ///
2506 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2507 /// SCEVUnknown, we simply return the rightmost SCEV operand.
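// Illustrative example (added for exposition; names are made up): for an add
// expression such as (16 + (8 * %i) + %base), the walk below starts from the
// rightmost operand, which is the SCEVUnknown %base, and returns it; two
// accesses off the same %base therefore share an ExprBase and can be
// considered for the same chain.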
2508 static const SCEV *getExprBase(const SCEV *S) {
2509  switch (S->getSCEVType()) {
2510  default: // including scUnknown.
2511  return S;
2512  case scConstant:
2513  return nullptr;
2514  case scTruncate:
2515  return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
2516  case scZeroExtend:
2517  return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
2518  case scSignExtend:
2519  return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
2520  case scAddExpr: {
2521  // Skip over scaled operands (scMulExpr) to follow add operands as long as
2522  // there's nothing more complex.
2523  // FIXME: not sure if we want to recognize negation.
2524  const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
2525  for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
2526  E(Add->op_begin()); I != E; ++I) {
2527  const SCEV *SubExpr = *I;
2528  if (SubExpr->getSCEVType() == scAddExpr)
2529  return getExprBase(SubExpr);
2530 
2531  if (SubExpr->getSCEVType() != scMulExpr)
2532  return SubExpr;
2533  }
2534  return S; // all operands are scaled, be conservative.
2535  }
2536  case scAddRecExpr:
2537  return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
2538  }
2539 }
2540 
2541 /// Return true if the chain increment is profitable to expand into a loop
2542 /// invariant value, which may require its own register. A profitable chain
2543 /// increment will be an offset relative to the same base. We allow such offsets
2544 /// to potentially be used as chain increment as long as it's not obviously
2545 /// expensive to expand using real instructions.
2546 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
2547  const SCEV *IncExpr,
2548  ScalarEvolution &SE) {
2549  // Aggressively form chains when -stress-ivchain.
2550  if (StressIVChain)
2551  return true;
2552 
2553  // Do not replace a constant offset from IV head with a nonconstant IV
2554  // increment.
2555  if (!isa<SCEVConstant>(IncExpr)) {
2556  const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
2557  if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
2558  return false;
2559  }
2560 
2561  SmallPtrSet<const SCEV*, 8> Processed;
2562  return !isHighCostExpansion(IncExpr, Processed, SE);
2563 }
2564 
2565 /// Return true if the number of registers needed for the chain is estimated to
2566 /// be less than the number required for the individual IV users. First prohibit
2567 /// any IV users that keep the IV live across increments (the Users set should
2568 /// be empty). Next count the number and type of increments in the chain.
2569 ///
2570 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
2571 /// effectively use postinc addressing modes. Only consider it profitable if the
2572 /// increments can be computed in fewer registers when chained.
2573 ///
2574 /// TODO: Consider IVInc free if it's already used in other chains.
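// Worked example (added for exposition): the cost model below starts at 1 for
// the chain's own register; more than one constant increment subtracts 1, and
// a chain completed by an existing header phi subtracts another 1, so such a
// chain reaches -1 and is kept, whereas the same chain without the header phi
// stays at 0 and is dropped in favor of normal LSR handling.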
2575 static bool
2576 isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
2577  ScalarEvolution &SE, const TargetTransformInfo &TTI) {
2578  if (StressIVChain)
2579  return true;
2580 
2581  if (!Chain.hasIncs())
2582  return false;
2583 
2584  if (!Users.empty()) {
2585  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
2586  for (Instruction *Inst : Users) {
2587  dbgs() << " " << *Inst << "\n";
2588  });
2589  return false;
2590  }
2591  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2592 
2593  // The chain itself may require a register, so initialize cost to 1.
2594  int cost = 1;
2595 
2596  // A complete chain likely eliminates the need for keeping the original IV in
2597  // a register. LSR does not currently know how to form a complete chain unless
2598  // the header phi already exists.
2599  if (isa<PHINode>(Chain.tailUserInst())
2600  && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
2601  --cost;
2602  }
2603  const SCEV *LastIncExpr = nullptr;
2604  unsigned NumConstIncrements = 0;
2605  unsigned NumVarIncrements = 0;
2606  unsigned NumReusedIncrements = 0;
2607  for (const IVInc &Inc : Chain) {
2608  if (Inc.IncExpr->isZero())
2609  continue;
2610 
2611  // Incrementing by zero or some constant is neutral. We assume constants can
2612  // be folded into an addressing mode or an add's immediate operand.
2613  if (isa<SCEVConstant>(Inc.IncExpr)) {
2614  ++NumConstIncrements;
2615  continue;
2616  }
2617 
2618  if (Inc.IncExpr == LastIncExpr)
2619  ++NumReusedIncrements;
2620  else
2621  ++NumVarIncrements;
2622 
2623  LastIncExpr = Inc.IncExpr;
2624  }
2625  // An IV chain with a single increment is handled by LSR's postinc
2626  // uses. However, a chain with multiple increments requires keeping the IV's
2627  // value live longer than it needs to be if chained.
2628  if (NumConstIncrements > 1)
2629  --cost;
2630 
2631  // Materializing increment expressions in the preheader that didn't exist in
2632  // the original code may cost a register. For example, sign-extended array
2633  // indices can produce ridiculous increments like this:
2634  // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
2635  cost += NumVarIncrements;
2636 
2637  // Reusing variable increments likely saves a register to hold the multiple of
2638  // the stride.
2639  cost -= NumReusedIncrements;
2640 
2641  DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
2642  << "\n");
2643 
2644  return cost < 0;
2645 }
2646 
2647 /// Add this IV user to an existing chain or make it the head of a new chain.
2648 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2649  SmallVectorImpl<ChainUsers> &ChainUsersVec) {
2650  // When IVs are used as types of varying widths, they are generally converted
2651  // to a wider type with some uses remaining narrow under a (free) trunc.
2652  Value *const NextIV = getWideOperand(IVOper);
2653  const SCEV *const OperExpr = SE.getSCEV(NextIV);
2654  const SCEV *const OperExprBase = getExprBase(OperExpr);
2655 
2656  // Visit all existing chains. Check if its IVOper can be computed as a
2657  // profitable loop invariant increment from the last link in the Chain.
2658  unsigned ChainIdx = 0, NChains = IVChainVec.size();
2659  const SCEV *LastIncExpr = nullptr;
2660  for (; ChainIdx < NChains; ++ChainIdx) {
2661  IVChain &Chain = IVChainVec[ChainIdx];
2662 
2663  // Prune the solution space aggressively by checking that both IV operands
2664  // are expressions that operate on the same unscaled SCEVUnknown. This
2665  // "base" will be canceled by the subsequent getMinusSCEV call. Checking
2666  // first avoids creating extra SCEV expressions.
2667  if (!StressIVChain && Chain.ExprBase != OperExprBase)
2668  continue;
2669 
2670  Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
2671  if (!isCompatibleIVType(PrevIV, NextIV))
2672  continue;
2673 
2674  // A phi node terminates a chain.
2675  if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
2676  continue;
2677 
2678  // The increment must be loop-invariant so it can be kept in a register.
2679  const SCEV *PrevExpr = SE.getSCEV(PrevIV);
2680  const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
2681  if (!SE.isLoopInvariant(IncExpr, L))
2682  continue;
2683 
2684  if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
2685  LastIncExpr = IncExpr;
2686  break;
2687  }
2688  }
2689  // If we haven't found a chain, create a new one, unless we hit the max. Don't
2690  // bother for phi nodes, because they must be last in the chain.
2691  if (ChainIdx == NChains) {
2692  if (isa<PHINode>(UserInst))
2693  return;
2694  if (NChains >= MaxChains && !StressIVChain) {
2695  DEBUG(dbgs() << "IV Chain Limit\n");
2696  return;
2697  }
2698  LastIncExpr = OperExpr;
2699  // IVUsers may have skipped over sign/zero extensions. We don't currently
2700  // attempt to form chains involving extensions unless they can be hoisted
2701  // into this loop's AddRec.
2702  if (!isa<SCEVAddRecExpr>(LastIncExpr))
2703  return;
2704  ++NChains;
2705  IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
2706  OperExprBase));
2707  ChainUsersVec.resize(NChains);
2708  DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
2709  << ") IV=" << *LastIncExpr << "\n");
2710  } else {
2711  DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
2712  << ") IV+" << *LastIncExpr << "\n");
2713  // Add this IV user to the end of the chain.
2714  IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
2715  }
2716  IVChain &Chain = IVChainVec[ChainIdx];
2717 
2718  SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
2719  // This chain's NearUsers become FarUsers.
2720  if (!LastIncExpr->isZero()) {
2721  ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
2722  NearUsers.end());
2723  NearUsers.clear();
2724  }
2725 
2726  // All other uses of IVOperand become near uses of the chain.
2727  // We currently ignore intermediate values within SCEV expressions, assuming
2728  // they will eventually be used by the current chain, or can be computed
2729  // from one of the chain increments. To be more precise we could
2730  // transitively follow its user and only add leaf IV users to the set.
2731  for (User *U : IVOper->users()) {
2732  Instruction *OtherUse = dyn_cast<Instruction>(U);
2733  if (!OtherUse)
2734  continue;
2735  // Uses in the chain will no longer be uses if the chain is formed.
2736  // Include the head of the chain in this iteration (not Chain.begin()).
2737  IVChain::const_iterator IncIter = Chain.Incs.begin();
2738  IVChain::const_iterator IncEnd = Chain.Incs.end();
2739  for( ; IncIter != IncEnd; ++IncIter) {
2740  if (IncIter->UserInst == OtherUse)
2741  break;
2742  }
2743  if (IncIter != IncEnd)
2744  continue;
2745 
2746  if (SE.isSCEVable(OtherUse->getType())
2747  && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
2748  && IU.isIVUserOrOperand(OtherUse)) {
2749  continue;
2750  }
2751  NearUsers.insert(OtherUse);
2752  }
2753 
2754  // Since this user is part of the chain, it's no longer considered a use
2755  // of the chain.
2756  ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
2757 }
2758 
2759 /// Populate the vector of Chains.
2760 ///
2761 /// This decreases ILP at the architecture level. Targets with ample registers,
2762 /// multiple memory ports, and no register renaming probably don't want
2763 /// this. However, such targets should probably disable LSR altogether.
2764 ///
2765 /// The job of LSR is to make a reasonable choice of induction variables across
2766 /// the loop. Subsequent passes can easily "unchain" computation exposing more
2767 /// ILP *within the loop* if the target wants it.
2768 ///
2769 /// Finding the best IV chain is potentially a scheduling problem. Since LSR
2770 /// will not reorder memory operations, it will recognize this as a chain, but
2771 /// will generate redundant IV increments. Ideally this would be corrected later
2772 /// by a smart scheduler:
2773 /// = A[i]
2774 /// = A[i+x]
2775 /// A[i] =
2776 /// A[i+x] =
2777 ///
2778 /// TODO: Walk the entire domtree within this loop, not just the path to the
2779 /// loop latch. This will discover chains on side paths, but requires
2780 /// maintaining multiple copies of the Chains state.
2781 void LSRInstance::CollectChains() {
2782  DEBUG(dbgs() << "Collecting IV Chains.\n");
2783  SmallVector<ChainUsers, 8> ChainUsersVec;
2784 
2785  SmallVector<BasicBlock *,8> LatchPath;
2786  BasicBlock *LoopHeader = L->getHeader();
2787  for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
2788  Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
2789  LatchPath.push_back(Rung->getBlock());
2790  }
2791  LatchPath.push_back(LoopHeader);
2792 
2793  // Walk the instruction stream from the loop header to the loop latch.
2794  for (BasicBlock *BB : reverse(LatchPath)) {
2795  for (Instruction &I : *BB) {
2796  // Skip instructions that weren't seen by IVUsers analysis.
2797  if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
2798  continue;
2799 
2800  // Ignore users that are part of a SCEV expression. This way we only
2801  // consider leaf IV Users. This effectively rediscovers a portion of
2802  // IVUsers analysis but in program order this time.
2803  if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
2804  continue;
2805 
2806  // Remove this instruction from any NearUsers set it may be in.
2807  for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
2808  ChainIdx < NChains; ++ChainIdx) {
2809  ChainUsersVec[ChainIdx].NearUsers.erase(&I);
2810  }
2811  // Search for operands that can be chained.
2812  SmallPtrSet<Instruction*, 4> UniqueOperands;
2813  User::op_iterator IVOpEnd = I.op_end();
2814  User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
2815  while (IVOpIter != IVOpEnd) {
2816  Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
2817  if (UniqueOperands.insert(IVOpInst).second)
2818  ChainInstruction(&I, IVOpInst, ChainUsersVec);
2819  IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
2820  }
2821  } // Continue walking down the instructions.
2822  } // Continue walking down the domtree.
2823  // Visit phi backedges to determine if the chain can generate the IV postinc.
2824  for (BasicBlock::iterator I = L->getHeader()->begin();
2825  PHINode *PN = dyn_cast<PHINode>(I); ++I) {
2826  if (!SE.isSCEVable(PN->getType()))
2827  continue;
2828 
2829  Instruction *IncV =
2830  dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch()));
2831  if (IncV)
2832  ChainInstruction(PN, IncV, ChainUsersVec);
2833  }
2834  // Remove any unprofitable chains.
2835  unsigned ChainIdx = 0;
2836  for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
2837  UsersIdx < NChains; ++UsersIdx) {
2838  if (!isProfitableChain(IVChainVec[UsersIdx],
2839  ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
2840  continue;
2841  // Preserve the chain at UsersIdx.
2842  if (ChainIdx != UsersIdx)
2843  IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
2844  FinalizeChain(IVChainVec[ChainIdx]);
2845  ++ChainIdx;
2846  }
2847  IVChainVec.resize(ChainIdx);
2848 }
2849 
2850 void LSRInstance::FinalizeChain(IVChain &Chain) {
2851  assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
2852  DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
2853 
2854  for (const IVInc &Inc : Chain) {
2855  DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
2856  auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
2857  assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
2858  IVIncSet.insert(UseI);
2859  }
2860 }
2861 
2862 /// Return true if the IVInc can be folded into an addressing mode.
2863 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
2864  Value *Operand, const TargetTransformInfo &TTI) {
2865  const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
2866  if (!IncConst || !isAddressUse(UserInst, Operand))
2867  return false;
2868 
2869  if (IncConst->getAPInt().getMinSignedBits() > 64)
2870  return false;
2871 
2872  MemAccessTy AccessTy = getAccessType(UserInst);
2873  int64_t IncOffset = IncConst->getValue()->getSExtValue();
2874  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
2875  IncOffset, /*HaseBaseReg=*/false))
2876  return false;
2877 
2878  return true;
2879 }
2880 
2881 /// Generate an add or subtract for each IVInc in a chain to materialize the IV
2882 /// user's operand from the previous IV user's operand.
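// Illustrative sketch (added for exposition; value names are made up): for a
// chain whose links step by a loop-invariant %step that cannot be folded into
// the user's addressing mode, each such link is materialized roughly as
//   %iv.chain = add i64 %iv.prev, %step
// and the user's old IV operand is replaced by %iv.chain; constant steps that
// canFoldIVIncExpr accepts are instead left to the addressing mode.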
2883 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
2884  SmallVectorImpl<WeakVH> &DeadInsts) {
2885  // Find the new IVOperand for the head of the chain. It may have been replaced
2886  // by LSR.
2887  const IVInc &Head = Chain.Incs[0];
2888  User::op_iterator IVOpEnd = Head.UserInst->op_end();
2889  // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
2890  User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
2891  IVOpEnd, L, SE);
2892  Value *IVSrc = nullptr;
2893  while (IVOpIter != IVOpEnd) {
2894  IVSrc = getWideOperand(*IVOpIter);
2895 
2896  // If this operand computes the expression that the chain needs, we may use
2897  // it. (Check this after setting IVSrc which is used below.)
2898  //
2899  // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
2900  // narrow for the chain, so we can no longer use it. We do allow using a
2901  // wider phi, assuming the LSR checked for free truncation. In that case we
2902  // should already have a truncate on this operand such that
2903  // getSCEV(IVSrc) == IncExpr.
2904  if (SE.getSCEV(*IVOpIter) == Head.IncExpr
2905  || SE.getSCEV(IVSrc) == Head.IncExpr) {
2906  break;
2907  }
2908  IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
2909  }
2910  if (IVOpIter == IVOpEnd) {
2911  // Gracefully give up on this chain.
2912  DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
2913  return;
2914  }
2915 
2916  DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
2917  Type *IVTy = IVSrc->getType();
2918  Type *IntTy = SE.getEffectiveSCEVType(IVTy);
2919  const SCEV *LeftOverExpr = nullptr;
2920  for (const IVInc &Inc : Chain) {
2921  Instruction *InsertPt = Inc.UserInst;
2922  if (isa<PHINode>(InsertPt))
2923  InsertPt = L->getLoopLatch()->getTerminator();
2924 
2925  // IVOper will replace the current IV User's operand. IVSrc is the IV
2926  // value currently held in a register.
2927  Value *IVOper = IVSrc;
2928  if (!Inc.IncExpr->isZero()) {
2929  // IncExpr was the result of subtraction of two narrow values, so must
2930  // be signed.
2931  const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
2932  LeftOverExpr = LeftOverExpr ?
2933  SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
2934  }
2935  if (LeftOverExpr && !LeftOverExpr->isZero()) {
2936  // Expand the IV increment.
2937  Rewriter.clearPostInc();
2938  Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
2939  const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
2940  SE.getUnknown(IncV));
2941  IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
2942 
2943  // If an IV increment can't be folded, use it as the next IV value.
2944  if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
2945  assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
2946  IVSrc = IVOper;
2947  LeftOverExpr = nullptr;
2948  }
2949  }
2950  Type *OperTy = Inc.IVOperand->getType();
2951  if (IVTy != OperTy) {
2952  assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
2953  "cannot extend a chained IV");
2954  IRBuilder<> Builder(InsertPt);
2955  IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
2956  }
2957  Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
2958  DeadInsts.emplace_back(Inc.IVOperand);
2959  }
2960  // If LSR created a new, wider phi, we may also replace its postinc. We only
2961  // do this if we also found a wide value for the head of the chain.
2962  if (isa<PHINode>(Chain.tailUserInst())) {
2963  for (BasicBlock::iterator I = L->getHeader()->begin();
2964  PHINode *Phi = dyn_cast<PHINode>(I); ++I) {
2965  if (!isCompatibleIVType(Phi, IVSrc))
2966  continue;
2967  Instruction *PostIncV = dyn_cast<Instruction>(
2968  Phi->getIncomingValueForBlock(L->getLoopLatch()));
2969  if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
2970  continue;
2971  Value *IVOper = IVSrc;
2972  Type *PostIncTy = PostIncV->getType();
2973  if (IVTy != PostIncTy) {
2974  assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
2975  IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
2976  Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
2977  IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
2978  }
2979  Phi->replaceUsesOfWith(PostIncV, IVOper);
2980  DeadInsts.emplace_back(PostIncV);
2981  }
2982  }
2983 }
2984 
2985 void LSRInstance::CollectFixupsAndInitialFormulae() {
2986  for (const IVStrideUse &U : IU) {
2987  Instruction *UserInst = U.getUser();
2988  // Skip IV users that are part of profitable IV Chains.
2989  User::op_iterator UseI =
2990  find(UserInst->operands(), U.getOperandValToReplace());
2991  assert(UseI != UserInst->op_end() && "cannot find IV operand");
2992  if (IVIncSet.count(UseI))
2993  continue;
2994 
2995  LSRUse::KindType Kind = LSRUse::Basic;
2996  MemAccessTy AccessTy;
2997  if (isAddressUse(UserInst, U.getOperandValToReplace())) {
2998  Kind = LSRUse::Address;
2999  AccessTy = getAccessType(UserInst);
3000  }
3001 
3002  const SCEV *S = IU.getExpr(U);
3003  PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3004 
3005  // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3006  // (N - i == 0), and this allows (N - i) to be the expression that we work
3007  // with rather than just N or i, so we can consider the register
3008  // requirements for both N and i at the same time. Limiting this code to
3009  // equality icmps is not a problem because all interesting loops use
3010  // equality icmps, thanks to IndVarSimplify.
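// Worked example (added for exposition): for "icmp eq i64 %i, %n" with a
// loop-invariant %n, the use is recorded below as an ICmpZero use whose
// expression is (%n - {0,+,1}), i.e. the compare is treated as "(%n - i) == 0",
// letting LSR weigh register pressure for %n and the IV together.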
3011  if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
3012  if (CI->isEquality()) {
3013  // Swap the operands if needed to put the OperandValToReplace on the
3014  // left, for consistency.
3015  Value *NV = CI->getOperand(1);
3016  if (NV == U.getOperandValToReplace()) {
3017  CI->setOperand(1, CI->getOperand(0));
3018  CI->setOperand(0, NV);
3019  NV = CI->getOperand(1);
3020  Changed = true;
3021  }
3022 
3023  // x == y --> x - y == 0
3024  const SCEV *N = SE.getSCEV(NV);
3025  if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
3026  // S is normalized, so normalize N before folding it into S
3027  // to keep the result normalized.
3028  N = TransformForPostIncUse(Normalize, N, CI, nullptr,
3029  TmpPostIncLoops, SE, DT);
3030  Kind = LSRUse::ICmpZero;
3031  S = SE.getMinusSCEV(N, S);
3032  }
3033 
3034  // -1 and the negations of all interesting strides (except the negation
3035  // of -1) are now also interesting.
3036  for (size_t i = 0, e = Factors.size(); i != e; ++i)
3037  if (Factors[i] != -1)
3038  Factors.insert(-(uint64_t)Factors[i]);
3039  Factors.insert(-1);
3040  }
3041 
3042  // Get or create an LSRUse.
3043  std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
3044  size_t LUIdx = P.first;
3045  int64_t Offset = P.second;
3046  LSRUse &LU = Uses[LUIdx];
3047 
3048  // Record the fixup.
3049  LSRFixup &LF = LU.getNewFixup();
3050  LF.UserInst = UserInst;
3051  LF.OperandValToReplace = U.getOperandValToReplace();
3052  LF.PostIncLoops = TmpPostIncLoops;
3053  LF.Offset = Offset;
3054  LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3055 
3056  if (!LU.WidestFixupType ||
3057  SE.getTypeSizeInBits(LU.WidestFixupType) <
3058  SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3059  LU.WidestFixupType = LF.OperandValToReplace->getType();
3060 
3061  // If this is the first use of this LSRUse, give it a formula.
3062  if (LU.Formulae.empty()) {
3063  InsertInitialFormula(S, LU, LUIdx);
3064  CountRegisters(LU.Formulae.back(), LUIdx);
3065  }
3066  }
3067 
3068  DEBUG(print_fixups(dbgs()));
3069 }
3070 
3071 /// Insert a formula for the given expression into the given use, separating out
3072 /// loop-variant portions from loop-invariant and loop-computable portions.
3073 void
3074 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
3075  // Mark uses whose expressions cannot be expanded.
3076  if (!isSafeToExpand(S, SE))
3077  LU.RigidFormula = true;
3078 
3079  Formula F;
3080  F.initialMatch(S, L, SE);
3081  bool Inserted = InsertFormula(LU, LUIdx, F);
3082  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3083 }
3084 
3085 /// Insert a simple single-register formula for the given expression into the
3086 /// given use.
3087 void
3088 LSRInstance::InsertSupplementalFormula(const SCEV *S,
3089  LSRUse &LU, size_t LUIdx) {
3090  Formula F;
3091  F.BaseRegs.push_back(S);
3092  F.HasBaseReg = true;
3093  bool Inserted = InsertFormula(LU, LUIdx, F);
3094  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3095 }
3096 
3097 /// Note which registers are used by the given formula, updating RegUses.
3098 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3099  if (F.ScaledReg)
3100  RegUses.countRegister(F.ScaledReg, LUIdx);
3101  for (const SCEV *BaseReg : F.BaseRegs)
3102  RegUses.countRegister(BaseReg, LUIdx);
3103 }
3104 
3105 /// If the given formula has not yet been inserted, add it to the list, and
3106 /// return true. Return false otherwise.
3107 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3108  // Do not insert a formula that we will not be able to expand.
3109  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3110  "Formula is illegal");
3111  if (!LU.InsertFormula(F))
3112  return false;
3113 
3114  CountRegisters(F, LUIdx);
3115  return true;
3116 }
3117 
3118 /// Check for other uses of loop-invariant values which we're tracking. These
3119 /// other uses will pin these values in registers, making them less profitable
3120 /// for elimination.
3121 /// TODO: This currently misses non-constant addrec step registers.
3122 /// TODO: Should this give more weight to users inside the loop?
3123 void
3124 LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3125  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3126  SmallPtrSet<const SCEV *, 32> Visited;
3127 
3128  while (!Worklist.empty()) {
3129  const SCEV *S = Worklist.pop_back_val();
3130 
3131  // Don't process the same SCEV twice
3132  if (!Visited.insert(S).second)
3133  continue;
3134 
3135  if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3136  Worklist.append(N->op_begin(), N->op_end());
3137  else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
3138  Worklist.push_back(C->getOperand());
3139  else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3140  Worklist.push_back(D->getLHS());
3141  Worklist.push_back(D->getRHS());
3142  } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3143  const Value *V = US->getValue();
3144  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3145  // Look for instructions defined outside the loop.
3146  if (L->contains(Inst)) continue;
3147  } else if (isa<UndefValue>(V))
3148  // Undef doesn't have a live range, so it doesn't matter.
3149  continue;
3150  for (const Use &U : V->uses()) {
3151  const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3152  // Ignore non-instructions.
3153  if (!UserInst)
3154  continue;
3155  // Ignore instructions in other functions (as can happen with
3156  // Constants).
3157  if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3158  continue;
3159  // Ignore instructions not dominated by the loop.
3160  const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3161  UserInst->getParent() :
3162  cast<PHINode>(UserInst)->getIncomingBlock(
3163  PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3164  if (!DT.dominates(L->getHeader(), UseBB))
3165  continue;
3166  // Don't bother if the instruction is in a BB which ends in an EHPad.
3167  if (UseBB->getTerminator()->isEHPad())
3168  continue;
3169  // Don't bother rewriting PHIs in catchswitch blocks.
3170  if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3171  continue;
3172  // Ignore uses which are part of other SCEV expressions, to avoid
3173  // analyzing them multiple times.
3174  if (SE.isSCEVable(UserInst->getType())) {
3175  const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3176  // If the user is a no-op, look through to its uses.
3177  if (!isa<SCEVUnknown>(UserS))
3178  continue;
3179  if (UserS == US) {
3180  Worklist.push_back(
3181  SE.getUnknown(const_cast<Instruction *>(UserInst)));
3182  continue;
3183  }
3184  }
3185  // Ignore icmp instructions which are already being analyzed.
3186  if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3187  unsigned OtherIdx = !U.getOperandNo();
3188  Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3189  if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3190  continue;
3191  }
3192 
3193  std::pair<size_t, int64_t> P = getUse(
3194  S, LSRUse::Basic, MemAccessTy());
3195  size_t LUIdx = P.first;
3196  int64_t Offset = P.second;
3197  LSRUse &LU = Uses[LUIdx];
3198  LSRFixup &LF = LU.getNewFixup();
3199  LF.UserInst = const_cast<Instruction *>(UserInst);
3200  LF.OperandValToReplace = U;
3201  LF.Offset = Offset;
3202  LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3203  if (!LU.WidestFixupType ||
3204  SE.getTypeSizeInBits(LU.WidestFixupType) <
3205  SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3206  LU.WidestFixupType = LF.OperandValToReplace->getType();
3207  InsertSupplementalFormula(US, LU, LUIdx);
3208  CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3209  break;
3210  }
3211  }
3212  }
3213 }
3214 
3215 /// Split S into subexpressions which can be pulled out into separate
3216 /// registers. If C is non-null, multiply each subexpression by C.
3217 ///
3218 /// Return remainder expression after factoring the subexpressions captured by
3219 /// Ops. If Ops is complete, return NULL.
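///
/// A hypothetical example: given {(%a + 4),+,1}<%L>, the subexpressions %a and 4
/// are pushed onto Ops and the remainder {0,+,1}<%L> is returned for the caller
/// to keep as a register of its own.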
3220 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3221  SmallVectorImpl<const SCEV *> &Ops,
3222  const Loop *L,
3223  ScalarEvolution &SE,
3224  unsigned Depth = 0) {
3225  // Arbitrarily cap recursion to protect compile time.
3226  if (Depth >= 3)
3227  return S;
3228 
3229  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3230  // Break out add operands.
3231  for (const SCEV *S : Add->operands()) {
3232  const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3233  if (Remainder)
3234  Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3235  }
3236  return nullptr;
3237  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3238  // Split a non-zero base out of an addrec.
3239  if (AR->getStart()->isZero() || !AR->isAffine())
3240  return S;
3241 
3242  const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3243  C, Ops, L, SE, Depth+1);
3244  // Split the non-zero AddRec unless it is part of a nested recurrence that
3245  // does not pertain to this loop.
3246  if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3247  Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3248  Remainder = nullptr;
3249  }
3250  if (Remainder != AR->getStart()) {
3251  if (!Remainder)
3252  Remainder = SE.getConstant(AR->getType(), 0);
3253  return SE.getAddRecExpr(Remainder,
3254  AR->getStepRecurrence(SE),
3255  AR->getLoop(),
3256  //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3257  SCEV::FlagAnyWrap);
3258  }
3259  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3260  // Break (C * (a + b + c)) into C*a + C*b + C*c.
3261  if (Mul->getNumOperands() != 2)
3262  return S;
3263  if (const SCEVConstant *Op0 =
3264  dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3265  C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3266  const SCEV *Remainder =
3267  CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3268  if (Remainder)
3269  Ops.push_back(SE.getMulExpr(C, Remainder));
3270  return nullptr;
3271  }
3272  }
3273  return S;
3274 }
3275 
3276 /// \brief Helper function for LSRInstance::GenerateReassociations.
3277 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3278  const Formula &Base,
3279  unsigned Depth, size_t Idx,
3280  bool IsScaledReg) {
3281  const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3282  SmallVector<const SCEV *, 8> AddOps;
3283  const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3284  if (Remainder)
3285  AddOps.push_back(Remainder);
3286 
3287  if (AddOps.size() == 1)
3288  return;
3289 
3290  for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3291  JE = AddOps.end();
3292  J != JE; ++J) {
3293 
3294  // Loop-variant "unknown" values are uninteresting; we won't be able to
3295  // do anything meaningful with them.
3296  if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3297  continue;
3298 
3299  // Don't pull a constant into a register if the constant could be folded
3300  // into an immediate field.
3301  if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3302  LU.AccessTy, *J, Base.getNumRegs() > 1))
3303  continue;
3304 
3305  // Collect all operands except *J.
3306  SmallVector<const SCEV *, 8> InnerAddOps(
3307  ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3308  InnerAddOps.append(std::next(J),
3309  ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3310 
3311  // Don't leave just a constant behind in a register if the constant could
3312  // be folded into an immediate field.
3313  if (InnerAddOps.size() == 1 &&
3314  isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3315  LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3316  continue;
3317 
3318  const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3319  if (InnerSum->isZero())
3320  continue;
3321  Formula F = Base;
3322 
3323  // Add the remaining pieces of the add back into the new formula.
3324  const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3325  if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3326  TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3327  InnerSumSC->getValue()->getZExtValue())) {
3328  F.UnfoldedOffset =
3329  (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
3330  if (IsScaledReg)
3331  F.ScaledReg = nullptr;
3332  else
3333  F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3334  } else if (IsScaledReg)
3335  F.ScaledReg = InnerSum;
3336  else
3337  F.BaseRegs[Idx] = InnerSum;
3338 
3339  // Add J as its own register, or an unfolded immediate.
3340  const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3341  if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3342  TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
3343  SC->getValue()->getZExtValue()))
3344  F.UnfoldedOffset =
3345  (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
3346  else
3347  F.BaseRegs.push_back(*J);
3348  // We may have changed the number of registers in base regs; adjust the
3349  // formula accordingly.
3350  F.canonicalize();
3351 
3352  if (InsertFormula(LU, LUIdx, F))
3353  // If that formula hadn't been seen before, recurse to find more like
3354  // it.
3355  GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1);
3356  }
3357 }
3358 
3359 /// Split out subexpressions from adds and the bases of addrecs.
3360 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
3361  Formula Base, unsigned Depth) {
3362  assert(Base.isCanonical() && "Input must be in the canonical form");
3363  // Arbitrarily cap recursion to protect compile time.
3364  if (Depth >= 3)
3365  return;
3366 
3367  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3368  GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
3369 
3370  if (Base.Scale == 1)
3371  GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
3372  /* Idx */ -1, /* IsScaledReg */ true);
3373 }
3374 
3375 /// Generate a formula consisting of all of the loop-dominating registers added
3376 /// into a single register.
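///
/// Hypothetical example: reg(%a) + reg(%b) + reg({0,+,1}<%L>), with %a and %b
/// loop-invariant, becomes reg(%a + %b) + reg({0,+,1}<%L>), trading two
/// registers for one.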
3377 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
3378  Formula Base) {
3379  // This method is only interesting on a plurality of registers.
3380  if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1)
3381  return;
3382 
3383  // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
3384  // processing the formula.
3385  Base.unscale();
3386  Formula F = Base;
3387  F.BaseRegs.clear();
3388  SmallVector<const SCEV *, 4> Ops;
3389  for (const SCEV *BaseReg : Base.BaseRegs) {
3390  if (SE.properlyDominates(BaseReg, L->getHeader()) &&
3391  !SE.hasComputableLoopEvolution(BaseReg, L))
3392  Ops.push_back(BaseReg);
3393  else
3394  F.BaseRegs.push_back(BaseReg);
3395  }
3396  if (Ops.size() > 1) {
3397  const SCEV *Sum = SE.getAddExpr(Ops);
3398  // TODO: If Sum is zero, it probably means ScalarEvolution missed an
3399  // opportunity to fold something. For now, just ignore such cases
3400  // rather than proceed with zero in a register.
3401  if (!Sum->isZero()) {
3402  F.BaseRegs.push_back(Sum);
3403  F.canonicalize();
3404  (void)InsertFormula(LU, LUIdx, F);
3405  }
3406  }
3407 }
3408 
3409 /// \brief Helper function for LSRInstance::GenerateSymbolicOffsets.
3410 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
3411  const Formula &Base, size_t Idx,
3412  bool IsScaledReg) {
3413  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3414  GlobalValue *GV = ExtractSymbol(G, SE);
3415  if (G->isZero() || !GV)
3416  return;
3417  Formula F = Base;
3418  F.BaseGV = GV;
3419  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3420  return;
3421  if (IsScaledReg)
3422  F.ScaledReg = G;
3423  else
3424  F.BaseRegs[Idx] = G;
3425  (void)InsertFormula(LU, LUIdx, F);
3426 }
3427 
3428 /// Generate reuse formulae using symbolic offsets.
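///
/// For example, a base register of the form (@global + %x) can be rewritten as
/// the register %x with BaseGV set to @global, letting the symbol be matched as
/// part of the addressing mode rather than held in a register expression.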
3429 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
3430  Formula Base) {
3431  // We can't add a symbolic offset if the address already contains one.
3432  if (Base.BaseGV) return;
3433 
3434  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3435  GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
3436  if (Base.Scale == 1)
3437  GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
3438  /* IsScaledReg */ true);
3439 }
3440 
3441 /// \brief Helper function for LSRInstance::GenerateConstantOffsets.
3442 void LSRInstance::GenerateConstantOffsetsImpl(
3443  LSRUse &LU, unsigned LUIdx, const Formula &Base,
3444  const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
3445  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3446  for (int64_t Offset : Worklist) {
3447  Formula F = Base;
3448  F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
3449  if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
3450  LU.AccessTy, F)) {
3451  // Add the offset to the base register.
3452  const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
3453  // If it cancelled out, drop the base register, otherwise update it.
3454  if (NewG->isZero()) {
3455  if (IsScaledReg) {
3456  F.Scale = 0;
3457  F.ScaledReg = nullptr;
3458  } else
3459  F.deleteBaseReg(F.BaseRegs[Idx]);
3460  F.canonicalize();
3461  } else if (IsScaledReg)
3462  F.ScaledReg = NewG;
3463  else
3464  F.BaseRegs[Idx] = NewG;
3465 
3466  (void)InsertFormula(LU, LUIdx, F);
3467  }
3468  }
3469 
3470  int64_t Imm = ExtractImmediate(G, SE);
3471  if (G->isZero() || Imm == 0)
3472  return;
3473  Formula F = Base;
3474  F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
3475  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
3476  return;
3477  if (IsScaledReg)
3478  F.ScaledReg = G;
3479  else
3480  F.BaseRegs[Idx] = G;
3481  (void)InsertFormula(LU, LUIdx, F);
3482 }
3483 
3484 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
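///
/// For example, with a worklist offset of 64, a base register G is rewritten as
/// (G + 64) while 64 is subtracted from the formula's BaseOffset, so uses at
/// nearby offsets can share the adjusted register.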
3485 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
3486  Formula Base) {
3487  // TODO: For now, just add the min and max offset, because it usually isn't
3488  // worthwhile looking at everything in between.
3489  SmallVector<int64_t, 2> Worklist;
3490  Worklist.push_back(LU.MinOffset);
3491  if (LU.MaxOffset != LU.MinOffset)
3492  Worklist.push_back(LU.MaxOffset);
3493 
3494  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3495  GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
3496  if (Base.Scale == 1)
3497  GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
3498  /* IsScaledReg */ true);
3499 }
3500 
3501 /// For ICmpZero, check to see if we can scale up the comparison. For example, x
3502 /// == y -> x*c == y*c.
3503 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
3504  Formula Base) {
3505  if (LU.Kind != LSRUse::ICmpZero) return;
3506 
3507  // Determine the integer type for the base formula.
3508  Type *IntTy = Base.getType();
3509  if (!IntTy) return;
3510  if (SE.getTypeSizeInBits(IntTy) > 64) return;
3511 
3512  // Don't do this if there is more than one offset.
3513  if (LU.MinOffset != LU.MaxOffset) return;
3514 
3515  assert(!Base.BaseGV && "ICmpZero use is not legal!");
3516 
3517  // Check each interesting stride.
3518  for (int64_t Factor : Factors) {
3519  // Check that the multiplication doesn't overflow.
3520  if (Base.BaseOffset == INT64_MIN && Factor == -1)
3521  continue;
3522  int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
3523  if (NewBaseOffset / Factor != Base.BaseOffset)
3524  continue;
3525  // If the offset will be truncated at this use, check that it is in bounds.
3526  if (!IntTy->isPointerTy() &&
3527  !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
3528  continue;
3529 
3530  // Check that multiplying with the use offset doesn't overflow.
3531  int64_t Offset = LU.MinOffset;
3532  if (Offset == INT64_MIN && Factor == -1)
3533  continue;
3534  Offset = (uint64_t)Offset * Factor;
3535  if (Offset / Factor != LU.MinOffset)
3536  continue;
3537  // If the offset will be truncated at this use, check that it is in bounds.
3538  if (!IntTy->isPointerTy() &&
3539  !ConstantInt::isValueValidForType(IntTy, Offset))
3540  continue;
3541 
3542  Formula F = Base;
3543  F.BaseOffset = NewBaseOffset;
3544 
3545  // Check that this scale is legal.
3546  if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
3547  continue;
3548 
3549  // Compensate for the use having MinOffset built into it.
3550  F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
3551 
3552  const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3553 
3554  // Check that multiplying with each base register doesn't overflow.
3555  for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
3556  F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
3557  if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
3558  goto next;
3559  }
3560 
3561  // Check that multiplying with the scaled register doesn't overflow.
3562  if (F.ScaledReg) {
3563  F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
3564  if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
3565  continue;
3566  }
3567 
3568  // Check that multiplying with the unfolded offset doesn't overflow.
3569  if (F.UnfoldedOffset != 0) {
3570  if (F.UnfoldedOffset == INT64_MIN && Factor == -1)
3571  continue;
3572  F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
3573  if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
3574  continue;
3575  // If the offset will be truncated, check that it is in bounds.
3576  if (!IntTy->isPointerTy() &&
3577  !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
3578  continue;
3579  }
3580 
3581  // If we make it here and it's legal, add it.
3582  (void)InsertFormula(LU, LUIdx, F);
3583  next:;
3584  }
3585 }
3586 
3587 /// Generate stride factor reuse formulae, for example by making use of
3588 /// scaled-offset addressing modes.
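///
/// Hypothetical example: with an interesting factor of 4, the base register
/// {0,+,4}<%L> can be rewritten as the scaled register {0,+,1}<%L> with
/// Scale = 4, which targets with scaled-index addressing can fold into the
/// memory operand.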
3589 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
3590  // Determine the integer type for the base formula.
3591  Type *IntTy = Base.getType();
3592  if (!IntTy) return;
3593 
3594  // If this Formula already has a scaled register, we can't add another one.
3595  // Try to unscale the formula to generate a better scale.
3596  if (Base.Scale != 0 && !Base.unscale())
3597  return;
3598 
3599  assert(Base.Scale == 0 && "unscale did not do its job!");
3600 
3601  // Check each interesting stride.
3602  for (int64_t Factor : Factors) {
3603  Base.Scale = Factor;
3604  Base.HasBaseReg = Base.BaseRegs.size() > 1;
3605  // Check whether this scale is going to be legal.
3606  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3607  Base)) {
3608  // As a special case, convert out-of-loop Basic users to Special users.
3609  // TODO: Reconsider this special case.
3610  if (LU.Kind == LSRUse::Basic &&
3611  isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
3612  LU.AccessTy, Base) &&
3613  LU.AllFixupsOutsideLoop)
3614  LU.Kind = LSRUse::Special;
3615  else
3616  continue;
3617  }
3618  // For an ICmpZero, negating a solitary base register won't lead to
3619  // new solutions.
3620  if (LU.Kind == LSRUse::ICmpZero &&
3621  !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
3622  continue;
3623  // For each addrec base reg, apply the scale, if possible.
3624  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
3625  if (const SCEVAddRecExpr *AR =
3626  dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
3627  const SCEV *FactorS = SE.getConstant(IntTy, Factor);
3628  if (FactorS->isZero())
3629  continue;
3630  // Divide out the factor, ignoring high bits, since we'll be
3631  // scaling the value back up in the end.
3632  if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
3633  // TODO: This could be optimized to avoid all the copying.
3634  Formula F = Base;
3635  F.ScaledReg = Quotient;
3636  F.deleteBaseReg(F.BaseRegs[i]);
3637  // The canonical representation of 1*reg is reg, which is already in
3638  // Base. In that case, do not try to insert the formula, it will be
3639  // rejected anyway.
3640  if (F.Scale == 1 && F.BaseRegs.empty())
3641  continue;
3642  (void)InsertFormula(LU, LUIdx, F);
3643  }
3644  }
3645  }
3646 }
3647 
3648 /// Generate reuse formulae from different IV types.
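///
/// For example, if truncation from i64 to i32 is free on the target, an i32 use
/// can be given a formula whose registers are the any-extended i64 expressions,
/// allowing it to share a wider IV with i64 uses.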
3649 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
3650  // Don't bother truncating symbolic values.
3651  if (Base.BaseGV) return;
3652 
3653  // Determine the integer type for the base formula.
3654  Type *DstTy = Base.getType();
3655  if (!DstTy) return;
3656  DstTy = SE.getEffectiveSCEVType(DstTy);
3657 
3658  for (Type *SrcTy : Types) {
3659  if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
3660  Formula F = Base;
3661 
3662  if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
3663  for (const SCEV *&BaseReg : F.BaseRegs)
3664  BaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
3665 
3666  // TODO: This assumes we've done basic processing on all uses and
3667  // have an idea what the register usage is.
3668  if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
3669  continue;
3670 
3671  (void)InsertFormula(LU, LUIdx, F);
3672  }
3673  }
3674 }
3675 
3676 namespace {
3677 
3678 /// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
3679 /// modifications so that the search phase doesn't have to worry about the data
3680 /// structures moving underneath it.
3681 struct WorkItem {
3682  size_t LUIdx;
3683  int64_t Imm;
3684  const SCEV *OrigReg;
3685 
3686  WorkItem(size_t LI, int64_t I, const SCEV *R)
3687  : LUIdx(LI), Imm(I), OrigReg(R) {}
3688 
3689  void print(raw_ostream &OS) const;
3690  void dump() const;
3691 };
3692 
3693 } // end anonymous namespace
3694 
3695 void WorkItem::print(raw_ostream &OS) const {
3696  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
3697  << ", add offset " << Imm;
3698 }
3699 
3700 LLVM_DUMP_METHOD
3701 void WorkItem::dump() const {
3702  print(errs()); errs() << '\n';
3703 }
3704 
3705 /// Look for registers which are a constant distance apart and try to form reuse
3706 /// opportunities between them.
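///
/// Hypothetical example: if one use references {%p,+,4}<%L> and another
/// references {(%p + 8),+,4}<%L>, the second can be rewritten to reference
/// {%p,+,4}<%L> with a constant offset of 8 folded into its formula, so both
/// uses share a single register.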
3707 void LSRInstance::GenerateCrossUseConstantOffsets() {
3708  // Group the registers by their value without any added constant offset.
3709  typedef std::map<int64_t, const SCEV *> ImmMapTy;
3710  DenseMap<const SCEV *, ImmMapTy> Map;
3711  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
3712  SmallVector<const SCEV *, 8> Sequence;
3713  for (const SCEV *Use : RegUses) {
3714  const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
3715  int64_t Imm = ExtractImmediate(Reg, SE);
3716  auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
3717  if (Pair.second)
3718  Sequence.push_back(Reg);
3719  Pair.first->second.insert(std::make_pair(Imm, Use));
3720  UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
3721  }
3722 
3723  // Now examine each set of registers with the same base value. Build up
3724  // a list of work to do and do the work in a separate step so that we're
3725  // not adding formulae and register counts while we're searching.
3726  SmallVector<WorkItem, 32> WorkItems;
3727  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
3728  for (const SCEV *Reg : Sequence) {
3729  const ImmMapTy &Imms = Map.find(Reg)->second;
3730 
3731  // It's not worthwhile looking for reuse if there's only one offset.
3732  if (Imms.size() == 1)
3733  continue;
3734 
3735  DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
3736  for (const auto &Entry : Imms)
3737  dbgs() << ' ' << Entry.first;
3738  dbgs() << '\n');
3739 
3740  // Examine each offset.
3741  for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
3742  J != JE; ++J) {
3743  const SCEV *OrigReg = J->second;
3744 
3745  int64_t JImm = J->first;
3746  const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
3747 
3748  if (!isa<SCEVConstant>(OrigReg) &&
3749  UsedByIndicesMap[Reg].count() == 1) {
3750  DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
3751  continue;
3752  }
3753 
3754  // Conservatively examine offsets between this orig reg and a few selected
3755  // other orig regs.
3756  ImmMapTy::const_iterator OtherImms[] = {
3757  Imms.begin(), std::prev(Imms.end()),
3758  Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) /
3759  2)
3760  };
3761  for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
3762  ImmMapTy::const_iterator M = OtherImms[i];
3763  if (M == J || M == JE) continue;
3764 
3765  // Compute the difference between the two.
3766  int64_t Imm = (uint64_t)JImm - M->first;
3767  for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
3768  LUIdx = UsedByIndices.find_next(LUIdx))
3769  // Make a memo of this use, offset, and register tuple.
3770  if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
3771  WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
3772  }
3773  }
3774  }
3775 
3776  Map.clear();
3777  Sequence.clear();
3778  UsedByIndicesMap.clear();
3779  UniqueItems.clear();
3780 
3781  // Now iterate through the worklist and add new formulae.
3782  for (const WorkItem &WI : WorkItems) {
3783  size_t LUIdx = WI.LUIdx;
3784  LSRUse &LU = Uses[LUIdx];
3785  int64_t Imm = WI.Imm;
3786  const SCEV *OrigReg = WI.OrigReg;
3787 
3788  Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
3789  const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
3790  unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
3791 
3792  // TODO: Use a more targeted data structure.
3793  for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
3794  Formula F = LU.Formulae[L];
3795  // FIXME: The code for the scaled and unscaled registers looks
3796  // very similar but slightly different. Investigate if they
3797  // could be merged. That way, we would not have to unscale the
3798  // Formula.
3799  F.unscale();
3800  // Use the immediate in the scaled register.
3801  if (F.ScaledReg == OrigReg) {
3802  int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
3803  // Don't create 50 + reg(-50).
3804  if (F.referencesReg(SE.getSCEV(
3805  ConstantInt::get(IntTy, -(uint64_t)Offset))))
3806  continue;
3807  Formula NewF = F;
3808  NewF.BaseOffset = Offset;
3809  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
3810  NewF))
3811  continue;
3812  NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
3813 
3814  // If the new scale is a constant in a register, and adding the constant
3815  // value to the immediate would produce a value closer to zero than the
3816  // immediate itself, then the formula isn't worthwhile.
3817  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
3818  if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
3819  (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
3820  .ule(std::abs(NewF.BaseOffset)))
3821  continue;
3822 
3823  // OK, looks good.
3824  NewF.canonicalize();
3825  (void)InsertFormula(LU, LUIdx, NewF);
3826  } else {
3827  // Use the immediate in a base register.
3828  for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
3829  const SCEV *BaseReg = F.BaseRegs[N];
3830  if (BaseReg != OrigReg)
3831  continue;
3832  Formula NewF = F;
3833  NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
3834  if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
3835  LU.Kind, LU.AccessTy, NewF)) {
3836  if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
3837  continue;
3838  NewF = F;
3839  NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
3840  }
3841  NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
3842 
3843  // If the new formula has a constant in a register, and adding the
3844  // constant value to the immediate would produce a value closer to
3845  // zero than the immediate itself, then the formula isn't worthwhile.
3846  for (const SCEV *NewReg : NewF.BaseRegs)
3847  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
3848  if ((C->getAPInt() + NewF.BaseOffset)
3849  .abs()
3850  .slt(std::abs(NewF.BaseOffset)) &&
3851  (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
3852  countTrailingZeros<uint64_t>(NewF.BaseOffset))
3853  goto skip_formula;
3854 
3855  // Ok, looks good.
3856  NewF.canonicalize();
3857  (void)InsertFormula(LU, LUIdx, NewF);
3858  break;
3859  skip_formula:;
3860  }
3861  }
3862  }
3863  }
3864 }
3865 
3866 /// Generate formulae for each use.
3867 void
3868 LSRInstance::GenerateAllReuseFormulae() {
3869  // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
3870  // queries are more precise.
3871  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3872  LSRUse &LU = Uses[LUIdx];
3873  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3874  GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
3875  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3876  GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
3877  }
3878  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3879  LSRUse &LU = Uses[LUIdx];
3880  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3881  GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
3882  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3883  GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
3884  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3885  GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
3886  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3887  GenerateScales(LU, LUIdx, LU.Formulae[i]);
3888  }
3889  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3890  LSRUse &LU = Uses[LUIdx];
3891  for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
3892  GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
3893  }
3894 
3895  GenerateCrossUseConstantOffsets();
3896 
3897  DEBUG(dbgs() << "\n"
3898  "After generating reuse formulae:\n";
3899  print_uses(dbgs()));
3900 }
3901 
3902 /// If there are multiple formulae with the same set of registers used
3903 /// by other uses, pick the best one and delete the others.
3904 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
3905  DenseSet<const SCEV *> VisitedRegs;
3906  SmallPtrSet<const SCEV *, 16> Regs;
3907  SmallPtrSet<const SCEV *, 16> LoserRegs;
3908 #ifndef NDEBUG
3909  bool ChangedFormulae = false;
3910 #endif
3911 
3912  // Collect the best formula for each unique set of shared registers. This
3913  // is reset for each use.
3914  typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>
3915  BestFormulaeTy;
3916  BestFormulaeTy BestFormulae;
3917 
3918  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
3919  LSRUse &LU = Uses[LUIdx];
3920  DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
3921 
3922  bool Any = false;
3923  for (size_t FIdx = 0, NumForms = LU.Formulae.size();
3924  FIdx != NumForms; ++FIdx) {
3925  Formula &F = LU.Formulae[FIdx];
3926 
3927  // Some formulas are instant losers. For example, they may depend on
3928  // nonexistent AddRecs from other loops. These need to be filtered
3929  // immediately, otherwise heuristics could choose them over others leading
3930  // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
3931  // avoids the need to recompute this information across formulae using the
3932  // same bad AddRec. Passing LoserRegs is also essential unless we remove
3933  // the corresponding bad register from the Regs set.
3934  Cost CostF;
3935  Regs.clear();
3936  CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
3937  if (CostF.isLoser()) {
3938  // During initial formula generation, undesirable formulae are generated
3939  // by uses within other loops that have some non-trivial address mode or
3940  // use the postinc form of the IV. LSR needs to provide these formulae
3941  // as the basis of rediscovering the desired formula that uses an AddRec
3942  // corresponding to the existing phi. Once all formulae have been
3943  // generated, these initial losers may be pruned.
3944  DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
3945  dbgs() << "\n");
3946  }
3947  else {
3948  SmallVector<const SCEV *, 4> Key;
3949  for (const SCEV *Reg : F.BaseRegs) {
3950  if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
3951  Key.push_back(Reg);
3952  }
3953  if (F.ScaledReg &&
3954  RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
3955  Key.push_back(F.ScaledReg);
3956  // An unstable sort by host order is OK, because this is only used for
3957  // uniquifying.
3958  std::sort(Key.begin(), Key.end());
3959 
3960  std::pair<BestFormulaeTy::const_iterator, bool> P =
3961  BestFormulae.insert(std::make_pair(Key, FIdx));
3962  if (P.second)
3963  continue;
3964 
3965  Formula &Best = LU.Formulae[P.first->second];
3966 
3967  Cost CostBest;
3968  Regs.clear();
3969  CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
3970  if (CostF < CostBest)
3971  std::swap(F, Best);
3972  DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
3973  dbgs() << "\n"
3974  " in favor of formula "; Best.print(dbgs());
3975  dbgs() << '\n');
3976  }
3977 #ifndef NDEBUG
3978  ChangedFormulae = true;
3979 #endif
3980  LU.DeleteFormula(F);
3981  --FIdx;
3982  --NumForms;
3983  Any = true;
3984  }
3985 
3986  // Now that we've filtered out some formulae, recompute the Regs set.
3987  if (Any)
3988  LU.RecomputeRegs(LUIdx, RegUses);
3989 
3990  // Reset this to prepare for the next use.
3991  BestFormulae.clear();
3992  }
3993 
3994  DEBUG(if (ChangedFormulae) {
3995  dbgs() << "\n"
3996  "After filtering out undesirable candidates:\n";
3997  print_uses(dbgs());
3998  });
3999 }
4000 
4001 // This is a rough guess that seems to work fairly well.
4002 static const size_t ComplexityLimit = UINT16_MAX;
4003 
4004 /// Estimate the worst-case number of solutions the solver might have to
4005 /// consider. It almost never considers this many solutions because it prunes the
4006 /// search space, but the pruning isn't always sufficient.
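///
/// For example, three uses with 4, 5, and 6 candidate formulae each give an
/// estimate of 4 * 5 * 6 = 120 combinations; the product is capped at
/// ComplexityLimit.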
4007 size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4008  size_t Power = 1;
4009  for (const LSRUse &LU : Uses) {
4010  size_t FSize = LU.Formulae.size();
4011  if (FSize >= ComplexityLimit) {
4012  Power = ComplexityLimit;
4013  break;
4014  }
4015  Power *= FSize;
4016  if (Power >= ComplexityLimit)
4017  break;
4018  }
4019  return Power;
4020 }
4021 
4022 /// When one formula uses a superset of the registers of another formula, it
4023 /// won't help reduce register pressure (though it may not necessarily hurt
4024 /// register pressure); remove it to simplify the system.
4025 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4026  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4027  DEBUG(dbgs() << "The search space is too complex.\n");
4028 
4029  DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4030  "which use a superset of registers used by other "
4031  "formulae.\n");
4032 
4033  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4034  LSRUse &LU = Uses[LUIdx];
4035  bool Any = false;
4036  for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4037  Formula &F = LU.Formulae[i];
4038  // Look for a formula with a constant or GV in a register. If the use
4039  // also has a formula with that same value in an immediate field,
4040  // delete the one that uses a register.
4041  for (SmallVectorImpl<const SCEV *>::const_iterator
4042  I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4043  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4044  Formula NewF = F;
4045  NewF.BaseOffset += C->getValue()->getSExtValue();
4046  NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4047  (I - F.BaseRegs.begin()));
4048  if (LU.HasFormulaWithSameRegs(NewF)) {
4049  DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4050  LU.DeleteFormula(F);
4051  --i;
4052  --e;
4053  Any = true;
4054  break;
4055  }
4056  } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4057  if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4058  if (!F.BaseGV) {
4059  Formula NewF = F;
4060  NewF.BaseGV = GV;
4061  NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4062  (I - F.BaseRegs.begin()));
4063  if (LU.HasFormulaWithSameRegs(NewF)) {
4064  DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4065  dbgs() << '\n');
4066  LU.DeleteFormula(F);
4067  --i;
4068  --e;
4069  Any = true;
4070  break;
4071  }
4072  }
4073  }
4074  }
4075  }
4076  if (Any)
4077  LU.RecomputeRegs(LUIdx, RegUses);
4078  }
4079 
4080  DEBUG(dbgs() << "After pre-selection:\n";
4081  print_uses(dbgs()));
4082  }
4083 }
4084 
4085 /// When there are many registers for expressions like A, A+1, A+2, etc.,
4086 /// allocate a single register for them.
4087 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4088  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4089  return;
4090 
4091  DEBUG(dbgs() << "The search space is too complex.\n"
4092  "Narrowing the search space by assuming that uses separated "
4093  "by a constant offset will use the same registers.\n");
4094 
4095  // This is especially useful for unrolled loops.
4096 
4097  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4098  LSRUse &LU = Uses[LUIdx];
4099  for (const Formula &F : LU.Formulae) {
4100  if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
4101  continue;
4102 
4103  LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4104  if (!LUThatHas)
4105  continue;
4106 
4107  if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4108  LU.Kind, LU.AccessTy))
4109  continue;
4110 
4111  DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4112 
4113  LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4114 
4115  // Transfer the fixups of LU to LUThatHas.
4116  for (LSRFixup &Fixup : LU.Fixups) {
4117  Fixup.Offset += F.BaseOffset;
4118  LUThatHas->pushFixup(Fixup);
4119  DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4120  }
4121 
4122  // Delete formulae from the new use which are no longer legal.
4123  bool Any = false;
4124  for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4125  Formula &F = LUThatHas->Formulae[i];
4126  if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4127  LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4128  DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4129  dbgs() << '\n');
4130  LUThatHas->DeleteFormula(F);
4131  --i;
4132  --e;
4133  Any = true;
4134  }
4135  }
4136 
4137  if (Any)
4138  LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4139 
4140  // Delete the old use.
4141  DeleteUse(LU, LUIdx);
4142  --LUIdx;
4143  --NumUses;
4144  break;
4145  }
4146  }
4147 
4148  DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4149 }
4150 
4151 /// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
4152 /// we've done more filtering, as it may be able to find more formulae to
4153 /// eliminate.
4154 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
4155  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4156  DEBUG(dbgs() << "The search space is too complex.\n");
4157 
4158  DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
4159  "undesirable dedicated registers.\n");
4160 
4161  FilterOutUndesirableDedicatedRegisters();
4162 
4163  DEBUG(dbgs() << "After pre-selection:\n";
4164  print_uses(dbgs()));
4165  }
4166 }
4167 
4168 /// Pick a register which seems likely to be profitable, and then in any use
4169 /// which has any reference to that register, delete all formulae which do not
4170 /// reference that register.
4171 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
4172  // With all other options exhausted, loop until the system is simple
4173  // enough to handle.
4174  SmallPtrSet<const SCEV *, 4> Taken;
4175  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4176  // Ok, we have too many formulae on our hands to conveniently handle.
4177  // Use a rough heuristic to thin out the list.
4178  DEBUG(dbgs() << "The search space is too complex.\n");
4179 
4180  // Pick the register which is used by the most LSRUses, which is likely
4181  // to be a good reuse register candidate.
4182  const SCEV *Best = nullptr;
4183  unsigned BestNum = 0;
4184  for (const SCEV *Reg : RegUses) {
4185  if (Taken.count(Reg))
4186  continue;
4187  if (!Best) {
4188  Best = Reg;
4189  BestNum = RegUses.getUsedByIndices(Reg).count();
4190  } else {
4191  unsigned Count = RegUses.getUsedByIndices(Reg).count();
4192  if (Count > BestNum) {
4193  Best = Reg;
4194  BestNum = Count;
4195  }
4196  }
4197  }
4198 
4199  DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
4200  << " will yield profitable reuse.\n");
4201  Taken.insert(Best);
4202 
4203  // In any use with formulae which references this register, delete formulae
4204  // which don't reference it.
4205  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4206  LSRUse &LU = Uses[LUIdx];
4207  if (!LU.Regs.count(Best)) continue;
4208 
4209  bool Any = false;
4210  for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4211  Formula &F = LU.Formulae[i];
4212  if (!F.referencesReg(Best)) {
4213  DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4214  LU.DeleteFormula(F);
4215  --e;
4216  --i;
4217  Any = true;
4218  assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
4219  continue;
4220  }
4221  }
4222 
4223  if (Any)
4224  LU.RecomputeRegs(LUIdx, RegUses);
4225  }
4226 
4227  DEBUG(dbgs() << "After pre-selection:\n";
4228  print_uses(dbgs()));
4229  }
4230 }
4231 
4232 /// If there are an extraordinary number of formulae to choose from, use some
4233 /// rough heuristics to prune down the number of formulae. This keeps the main
4234 /// solver from taking an extraordinary amount of time in some worst-case
4235 /// scenarios.
4236 void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
4237  NarrowSearchSpaceByDetectingSupersets();
4238  NarrowSearchSpaceByCollapsingUnrolledCode();
4239  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
4240  NarrowSearchSpaceByPickingWinnerRegs();
4241 }
4242 
4243 /// This is the recursive solver.
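/// It performs a depth-first search over the uses, choosing one formula per use
/// and pruning any branch whose accumulated cost is already no better than the
/// best complete solution found so far.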
4244 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
4245  Cost &SolutionCost,
4246  SmallVectorImpl<const Formula *> &Workspace,
4247  const Cost &CurCost,
4248  const SmallPtrSet<const SCEV *, 16> &CurRegs,
4249  DenseSet<const SCEV *> &VisitedRegs) const {
4250  // Some ideas:
4251  // - prune more:
4252  // - use more aggressive filtering
4253  // - sort the formula so that the most profitable solutions are found first
4254  // - sort the uses too
4255  // - search faster:
4256  // - don't compute a cost, and then compare. compare while computing a cost
4257  // and bail early.
4258  // - track register sets with SmallBitVector
4259 
4260  const LSRUse &LU = Uses[Workspace.size()];
4261 
4262  // If this use references any register that's already a part of the
4263  // in-progress solution, consider it a requirement that a formula must
4264  // reference that register in order to be considered. This prunes out
4265  // unprofitable searching.
4266  SmallSetVector<const SCEV *, 4> ReqRegs;
4267  for (const SCEV *S : CurRegs)
4268  if (LU.Regs.count(S))
4269  ReqRegs.insert(S);
4270 
4271  SmallPtrSet<const SCEV *, 16> NewRegs;
4272  Cost NewCost;
4273  for (const Formula &F : LU.Formulae) {
4274  // Ignore formulae which may not be ideal in terms of register reuse of
4275  // ReqRegs. The formula should use all required registers before
4276  // introducing new ones.
4277  int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
4278  for (const SCEV *Reg : ReqRegs) {
4279  if ((F.ScaledReg && F.ScaledReg == Reg) ||
4280  is_contained(F.BaseRegs, Reg)) {
4281  --NumReqRegsToFind;
4282  if (NumReqRegsToFind == 0)
4283  break;
4284  }
4285  }
4286  if (NumReqRegsToFind != 0) {
4287  // If none of the formulae satisfied the required registers, then we could
4288  // clear ReqRegs and try again. Currently, we simply give up in this case.
4289  continue;
4290  }
4291 
4292  // Evaluate the cost of the current formula. If it's already worse than
4293  // the current best, prune the search at that point.
4294  NewCost = CurCost;
4295  NewRegs = CurRegs;
4296  NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
4297  if (NewCost < SolutionCost) {
4298  Workspace.push_back(&F);
4299  if (Workspace.size() != Uses.size()) {
4300  SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
4301  NewRegs, VisitedRegs);
4302  if (F.getNumRegs() == 1 && Workspace.size() == 1)
4303  VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
4304  } else {
4305  DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
4306  dbgs() << ".\n Regs:";
4307  for (const SCEV *S : NewRegs)
4308  dbgs() << ' ' << *S;
4309  dbgs() << '\n');
4310 
4311  SolutionCost = NewCost;
4312  Solution = Workspace;
4313  }
4314  Workspace.pop_back();
4315  }
4316  }
4317 }
4318 
4319 /// Choose one formula from each use. Return the results in the given Solution
4320 /// vector.
4321 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
4322  SmallVector<const Formula *, 8> Workspace;
4323  Cost SolutionCost;
4324  SolutionCost.Lose();
4325  Cost CurCost;
4326  SmallPtrSet<const SCEV *, 16> CurRegs;
4327  DenseSet<const SCEV *> VisitedRegs;
4328  Workspace.reserve(Uses.size());
4329 
4330  // SolveRecurse does all the work.
4331  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
4332  CurRegs, VisitedRegs);
4333  if (Solution.empty()) {
4334  DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
4335  return;
4336  }
4337 
4338  // Ok, we've now made all our decisions.
4339  DEBUG(dbgs() << "\n"
4340  "The chosen solution requires "; SolutionCost.print(dbgs());
4341  dbgs() << ":\n";
4342  for (size_t i = 0, e = Uses.size(); i != e; ++i) {
4343  dbgs() << " ";
4344  Uses[i].print(dbgs());
4345  dbgs() << "\n"
4346  " ";
4347  Solution[i]->print(dbgs());
4348  dbgs() << '\n';
4349  });
4350 
4351  assert(Solution.size() == Uses.size() && "Malformed solution!");
4352 }
4353 
4354 /// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far as
4355 /// we can go while still being dominated by the input positions. This helps
4356 /// canonicalize the insert position, which encourages sharing.
4357 BasicBlock::iterator
4358 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
4359  const SmallVectorImpl<Instruction *> &Inputs)
4360  const {
4361  Instruction *Tentative = &*IP;
4362  while (true) {
4363  bool AllDominate = true;
4364  Instruction *BetterPos = nullptr;
4365  // Don't bother attempting to insert before a catchswitch; its basic block
4366  // cannot have other non-PHI instructions.
4367  if (isa<CatchSwitchInst>(Tentative))
4368  return IP;
4369 
4370  for (Instruction *Inst : Inputs) {
4371  if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
4372  AllDominate = false;
4373  break;
4374  }
4375  // Attempt to find an insert position in the middle of the block,
4376  // instead of at the end, so that it can be used for other expansions.
4377  if (Tentative->getParent() == Inst->getParent() &&
4378  (!BetterPos || !DT.dominates(Inst, BetterPos)))
4379  BetterPos = &*std::next(BasicBlock::iterator(Inst));
4380  }
4381  if (!AllDominate)
4382  break;
4383  if (BetterPos)
4384  IP = BetterPos->getIterator();
4385  else
4386  IP = Tentative->getIterator();
4387 
4388  const Loop *IPLoop = LI.getLoopFor(IP->getParent());
4389  unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
4390 
4391  BasicBlock *IDom;
4392  for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
4393  if (!Rung) return IP;
4394  Rung = Rung->getIDom();
4395  if (!Rung) return IP;
4396  IDom = Rung->getBlock();
4397 
4398  // Don't climb into a loop though.
4399  const Loop *IDomLoop = LI.getLoopFor(IDom);
4400  unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
4401  if (IDomDepth <= IPLoopDepth &&
4402  (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
4403  break;
4404  }
4405 
4406  Tentative = IDom->getTerminator();
4407  }
4408 
4409  return IP;
4410 }
4411 
4412 /// Determine an input position which will be dominated by the operands and
4413 /// which will dominate the result.
4414 BasicBlock::iterator
4415 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
4416  const LSRFixup &LF,
4417  const LSRUse &LU,
4418  SCEVExpander &Rewriter) const {
4419  // Collect some instructions which must be dominated by the
4420  // expanding replacement. These must be dominated by any operands that
4421  // will be required in the expansion.
4422  SmallVector<Instruction *, 4> Inputs;
4423  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
4424  Inputs.push_back(I);
4425  if (LU.Kind == LSRUse::ICmpZero)
4426  if (Instruction *I =
4427  dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
4428  Inputs.push_back(I);
4429  if (LF.PostIncLoops.count(L)) {
4430  if (LF.isUseFullyOutsideLoop(L))
4431  Inputs.push_back(L->getLoopLatch()->getTerminator());
4432  else
4433  Inputs.push_back(IVIncInsertPos);
4434  }
4435  // The expansion must also be dominated by the increment positions of any
4436  // loops for which it is using post-inc mode.
4437  for (const Loop *PIL : LF.PostIncLoops) {
4438  if (PIL == L) continue;
4439 
4440  // Be dominated by the loop exit.
4441  SmallVector<BasicBlock *, 4> ExitingBlocks;
4442  PIL->getExitingBlocks(ExitingBlocks);
4443  if (!ExitingBlocks.empty()) {
4444  BasicBlock *BB = ExitingBlocks[0];
4445  for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
4446  BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
4447  Inputs.push_back(BB->getTerminator());
4448  }
4449  }
4450 
4451  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
4452  && !isa<DbgInfoIntrinsic>(LowestIP) &&
4453  "Insertion point must be a normal instruction");
4454 
4455  // Then, climb up the immediate dominator tree as far as we can go while
4456  // still being dominated by the input positions.
4457  BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
4458 
4459  // Don't insert instructions before PHI nodes.
4460  while (isa<PHINode>(IP)) ++IP;
4461 
4462  // Ignore landingpad instructions.
4463  while (IP->isEHPad()) ++IP;
4464 
4465  // Ignore debug intrinsics.
4466  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
4467 
4468  // Set IP below instructions recently inserted by SCEVExpander. This keeps the
4469  // IP consistent across expansions and allows the previously inserted
4470  // instructions to be reused by subsequent expansion.
4471  while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
4472  ++IP;
4473 
4474  return IP;
4475 }
4476 
4477 /// Emit instructions for the leading candidate expression for this LSRUse (this
4478 /// is called "expanding").
4479 Value *LSRInstance::Expand(const LSRUse &LU,
4480  const LSRFixup &LF,
4481  const Formula &F,
4482  BasicBlock::iterator IP,
4483  SCEVExpander &Rewriter,
4484  SmallVectorImpl<WeakVH> &DeadInsts) const {
4485  if (LU.RigidFormula)
4486  return LF.OperandValToReplace;
4487 
4488  // Determine an input position which will be dominated by the operands and
4489  // which will dominate the result.
4490  IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
4491  Rewriter.setInsertPoint(&*IP);
4492 
4493  // Inform the Rewriter if we have a post-increment use, so that it can
4494  // perform an advantageous expansion.
4495  Rewriter.setPostInc(LF.PostIncLoops);
4496 
4497  // This is the type that the user actually needs.
4498  Type *OpTy = LF.OperandValToReplace->getType();
4499  // This will be the type that we'll initially expand to.
4500  Type *Ty = F.getType();
4501  if (!Ty)
4502  // No type known; just expand directly to the ultimate type.
4503  Ty = OpTy;
4504  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
4505  // Expand directly to the ultimate type if it's the right size.
4506  Ty = OpTy;
4507  // This is the type to do integer arithmetic in.
4508  Type *IntTy = SE.getEffectiveSCEVType(Ty);
4509 
4510  // Build up a list of operands to add together to form the full base.
4511  SmallVector<const SCEV *, 8> Ops;
4512 
4513  // Expand the BaseRegs portion.
4514  for (const SCEV *Reg : F.BaseRegs) {
4515  assert(!Reg->isZero() && "Zero allocated in a base register!");
4516 
4517  // If we're expanding for a post-inc user, make the post-inc adjustment.
4518  PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4519  Reg = TransformForPostIncUse(Denormalize, Reg,
4520  LF.UserInst, LF.OperandValToReplace,
4521  Loops, SE, DT);
4522 
4523  Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
4524  }
4525 
4526  // Expand the ScaledReg portion.
4527  Value *ICmpScaledV = nullptr;
4528  if (F.Scale != 0) {
4529  const SCEV *ScaledS = F.ScaledReg;
4530 
4531  // If we're expanding for a post-inc user, make the post-inc adjustment.
4532  PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
4533  ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
4534  LF.UserInst, LF.OperandValToReplace,
4535  Loops, SE, DT);
4536 
4537  if (LU.Kind == LSRUse::ICmpZero) {
4538  // Expand ScaleReg as if it was part of the base regs.
4539  if (F.Scale == 1)
4540  Ops.push_back(
4541  SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
4542  else {
4543  // An interesting way of "folding" with an icmp is to use a negated
4544  // scale, which we'll implement by inserting it into the other operand
4545  // of the icmp.
4546  assert(F.Scale == -1 &&
4547  "The only scale supported by ICmpZero uses is -1!");
4548  ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
4549  }
4550  } else {
4551  // Otherwise just expand the scaled register and an explicit scale,
4552  // which is expected to be matched as part of the address.
4553 
4554  // Flush the operand list to suppress SCEVExpander hoisting of address modes,
4555  // unless the addressing mode will not be folded.
4556  if (!Ops.empty() && LU.Kind == LSRUse::Address &&
4557  isAMCompletelyFolded(TTI, LU, F)) {
4558  Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
4559  Ops.clear();
4560  Ops.push_back(SE.getUnknown(FullV));
4561  }
4562  ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
4563  if (F.Scale != 1)
4564  ScaledS =
4565  SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
4566  Ops.push_back(ScaledS);
4567  }
4568  }
4569 
4570  // Expand the GV portion.
4571  if (F.BaseGV) {
4572  // Flush the operand list to suppress SCEVExpander hoisting.
4573  if (!Ops.empty()) {
4574  Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
4575  Ops.clear();
4576  Ops.push_back(SE.getUnknown(FullV));
4577  }
4578  Ops.push_back(SE.getUnknown(F.BaseGV));
4579  }
4580 
4581  // Flush the operand list to suppress SCEVExpander hoisting of both folded and
4582  // unfolded offsets. LSR assumes they both live next to their uses.
4583  if (!Ops.empty()) {
4584  Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
4585  Ops.clear();
4586  Ops.push_back(SE.getUnknown(FullV));
4587  }
4588 
4589  // Expand the immediate portion.
4590  int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
4591  if (Offset != 0) {
4592  if (LU.Kind == LSRUse::ICmpZero) {
4593  // The other interesting way of "folding" with an ICmpZero is to use a
4594  // negated immediate.
4595  if (!ICmpScaledV)
4596  ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
4597  else {
4598  Ops.push_back(SE.getUnknown(ICmpScaledV));
4599  ICmpScaledV = ConstantInt::get(IntTy, Offset);
4600  }
4601  } else {
4602  // Just add the immediate values. These again are expected to be matched
4603  // as part of the address.
4604  Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
4605  }
4606  }
4607 
4608  // Expand the unfolded offset portion.
4609  int64_t UnfoldedOffset = F.UnfoldedOffset;
4610  if (UnfoldedOffset != 0) {
4611  // Just add the immediate values.
4612  Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
4613  UnfoldedOffset)));
4614  }
4615 
4616  // Emit instructions summing all the operands.
4617  const SCEV *FullS = Ops.empty() ?
4618  SE.getConstant(IntTy, 0) :
4619  SE.getAddExpr(Ops);
4620  Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
4621 
4622  // We're done expanding now, so reset the rewriter.
4623  Rewriter.clearPostInc();
4624 
4625  // An ICmpZero Formula represents an ICmp which we're handling as a
4626  // comparison against zero. Now that we've expanded an expression for that
4627  // form, update the ICmp's other operand.
4628  if (LU.Kind == LSRUse::ICmpZero) {
4629  ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
4630  DeadInsts.emplace_back(CI->getOperand(1));
4631  assert(!F.BaseGV && "ICmp does not support folding a global value and "
4632  "a scale at the same time!");
4633  if (F.Scale == -1) {
4634  if (ICmpScaledV->getType() != OpTy) {
4635  Instruction *Cast =
4636  CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
4637  OpTy, false),
4638  ICmpScaledV, OpTy, "tmp", CI);
4639  ICmpScaledV = Cast;
4640  }
4641  CI->setOperand(1, ICmpScaledV);
4642  } else {
4643  // A scale of 1 means that the scale has been expanded as part of the
4644  // base regs.
4645  assert((F.Scale == 0 || F.Scale == 1) &&
4646  "ICmp does not support folding a global value and "
4647  "a scale at the same time!");
4648  Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
4649  -(uint64_t)Offset);
4650  if (C->getType() != OpTy)
4651  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
4652  OpTy, false),
4653  C, OpTy);
4654 
4655  CI->setOperand(1, C);
4656  }
4657  }
4658 
4659  return FullV;
4660 }
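// Illustrative sketch (not part of the original source): for an Address use
// whose hypothetical formula is reg(%base) + 4*reg({0,+,1}<%loop>) + 16, the
// expansion above would emit roughly the following IR next to the memory
// access, with SCEVExpander free to fold or reorder the arithmetic:
//
//   %scaled = mul i64 %iv, 4
//   %sum    = add i64 %base, %scaled
//   %fullv  = add i64 %sum, 16
//
// The scaled register and the immediates are deliberately kept next to the
// use so that instruction selection can fold them into a scaled-index
// addressing mode.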
4661 
4662 /// Helper for Rewrite. PHI nodes are special because the use of their operands
4663 /// effectively happens in their predecessor blocks, so the expression may need
4664 /// to be expanded in multiple places.
4665 void LSRInstance::RewriteForPHI(PHINode *PN,
4666  const LSRUse &LU,
4667  const LSRFixup &LF,
4668  const Formula &F,
4669  SCEVExpander &Rewriter,
4670  SmallVectorImpl<WeakVH> &DeadInsts) const {
4671  DenseMap<BasicBlock *, Value *> Inserted;
4672  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
4673  if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
4674  BasicBlock *BB = PN->getIncomingBlock(i);
4675 
4676  // If this is a critical edge, split the edge so that we do not insert
4677  // the code on all predecessor/successor paths. We do this unless this
4678  // is the canonical backedge for this loop, which complicates post-inc
4679  // users.
4680  if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
4681  !isa<IndirectBrInst>(BB->getTerminator()) &&
4682  !isa<CatchSwitchInst>(BB->getTerminator())) {
4683  BasicBlock *Parent = PN->getParent();
4684  Loop *PNLoop = LI.getLoopFor(Parent);
4685  if (!PNLoop || Parent != PNLoop->getHeader()) {
4686  // Split the critical edge.
4687  BasicBlock *NewBB = nullptr;
4688  if (!Parent->isLandingPad()) {
4689  NewBB = SplitCriticalEdge(BB, Parent,
4690  CriticalEdgeSplittingOptions(&DT, &LI)
4691  .setMergeIdenticalEdges()
4692  .setDontDeleteUselessPHIs());
4693  } else {
4694  SmallVector<BasicBlock*, 2> NewBBs;
4695  SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
4696  NewBB = NewBBs[0];
4697  }
4698  // If NewBB==NULL, then SplitCriticalEdge refused to split because all
4699  // phi predecessors are identical. The simple thing to do is skip
4700  // splitting in this case rather than complicate the API.
4701  if (NewBB) {
4702  // If PN is outside of the loop and BB is in the loop, we want to
4703  // move the block to be immediately before the PHI block, not
4704  // immediately after BB.
4705  if (L->contains(BB) && !L->contains(PN))
4706  NewBB->moveBefore(PN->getParent());
4707 
4708  // Splitting the edge can reduce the number of PHI entries we have.
4709  e = PN->getNumIncomingValues();
4710  BB = NewBB;
4711  i = PN->getBasicBlockIndex(BB);
4712  }
4713  }
4714  }
4715 
4716  std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
4717  Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
4718  if (!Pair.second)
4719  PN->setIncomingValue(i, Pair.first->second);
4720  else {
4721  Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
4722  Rewriter, DeadInsts);
4723 
4724  // If this is reuse-by-noop-cast, insert the noop cast.
4725  Type *OpTy = LF.OperandValToReplace->getType();
4726  if (FullV->getType() != OpTy)
4727  FullV =
4728  CastInst::Create(CastInst::getCastOpcode(FullV, false,
4729  OpTy, false),
4730  FullV, LF.OperandValToReplace->getType(),
4731  "tmp", BB->getTerminator());
4732 
4733  PN->setIncomingValue(i, FullV);
4734  Pair.first->second = FullV;
4735  }
4736  }
4737 }
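// Illustrative sketch (hypothetical IR, not from this file): when the fixup
// feeds a PHI across a critical edge, the loop above may split that edge and
// expand the replacement value in the new block. For example,
//
//   latch:                                        ; two successors
//     br i1 %cond, label %header, label %exit
//   exit:                                         ; two predecessors
//     %use = phi i64 [ %old, %latch ], [ %v, %other ]
//
// may become
//
//   latch.exit_crit_edge:
//     %fullv = add i64 %iv, 8
//     br label %exit
//   exit:
//     %use = phi i64 [ %fullv, %latch.exit_crit_edge ], [ %v, %other ]
//
// keeping the expanded computation off the other paths into %exit.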
4738 
4739 /// Emit instructions for the leading candidate expression for this LSRUse (this
4740 /// is called "expanding"), and update the UserInst to reference the newly
4741 /// expanded value.
4742 void LSRInstance::Rewrite(const LSRUse &LU,
4743  const LSRFixup &LF,
4744  const Formula &F,
4745  SCEVExpander &Rewriter,
4746  SmallVectorImpl<WeakVH> &DeadInsts) const {
4747  // First, find an insertion point that dominates UserInst. For PHI nodes,
4748  // find the nearest block which dominates all the relevant uses.
4749  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
4750  RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
4751  } else {
4752  Value *FullV =
4753  Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
4754 
4755  // If this is reuse-by-noop-cast, insert the noop cast.
4756  Type *OpTy = LF.OperandValToReplace->getType();
4757  if (FullV->getType() != OpTy) {
4758  Instruction *Cast =
4759  CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
4760  FullV, OpTy, "tmp", LF.UserInst);
4761  FullV = Cast;
4762  }
4763 
4764  // Update the user. ICmpZero is handled specially here (for now) because
4765  // Expand may have updated one of the operands of the icmp already, and
4766  // its new value may happen to be equal to LF.OperandValToReplace, in
4767  // which case doing replaceUsesOfWith leads to replacing both operands
4768  // with the same value. TODO: Reorganize this.
4769  if (LU.Kind == LSRUse::ICmpZero)
4770  LF.UserInst->setOperand(0, FullV);
4771  else
4772  LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
4773  }
4774 
4775  DeadInsts.emplace_back(LF.OperandValToReplace);
4776 }
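// Illustrative sketch (hypothetical types): when the expanded value's type
// differs from the operand being replaced, Rewrite inserts what the code above
// calls a reuse-by-noop-cast. For instance, if the expansion produces an i64
// but the user expects an i8*, a cast such as
//
//   %tmp = inttoptr i64 %fullv to i8*
//
// is created immediately before the user instruction;
// CastInst::getCastOpcode chooses the conversion appropriate for the two
// types involved.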
4777 
4778 /// Rewrite all the fixup locations with new values, following the chosen
4779 /// solution.
4780 void LSRInstance::ImplementSolution(
4781  const SmallVectorImpl<const Formula *> &Solution) {
4782  // Keep track of instructions we may have made dead, so that
4783  // we can remove them after we are done working.
4784  SmallVector<WeakVH, 16> DeadInsts;
4785 
4786  SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
4787  "lsr");
4788 #ifndef NDEBUG
4789  Rewriter.setDebugType(DEBUG_TYPE);
4790 #endif
4791  Rewriter.disableCanonicalMode();
4792  Rewriter.enableLSRMode();
4793  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
4794 
4795  // Mark phi nodes that terminate chains so the expander tries to reuse them.
4796  for (const IVChain &Chain : IVChainVec) {
4797  if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
4798  Rewriter.setChainedPhi(PN);
4799  }
4800 
4801  // Expand the new value definitions and update the users.
4802  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
4803  for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
4804  Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
4805  Changed = true;
4806  }
4807 
4808  for (const IVChain &Chain : IVChainVec) {
4809  GenerateIVChain(Chain, Rewriter, DeadInsts);
4810  Changed = true;
4811  }
4812  // Clean up after ourselves. This must be done before deleting any
4813  // instructions.
4814  Rewriter.clear();
4815 
4816  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
4817 }
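// Note (editorial sketch, not from this file): the expander configuration
// above shapes the rewritten code. disableCanonicalMode() makes SCEVExpander
// expand an addrec such as {0,+,4}<%loop> literally, as its own PHI stepping
// by 4, rather than in terms of a canonical {0,+,1} induction variable, and
// setIVIncInsertPos() pins any IV increments the expander creates to the
// position LSR chose (IVIncInsertPos), which matters for post-increment users.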
4818 
4819 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
4820  DominatorTree &DT, LoopInfo &LI,
4821  const TargetTransformInfo &TTI)
4822  : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L), Changed(false),
4823  IVIncInsertPos(nullptr) {
4824  // If LoopSimplify form is not available, stay out of trouble.
4825  if (!L->isLoopSimplifyForm())
4826  return;
4827 
4828  // If there's no interesting work to be done, bail early.
4829  if (IU.empty()) return;
4830 
4831  // If there's too much analysis to be done, bail early. We won't be able to
4832  // model the problem anyway.
4833  unsigned NumUsers = 0;
4834  for (const IVStrideUse &U : IU) {
4835  if (++NumUsers > MaxIVUsers) {
4836  (void)U;
4837  DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U << "\n");
4838  return;
4839  }
4840  // Bail out if we have a PHI on an EHPad that gets a value from a
4841  // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
4842  // no good place to stick any instructions.
4843  if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
4844  auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
4845  if (isa<FuncletPadInst>(FirstNonPHI) ||
4846  isa<CatchSwitchInst>(FirstNonPHI))
4847  for (BasicBlock *PredBB : PN->blocks())
4848  if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
4849  return;
4850  }
4851  }
4852 
4853 #ifndef NDEBUG
4854  // All dominating loops must have preheaders, or SCEVExpander may not be able
4855  // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
4856  //
4857  // IVUsers analysis should only create users that are dominated by simple loop
4858  // headers. Since this loop should dominate all of its users, its user list
4859  // should be empty if this loop itself is not within a simple loop nest.
4860  for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
4861  Rung; Rung = Rung->getIDom()) {
4862  BasicBlock *BB = Rung->getBlock();
4863  const Loop *DomLoop = LI.getLoopFor(BB);
4864  if (DomLoop && DomLoop->getHeader() == BB) {
4865  assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
4866  }
4867  }
4868 #endif // NDEBUG
4869 
4870  DEBUG(dbgs() << "\nLSR on loop ";
4871  L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
4872  dbgs() << ":\n");
4873 
4874  // First, perform some low-level loop optimizations.
4875  OptimizeShadowIV();
4876  OptimizeLoopTermCond();
4877 
4878  // If loop preparation eliminates all interesting IV users, bail.
4879  if (IU.empty()) return;
4880 
4881  // Skip nested loops until we can model them better with formulae.
4882  if (!L->empty()) {
4883  DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
4884  return;
4885  }
4886 
4887  // Start collecting data and preparing for the solver.
4888  CollectChains();
4889  CollectInterestingTypesAndFactors();
4890  CollectFixupsAndInitialFormulae();
4891  CollectLoopInvariantFixupsAndFormulae();
4892 
4893  assert(!Uses.empty() && "IVUsers reported at least one use");
4894  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
4895  print_uses(dbgs()));
4896 
4897  // Now use the reuse data to generate a bunch of interesting ways
4898  // to formulate the values needed for the uses.
4899  GenerateAllReuseFormulae();
4900 
4901  FilterOutUndesirableDedicatedRegisters();
4902  NarrowSearchSpaceUsingHeuristics();
4903 
4904  SmallVector<const Formula *, 8> Solution;
4905  Solve(Solution);
4906 
4907  // Release memory that is no longer needed.
4908  Factors.clear();
4909  Types.clear();
4910  RegUses.clear();
4911 
4912  if (Solution.empty())
4913  return;
4914 
4915 #ifndef NDEBUG
4916  // Formulae should be legal.
4917  for (const LSRUse &LU : Uses) {
4918  for (const Formula &F : LU.Formulae)
4919  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4920  F) && "Illegal formula generated!");
4921  };
4922 #endif
4923 
4924  // Now that we've decided what we want, make it so.
4925  ImplementSolution(Solution);
4926 }
4927 
4928 void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
4929  if (Factors.empty() && Types.empty()) return;
4930 
4931  OS << "LSR has identified the following interesting factors and types: ";
4932  bool First = true;
4933 
4934  for (int64_t Factor : Factors) {
4935  if (!First) OS << ", ";
4936  First = false;
4937  OS << '*' << Factor;
4938  }
4939 
4940  for (Type *Ty : Types) {
4941  if (!First) OS << ", ";
4942  First = false;
4943  OS << '(' << *Ty << ')';
4944  }
4945  OS << '\n';
4946 }
4947 
4948 void LSRInstance::print_fixups(raw_ostream &OS) const {
4949  OS << "LSR is examining the following fixup sites:\n";
4950  for (const LSRUse &LU : Uses)
4951  for (const LSRFixup &LF : LU.Fixups) {
4952  dbgs() << " ";
4953  LF.print(OS);
4954  OS << '\n';
4955  }
4956 }
4957 
4958 void LSRInstance::print_uses(raw_ostream &OS) const {
4959  OS << "LSR is examining the following uses:\n";
4960  for (const LSRUse &LU : Uses) {
4961  dbgs() << " ";
4962  LU.print(OS);
4963  OS << '\n';
4964  for (const Formula &F : LU.Formulae) {
4965  OS << " ";
4966  F.print(OS);
4967  OS << '\n';
4968  }
4969  }
4970 }
4971 
4972 void LSRInstance::print(raw_ostream &OS) const {
4973  print_factors_and_types(OS);
4974  print_fixups(OS);
4975  print_uses(OS);
4976 }
4977 
4978 LLVM_DUMP_METHOD
4979 void LSRInstance::dump() const {
4980  print(errs()); errs() << '\n';
4981 }
4982 
4983 namespace {
4984 
4985 class LoopStrengthReduce : public LoopPass {
4986 public:
4987  static char ID; // Pass ID, replacement for typeid
4988 
4989  LoopStrengthReduce();
4990 
4991 private:
4992  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
4993  void getAnalysisUsage(AnalysisUsage &AU) const override;
4994 };
4995 
4996 } // end anonymous namespace
4997 
4998 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
4999  initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
5000 }
5001 
5002 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
5003  // We split critical edges, so we change the CFG. However, we do update
5004  // many analyses if they are around.
5005  AU.addPreservedID(LoopSimplifyID);
5006 
5007  AU.addRequired<LoopInfoWrapperPass>();
5008  AU.addPreserved<LoopInfoWrapperPass>();
5009  AU.addRequiredID(LoopSimplifyID);
5010  AU.addRequired<DominatorTreeWrapperPass>();
5011  AU.addPreserved<DominatorTreeWrapperPass>();
5012  AU.addRequired<ScalarEvolutionWrapperPass>();
5013  AU.addPreserved<ScalarEvolutionWrapperPass>();
5014  // Requiring LoopSimplify a second time here prevents IVUsers from running
5015  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
5016  AU.addRequiredID(LoopSimplifyID);
5017  AU.addRequired<IVUsersWrapperPass>();
5018  AU.addPreserved<IVUsersWrapperPass>();
5019  AU.addRequired<TargetTransformInfoWrapperPass>();
5020 }
5021 
5022 static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
5023  DominatorTree &DT, LoopInfo &LI,
5024  const TargetTransformInfo &TTI) {
5025  bool Changed = false;
5026 
5027  // Run the main LSR transformation.
5028  Changed |= LSRInstance(L, IU, SE, DT, LI, TTI).getChanged();
5029 
5030  // Remove any extra phis created by processing inner loops.
5031  Changed |= DeleteDeadPHIs(L->getHeader());
5032  if (EnablePhiElim && L->isLoopSimplifyForm()) {
5033  SmallVector<WeakVH, 16> DeadInsts;
5034  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
5035  SCEVExpander Rewriter(SE, DL, "lsr");
5036 #ifndef NDEBUG
5037  Rewriter.setDebugType(DEBUG_TYPE);
5038 #endif
5039  unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
5040  if (numFolded) {
5041  Changed = true;
5042  DeleteTriviallyDeadInstructions(DeadInsts);
5043  DeleteDeadPHIs(L->getHeader());
5044  }
5045  }
5046  return Changed;
5047 }
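// Illustrative sketch (hypothetical IR): the replaceCongruentIVs() cleanup
// above merges induction-variable PHIs that compute the same recurrence, e.g.
//
//   %iv1 = phi i64 [ 0, %preheader ], [ %iv1.next, %latch ]
//   %iv2 = phi i64 [ 0, %preheader ], [ %iv2.next, %latch ]
//
// where both step by the same amount: users of %iv2 are redirected to %iv1,
// leaving %iv2 trivially dead for the clean-up calls that follow. This extra
// pass over the loop is guarded by the -enable-lsr-phielim option.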
5048 
5049 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
5050  if (skipLoop(L))
5051  return false;
5052 
5053  auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
5054  auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
5055  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5056  auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
5057  const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
5058  *L->getHeader()->getParent());
5059  return ReduceLoopStrength(L, IU, SE, DT, LI, TTI);
5060 }
5061 
5062 PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
5063  LoopStandardAnalysisResults &AR,
5064  LPMUpdater &) {
5065  if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
5066  AR.DT, AR.LI, AR.TTI))
5067  return PreservedAnalyses::all();
5068 
5069  return getLoopPassPreservedAnalyses();
5070 }
5071 
5072 char LoopStrengthReduce::ID = 0;
5073 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
5074  "Loop Strength Reduction", false, false)
5075 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
5076 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5077 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
5078 INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
5079 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
5080 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
5081 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
5082  "Loop Strength Reduction", false, false)
5083 
5084 Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
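// Usage sketch (assumes a Module *M built and a target configured elsewhere;
// not code from this file): the legacy pass can be scheduled like any other
// LoopPass, and is exposed on the opt command line as -loop-reduce:
//
//   llvm::legacy::PassManager PM;
//   PM.add(llvm::createLoopStrengthReducePass());
//   PM.run(*M);
//
// The TargetTransformInfo queried by the pass comes from the registered
// target; without one, the conservative default TTI implementation is used.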