LoopStrengthReduce.cpp
1//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This transformation analyzes and transforms the induction variables (and
10// computations derived from them) into forms suitable for efficient execution
11// on the target.
12//
13// This pass performs strength reduction on array references inside loops that
14// have the loop induction variable as one or more of their components. It
15// rewrites expressions to take advantage of scaled-index addressing modes
16// available on the target, and it performs a variety of other optimizations
17// related to loop induction variables.
18//
19// Terminology note: this code has a lot of handling for "post-increment" or
20// "post-inc" users. This is not talking about post-increment addressing modes;
21// it is instead talking about code like this:
22//
23// %i = phi [ 0, %entry ], [ %i.next, %latch ]
24// ...
25// %i.next = add %i, 1
26// %c = icmp eq %i.next, %n
27//
28// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
29// it's useful to think about these as the same register, with some uses using
30// the value of the register before the add and some using it after. In this
31// example, the icmp is a post-increment user, since it uses %i.next, which is
32// the value of the induction variable after the increment. The other common
33// case of post-increment users is users outside the loop.
34//
35// TODO: More sophistication in the way Formulae are generated and filtered.
36//
37// TODO: Handle multiple loops at a time.
38//
39// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
40// of a GlobalValue?
41//
42// TODO: When truncation is free, truncate ICmp users' operands to make it a
43// smaller encoding (on x86 at least).
44//
45// TODO: When a negated register is used by an add (such as in a list of
46// multiple base registers, or as the increment expression in an addrec),
47// we may not actually need both reg and (-1 * reg) in registers; the
48// negation can be implemented by using a sub instead of an add. The
49// lack of support for taking this into consideration when making
50// register pressure decisions is partly worked around by the "Special"
51// use kind.
52//
53//===----------------------------------------------------------------------===//
54
56#include "llvm/ADT/APInt.h"
57#include "llvm/ADT/DenseMap.h"
58#include "llvm/ADT/DenseSet.h"
59#include "llvm/ADT/Hashing.h"
61#include "llvm/ADT/STLExtras.h"
62#include "llvm/ADT/SetVector.h"
65#include "llvm/ADT/SmallSet.h"
67#include "llvm/ADT/Statistic.h"
84#include "llvm/Config/llvm-config.h"
85#include "llvm/IR/BasicBlock.h"
86#include "llvm/IR/Constant.h"
87#include "llvm/IR/Constants.h"
90#include "llvm/IR/Dominators.h"
91#include "llvm/IR/GlobalValue.h"
92#include "llvm/IR/IRBuilder.h"
93#include "llvm/IR/InstrTypes.h"
94#include "llvm/IR/Instruction.h"
97#include "llvm/IR/Module.h"
98#include "llvm/IR/Operator.h"
99#include "llvm/IR/PassManager.h"
100#include "llvm/IR/Type.h"
101#include "llvm/IR/Use.h"
102#include "llvm/IR/User.h"
103#include "llvm/IR/Value.h"
104#include "llvm/IR/ValueHandle.h"
106#include "llvm/Pass.h"
107#include "llvm/Support/Casting.h"
110#include "llvm/Support/Debug.h"
120#include <algorithm>
121#include <cassert>
122#include <cstddef>
123#include <cstdint>
124#include <iterator>
125#include <limits>
126#include <map>
127#include <numeric>
128#include <optional>
129#include <utility>
130
131using namespace llvm;
132
133#define DEBUG_TYPE "loop-reduce"
134
135/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
136/// bail out. This threshold is far beyond the number of users that LSR can
137/// conceivably solve, so it should not affect generated code, but catches the
138/// worst cases before LSR burns too much compile time and stack space.
139static const unsigned MaxIVUsers = 200;
140
141/// Limit the size of expression that SCEV-based salvaging will attempt to
142/// translate into a DIExpression.
143/// Choose a maximum size such that debuginfo is not excessively increased and
144/// the salvaging is not too expensive for the compiler.
145static const unsigned MaxSCEVSalvageExpressionSize = 64;
146
147// Cleanup congruent phis after LSR phi expansion.
149 "enable-lsr-phielim", cl::Hidden, cl::init(true),
150 cl::desc("Enable LSR phi elimination"));
151
152// The flag adds instruction count to solutions cost comparison.
154 "lsr-insns-cost", cl::Hidden, cl::init(true),
155 cl::desc("Add instruction count to a LSR cost model"));
156
157// Flag to choose how to narrow complex lsr solution
159 "lsr-exp-narrow", cl::Hidden, cl::init(false),
160 cl::desc("Narrow LSR complex solution using"
161 " expectation of registers number"));
162
163// Flag to narrow search space by filtering non-optimal formulae with
164// the same ScaledReg and Scale.
166 "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
167 cl::desc("Narrow LSR search space by filtering non-optimal formulae"
168 " with the same ScaledReg and Scale"));
169
171 "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
172 cl::desc("A flag that overrides the target's preferred addressing mode."),
174 "none",
175 "Don't prefer any addressing mode"),
177 "preindexed",
178 "Prefer pre-indexed addressing mode"),
180 "postindexed",
181 "Prefer post-indexed addressing mode")));
182
184 "lsr-complexity-limit", cl::Hidden,
185 cl::init(std::numeric_limits<uint16_t>::max()),
186 cl::desc("LSR search space complexity limit"));
187
189 "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
190 cl::desc("The limit on recursion depth for LSRs setup cost"));
191
193 "lsr-term-fold", cl::Hidden,
194 cl::desc("Attempt to replace primary IV with other IV."));
195
197 "lsr-drop-solution", cl::Hidden,
198 cl::desc("Attempt to drop solution if it is less profitable"));
199
201 "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
202 cl::desc("Enable analysis of vscale-relative immediates in LSR"));
203
205 "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
206 cl::desc("Avoid using scaled registers with vscale-relative addressing"));
207
208STATISTIC(NumTermFold,
209 "Number of terminating condition fold recognized and performed");
210
211#ifndef NDEBUG
212// Stress test IV chain generation.
214 "stress-ivchain", cl::Hidden, cl::init(false),
215 cl::desc("Stress test LSR IV chains"));
216#else
217static bool StressIVChain = false;
218#endif
219
220namespace {
221
222struct MemAccessTy {
223 /// Used in situations where the accessed memory type is unknown.
224 static const unsigned UnknownAddressSpace =
225 std::numeric_limits<unsigned>::max();
226
227 Type *MemTy = nullptr;
228 unsigned AddrSpace = UnknownAddressSpace;
229
230 MemAccessTy() = default;
231 MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
232
233 bool operator==(MemAccessTy Other) const {
234 return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
235 }
236
237 bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
238
239 static MemAccessTy getUnknown(LLVMContext &Ctx,
240 unsigned AS = UnknownAddressSpace) {
241 return MemAccessTy(Type::getVoidTy(Ctx), AS);
242 }
243
244 Type *getType() { return MemTy; }
245};
246
247/// This class holds data which is used to order reuse candidates.
248class RegSortData {
249public:
250 /// This represents the set of LSRUse indices which reference
251 /// a particular register.
252 SmallBitVector UsedByIndices;
253
254 void print(raw_ostream &OS) const;
255 void dump() const;
256};
257
258// An offset from an address that is either scalable or fixed. Used for
259// per-target optimizations of addressing modes.
260class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
261 constexpr Immediate(ScalarTy MinVal, bool Scalable)
262 : FixedOrScalableQuantity(MinVal, Scalable) {}
263
264 constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
265 : FixedOrScalableQuantity(V) {}
266
267public:
268 constexpr Immediate() = delete;
269
270 static constexpr Immediate getFixed(ScalarTy MinVal) {
271 return {MinVal, false};
272 }
273 static constexpr Immediate getScalable(ScalarTy MinVal) {
274 return {MinVal, true};
275 }
276 static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
277 return {MinVal, Scalable};
278 }
279 static constexpr Immediate getZero() { return {0, false}; }
280 static constexpr Immediate getFixedMin() {
281 return {std::numeric_limits<int64_t>::min(), false};
282 }
283 static constexpr Immediate getFixedMax() {
284 return {std::numeric_limits<int64_t>::max(), false};
285 }
286 static constexpr Immediate getScalableMin() {
287 return {std::numeric_limits<int64_t>::min(), true};
288 }
289 static constexpr Immediate getScalableMax() {
290 return {std::numeric_limits<int64_t>::max(), true};
291 }
292
293 constexpr bool isLessThanZero() const { return Quantity < 0; }
294
295 constexpr bool isGreaterThanZero() const { return Quantity > 0; }
296
297 constexpr bool isCompatibleImmediate(const Immediate &Imm) const {
298 return isZero() || Imm.isZero() || Imm.Scalable == Scalable;
299 }
300
301 constexpr bool isMin() const {
302 return Quantity == std::numeric_limits<ScalarTy>::min();
303 }
304
305 constexpr bool isMax() const {
306 return Quantity == std::numeric_limits<ScalarTy>::max();
307 }
308
309 // Arithmetic 'operators' that cast to unsigned types first.
310 constexpr Immediate addUnsigned(const Immediate &RHS) const {
311 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
312 ScalarTy Value = (uint64_t)Quantity + RHS.getKnownMinValue();
313 return {Value, Scalable || RHS.isScalable()};
314 }
315
316 constexpr Immediate subUnsigned(const Immediate &RHS) const {
317 assert(isCompatibleImmediate(RHS) && "Incompatible Immediates");
318 ScalarTy Value = (uint64_t)Quantity - RHS.getKnownMinValue();
319 return {Value, Scalable || RHS.isScalable()};
320 }
321
322 // Scale the quantity by a constant without caring about runtime scalability.
323 constexpr Immediate mulUnsigned(const ScalarTy RHS) const {
324 ScalarTy Value = (uint64_t)Quantity * RHS;
325 return {Value, Scalable};
326 }
327
328 // Helpers for generating SCEVs with vscale terms where needed.
329 const SCEV *getSCEV(ScalarEvolution &SE, Type *Ty) const {
330 const SCEV *S = SE.getConstant(Ty, Quantity);
331 if (Scalable)
332 S = SE.getMulExpr(S, SE.getVScale(S->getType()));
333 return S;
334 }
335
336 const SCEV *getNegativeSCEV(ScalarEvolution &SE, Type *Ty) const {
337 const SCEV *NegS = SE.getConstant(Ty, -(uint64_t)Quantity);
338 if (Scalable)
339 NegS = SE.getMulExpr(NegS, SE.getVScale(NegS->getType()));
340 return NegS;
341 }
342
343 const SCEV *getUnknownSCEV(ScalarEvolution &SE, Type *Ty) const {
344 const SCEV *SU = SE.getUnknown(ConstantInt::getSigned(Ty, Quantity));
345 if (Scalable)
346 SU = SE.getMulExpr(SU, SE.getVScale(SU->getType()));
347 return SU;
348 }
349};
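// Illustrative sketch (not from the upstream source): how an Immediate is
// typically created and materialized as a SCEV. SE and Ty stand for an
// in-scope ScalarEvolution and integer type; these names are assumptions.
//
//   Immediate Fixed  = Immediate::getFixed(16);    // plain offset of 16 bytes
//   Immediate Scaled = Immediate::getScalable(16); // 16 * vscale bytes
//   const SCEV *S1 = Fixed.getSCEV(SE, Ty);  // SCEV constant 16
//   const SCEV *S2 = Scaled.getSCEV(SE, Ty); // (16 * vscale), via SE.getVScale()
//   // Mixing fixed and scalable is only allowed when one side is zero:
//   assert(Fixed.isCompatibleImmediate(Immediate::getZero()));
//   assert(!Fixed.isCompatibleImmediate(Scaled));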
350
351// This is needed for the Compare type of std::map when Immediate is used
352// as a key. We don't need it to be fully correct against any value of vscale,
353// just to make sure that vscale-related terms in the map are considered against
354// each other rather than being mixed up and potentially missing opportunities.
355struct KeyOrderTargetImmediate {
356 bool operator()(const Immediate &LHS, const Immediate &RHS) const {
357 if (LHS.isScalable() && !RHS.isScalable())
358 return false;
359 if (!LHS.isScalable() && RHS.isScalable())
360 return true;
361 return LHS.getKnownMinValue() < RHS.getKnownMinValue();
362 }
363};
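// Worked example (illustrative, not from the upstream source): under this
// comparator all fixed offsets order before all scalable ones, and each group
// orders by its known-min value, e.g.
//   getFixed(32) < getScalable(4) < getScalable(8)
// so a std::map keyed on Immediate never interleaves fixed and vscale-relative
// entries when iterated.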
364
365// This would be nicer if we could be generic instead of directly using size_t,
366// but there doesn't seem to be a type trait for is_orderable or
367// is_lessthan_comparable or similar.
368struct KeyOrderSizeTAndImmediate {
369 bool operator()(const std::pair<size_t, Immediate> &LHS,
370 const std::pair<size_t, Immediate> &RHS) const {
371 size_t LSize = LHS.first;
372 size_t RSize = RHS.first;
373 if (LSize != RSize)
374 return LSize < RSize;
375 return KeyOrderTargetImmediate()(LHS.second, RHS.second);
376 }
377};
378} // end anonymous namespace
379
380#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
381void RegSortData::print(raw_ostream &OS) const {
382 OS << "[NumUses=" << UsedByIndices.count() << ']';
383}
384
385LLVM_DUMP_METHOD void RegSortData::dump() const {
386 print(errs()); errs() << '\n';
387}
388#endif
389
390namespace {
391
392/// Map register candidates to information about how they are used.
393class RegUseTracker {
394 using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
395
396 RegUsesTy RegUsesMap;
397  SmallVector<const SCEV *, 16> RegSequence;
398
399public:
400 void countRegister(const SCEV *Reg, size_t LUIdx);
401 void dropRegister(const SCEV *Reg, size_t LUIdx);
402 void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
403
404 bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
405
406 const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
407
408 void clear();
409
410  using iterator = SmallVectorImpl<const SCEV *>::iterator;
411  using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
412
413 iterator begin() { return RegSequence.begin(); }
414 iterator end() { return RegSequence.end(); }
415 const_iterator begin() const { return RegSequence.begin(); }
416 const_iterator end() const { return RegSequence.end(); }
417};
418
419} // end anonymous namespace
420
421void
422RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
423 std::pair<RegUsesTy::iterator, bool> Pair =
424 RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
425 RegSortData &RSD = Pair.first->second;
426 if (Pair.second)
427 RegSequence.push_back(Reg);
428 RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
429 RSD.UsedByIndices.set(LUIdx);
430}
431
432void
433RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
434 RegUsesTy::iterator It = RegUsesMap.find(Reg);
435 assert(It != RegUsesMap.end());
436 RegSortData &RSD = It->second;
437 assert(RSD.UsedByIndices.size() > LUIdx);
438 RSD.UsedByIndices.reset(LUIdx);
439}
440
441void
442RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
443 assert(LUIdx <= LastLUIdx);
444
445 // Update RegUses. The data structure is not optimized for this purpose;
446 // we must iterate through it and update each of the bit vectors.
447 for (auto &Pair : RegUsesMap) {
448 SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
449 if (LUIdx < UsedByIndices.size())
450 UsedByIndices[LUIdx] =
451 LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
452 UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
453 }
454}
455
456bool
457RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
458 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
459 if (I == RegUsesMap.end())
460 return false;
461 const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
462 int i = UsedByIndices.find_first();
463 if (i == -1) return false;
464 if ((size_t)i != LUIdx) return true;
465 return UsedByIndices.find_next(i) != -1;
466}
467
468const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
469 RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
470 assert(I != RegUsesMap.end() && "Unknown register!");
471 return I->second.UsedByIndices;
472}
473
474void RegUseTracker::clear() {
475 RegUsesMap.clear();
476 RegSequence.clear();
477}
478
479namespace {
480
481/// This class holds information that describes a formula for computing a
482/// value that satisfies a use. It may include broken-out immediates and scaled registers.
483struct Formula {
484 /// Global base address used for complex addressing.
485 GlobalValue *BaseGV = nullptr;
486
487 /// Base offset for complex addressing.
488 Immediate BaseOffset = Immediate::getZero();
489
490 /// Whether any complex addressing has a base register.
491 bool HasBaseReg = false;
492
493 /// The scale of any complex addressing.
494 int64_t Scale = 0;
495
496 /// The list of "base" registers for this use. When this is non-empty. The
497 /// canonical representation of a formula is
498 /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
499 /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
500  /// 3. The reg containing recurrent expr related with current loop in the
501 /// formula should be put in the ScaledReg.
502 /// #1 enforces that the scaled register is always used when at least two
503 /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
504 /// #2 enforces that 1 * reg is reg.
505 /// #3 ensures invariant regs with respect to current loop can be combined
506 /// together in LSR codegen.
507 /// This invariant can be temporarily broken while building a formula.
508 /// However, every formula inserted into the LSRInstance must be in canonical
509 /// form.
510  SmallVector<const SCEV *, 4> BaseRegs;
511
512 /// The 'scaled' register for this use. This should be non-null when Scale is
513 /// not zero.
514 const SCEV *ScaledReg = nullptr;
515
516  /// An additional constant offset which is added near the use. This requires a
517 /// temporary register, but the offset itself can live in an add immediate
518 /// field rather than a register.
519 Immediate UnfoldedOffset = Immediate::getZero();
520
521 Formula() = default;
522
523 void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
524
525 bool isCanonical(const Loop &L) const;
526
527 void canonicalize(const Loop &L);
528
529 bool unscale();
530
531 bool hasZeroEnd() const;
532
533 size_t getNumRegs() const;
534 Type *getType() const;
535
536 void deleteBaseReg(const SCEV *&S);
537
538 bool referencesReg(const SCEV *S) const;
539 bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
540 const RegUseTracker &RegUses) const;
541
542 void print(raw_ostream &OS) const;
543 void dump() const;
544};
545
546} // end anonymous namespace
547
548/// Recursion helper for initialMatch.
549static void DoInitialMatch(const SCEV *S, Loop *L,
552 ScalarEvolution &SE) {
553 // Collect expressions which properly dominate the loop header.
554 if (SE.properlyDominates(S, L->getHeader())) {
555 Good.push_back(S);
556 return;
557 }
558
559 // Look at add operands.
560 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
561 for (const SCEV *S : Add->operands())
562 DoInitialMatch(S, L, Good, Bad, SE);
563 return;
564 }
565
566 // Look at addrec operands.
567 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
568 if (!AR->getStart()->isZero() && AR->isAffine()) {
569 DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
570 DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
571 AR->getStepRecurrence(SE),
572 // FIXME: AR->getNoWrapFlags()
573 AR->getLoop(), SCEV::FlagAnyWrap),
574 L, Good, Bad, SE);
575 return;
576 }
577
578 // Handle a multiplication by -1 (negation) if it didn't fold.
579 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
580 if (Mul->getOperand(0)->isAllOnesValue()) {
581      SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
582      const SCEV *NewMul = SE.getMulExpr(Ops);
583
584      SmallVector<const SCEV *, 4> MyGood;
585      SmallVector<const SCEV *, 4> MyBad;
586      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
587 const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
588 SE.getEffectiveSCEVType(NewMul->getType())));
589 for (const SCEV *S : MyGood)
590 Good.push_back(SE.getMulExpr(NegOne, S));
591 for (const SCEV *S : MyBad)
592 Bad.push_back(SE.getMulExpr(NegOne, S));
593 return;
594 }
595
596 // Ok, we can't do anything interesting. Just stuff the whole thing into a
597 // register and hope for the best.
598 Bad.push_back(S);
599}
600
601/// Incorporate loop-variant parts of S into this Formula, attempting to keep
602/// all loop-invariant and loop-computable values in a single base register.
603void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
604  SmallVector<const SCEV *, 4> Good;
605  SmallVector<const SCEV *, 4> Bad;
606  DoInitialMatch(S, L, Good, Bad, SE);
607 if (!Good.empty()) {
608 const SCEV *Sum = SE.getAddExpr(Good);
609 if (!Sum->isZero())
610 BaseRegs.push_back(Sum);
611 HasBaseReg = true;
612 }
613 if (!Bad.empty()) {
614 const SCEV *Sum = SE.getAddExpr(Bad);
615 if (!Sum->isZero())
616 BaseRegs.push_back(Sum);
617 HasBaseReg = true;
618 }
619 canonicalize(*L);
620}
621
622static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
623 return SCEVExprContains(S, [&L](const SCEV *S) {
624 return isa<SCEVAddRecExpr>(S) && (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
625 });
626}
627
628/// Check whether or not this formula satisfies the canonical
629/// representation.
630/// \see Formula::BaseRegs.
631bool Formula::isCanonical(const Loop &L) const {
632 if (!ScaledReg)
633 return BaseRegs.size() <= 1;
634
635 if (Scale != 1)
636 return true;
637
638 if (Scale == 1 && BaseRegs.empty())
639 return false;
640
641 if (containsAddRecDependentOnLoop(ScaledReg, L))
642 return true;
643
644  // If ScaledReg is not a recurrent expr, or it is but its loop is not the
645  // current loop, while BaseRegs contains a recurrent expr reg related to the
646  // current loop, we want to swap the reg in BaseRegs with ScaledReg.
647 return none_of(BaseRegs, [&L](const SCEV *S) {
648    return containsAddRecDependentOnLoop(S, L);
649  });
650}
651
652/// Helper method to morph a formula into its canonical representation.
653/// \see Formula::BaseRegs.
654/// Every formula having more than one base register must use the ScaledReg
655/// field. Otherwise, we would have to do special cases everywhere in LSR
656/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
657/// On the other hand, 1*reg should be canonicalized into reg.
658void Formula::canonicalize(const Loop &L) {
659 if (isCanonical(L))
660 return;
661
662 if (BaseRegs.empty()) {
663 // No base reg? Use scale reg with scale = 1 as such.
664 assert(ScaledReg && "Expected 1*reg => reg");
665 assert(Scale == 1 && "Expected 1*reg => reg");
666 BaseRegs.push_back(ScaledReg);
667 Scale = 0;
668 ScaledReg = nullptr;
669 return;
670 }
671
672 // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
673 if (!ScaledReg) {
674 ScaledReg = BaseRegs.pop_back_val();
675 Scale = 1;
676 }
677
678 // If ScaledReg is an invariant with respect to L, find the reg from
679 // BaseRegs containing the recurrent expr related with Loop L. Swap the
680 // reg with ScaledReg.
681 if (!containsAddRecDependentOnLoop(ScaledReg, L)) {
682 auto I = find_if(BaseRegs, [&L](const SCEV *S) {
683      return containsAddRecDependentOnLoop(S, L);
684    });
685 if (I != BaseRegs.end())
686 std::swap(ScaledReg, *I);
687 }
688 assert(isCanonical(L) && "Failed to canonicalize?");
689}
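// Worked example (illustrative, not from the upstream source): a formula with
// BaseRegs = { reg1, {0,+,4}<%L> } and no ScaledReg is not canonical. Calling
// canonicalize(%L) pops the last base reg into ScaledReg with Scale = 1 and,
// if the popped reg were not the one carrying the addrec for %L, swaps it with
// the base reg that is, yielding
//   reg(reg1) + 1*reg({0,+,4}<%L>)
// which satisfies Formula::isCanonical().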
690
691/// Get rid of the scale in the formula.
692/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
693/// \return true if it was possible to get rid of the scale, false otherwise.
694/// \note After this operation the formula may not be in the canonical form.
695bool Formula::unscale() {
696 if (Scale != 1)
697 return false;
698 Scale = 0;
699 BaseRegs.push_back(ScaledReg);
700 ScaledReg = nullptr;
701 return true;
702}
703
704bool Formula::hasZeroEnd() const {
705 if (UnfoldedOffset || BaseOffset)
706 return false;
707 if (BaseRegs.size() != 1 || ScaledReg)
708 return false;
709 return true;
710}
711
712/// Return the total number of register operands used by this formula. This does
713/// not include register uses implied by non-constant addrec strides.
714size_t Formula::getNumRegs() const {
715 return !!ScaledReg + BaseRegs.size();
716}
717
718/// Return the type of this formula, if it has one, or null otherwise. This type
719/// is meaningless except for the bit size.
720Type *Formula::getType() const {
721 return !BaseRegs.empty() ? BaseRegs.front()->getType() :
722 ScaledReg ? ScaledReg->getType() :
723 BaseGV ? BaseGV->getType() :
724 nullptr;
725}
726
727/// Delete the given base reg from the BaseRegs list.
728void Formula::deleteBaseReg(const SCEV *&S) {
729 if (&S != &BaseRegs.back())
730 std::swap(S, BaseRegs.back());
731 BaseRegs.pop_back();
732}
733
734/// Test if this formula references the given register.
735bool Formula::referencesReg(const SCEV *S) const {
736 return S == ScaledReg || is_contained(BaseRegs, S);
737}
738
739/// Test whether this formula uses registers which are used by uses other than
740/// the use with the given index.
741bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
742 const RegUseTracker &RegUses) const {
743 if (ScaledReg)
744 if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
745 return true;
746 for (const SCEV *BaseReg : BaseRegs)
747 if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
748 return true;
749 return false;
750}
751
752#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
753void Formula::print(raw_ostream &OS) const {
754 bool First = true;
755 if (BaseGV) {
756 if (!First) OS << " + "; else First = false;
757 BaseGV->printAsOperand(OS, /*PrintType=*/false);
758 }
759 if (BaseOffset.isNonZero()) {
760 if (!First) OS << " + "; else First = false;
761 OS << BaseOffset;
762 }
763 for (const SCEV *BaseReg : BaseRegs) {
764 if (!First) OS << " + "; else First = false;
765 OS << "reg(" << *BaseReg << ')';
766 }
767 if (HasBaseReg && BaseRegs.empty()) {
768 if (!First) OS << " + "; else First = false;
769 OS << "**error: HasBaseReg**";
770 } else if (!HasBaseReg && !BaseRegs.empty()) {
771 if (!First) OS << " + "; else First = false;
772 OS << "**error: !HasBaseReg**";
773 }
774 if (Scale != 0) {
775 if (!First) OS << " + "; else First = false;
776 OS << Scale << "*reg(";
777 if (ScaledReg)
778 OS << *ScaledReg;
779 else
780 OS << "<unknown>";
781 OS << ')';
782 }
783 if (UnfoldedOffset.isNonZero()) {
784 if (!First) OS << " + ";
785 OS << "imm(" << UnfoldedOffset << ')';
786 }
787}
788
789LLVM_DUMP_METHOD void Formula::dump() const {
790 print(errs()); errs() << '\n';
791}
792#endif
793
794/// Return true if the given addrec can be sign-extended without changing its
795/// value.
796static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
797  Type *WideTy =
798      IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
799  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
800}
801
802/// Return true if the given add can be sign-extended without changing its
803/// value.
804static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
805 Type *WideTy =
806 IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
807 return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
808}
809
810/// Return true if the given mul can be sign-extended without changing its
811/// value.
812static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
813 Type *WideTy =
814      IntegerType::get(SE.getContext(),
815                       SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
816 return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
817}
818
819/// Return an expression for LHS /s RHS, if it can be determined and if the
820/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
821/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
822/// the multiplication may overflow, which is useful when the result will be
823/// used in a context where the most significant bits are ignored.
824static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
825 ScalarEvolution &SE,
826 bool IgnoreSignificantBits = false) {
827 // Handle the trivial case, which works for any SCEV type.
828 if (LHS == RHS)
829 return SE.getConstant(LHS->getType(), 1);
830
831 // Handle a few RHS special cases.
832 const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
833 if (RC) {
834 const APInt &RA = RC->getAPInt();
835 // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
836 // some folding.
837 if (RA.isAllOnes()) {
838 if (LHS->getType()->isPointerTy())
839 return nullptr;
840 return SE.getMulExpr(LHS, RC);
841 }
842 // Handle x /s 1 as x.
843 if (RA == 1)
844 return LHS;
845 }
846
847 // Check for a division of a constant by a constant.
848 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
849 if (!RC)
850 return nullptr;
851 const APInt &LA = C->getAPInt();
852 const APInt &RA = RC->getAPInt();
853 if (LA.srem(RA) != 0)
854 return nullptr;
855 return SE.getConstant(LA.sdiv(RA));
856 }
857
858 // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
859 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
860 if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
861 const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
862 IgnoreSignificantBits);
863 if (!Step) return nullptr;
864 const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
865 IgnoreSignificantBits);
866 if (!Start) return nullptr;
867 // FlagNW is independent of the start value, step direction, and is
868 // preserved with smaller magnitude steps.
869 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
870 return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
871 }
872 return nullptr;
873 }
874
875 // Distribute the sdiv over add operands, if the add doesn't overflow.
876 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
877 if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
878      SmallVector<const SCEV *, 4> Ops;
879      for (const SCEV *S : Add->operands()) {
880 const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
881 if (!Op) return nullptr;
882 Ops.push_back(Op);
883 }
884 return SE.getAddExpr(Ops);
885 }
886 return nullptr;
887 }
888
889 // Check for a multiply operand that we can pull RHS out of.
890 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
891 if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
892 // Handle special case C1*X*Y /s C2*X*Y.
893 if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
894 if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
895 const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
896 const SCEVConstant *RC =
897 dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
898 if (LC && RC) {
899            SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
900            SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
901 if (LOps == ROps)
902 return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
903 }
904 }
905 }
906
907      SmallVector<const SCEV *, 4> Ops;
908      bool Found = false;
909 for (const SCEV *S : Mul->operands()) {
910 if (!Found)
911 if (const SCEV *Q = getExactSDiv(S, RHS, SE,
912 IgnoreSignificantBits)) {
913 S = Q;
914 Found = true;
915 }
916 Ops.push_back(S);
917 }
918 return Found ? SE.getMulExpr(Ops) : nullptr;
919 }
920 return nullptr;
921 }
922
923 // Otherwise we don't know.
924 return nullptr;
925}
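// Worked example (illustrative, not from the upstream source): when the
// division is provably exact (or IgnoreSignificantBits is set), getExactSDiv
// distributes over addrec and add operands, e.g.
//   {0,+,4}<%L> /s 4     ==>  {0,+,1}<%L>
//   (8 + (4 * %x)) /s 4  ==>  (2 + %x)
// while 6 /s 4 has a non-zero remainder and yields nullptr.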
926
927/// If S involves the addition of a constant integer value, return that integer
928/// value, and mutate S to point to a new SCEV with that value excluded.
929static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
930 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
931 if (C->getAPInt().getSignificantBits() <= 64) {
932 S = SE.getConstant(C->getType(), 0);
933 return Immediate::getFixed(C->getValue()->getSExtValue());
934 }
935 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
936 SmallVector<const SCEV *, 8> NewOps(Add->operands());
937 Immediate Result = ExtractImmediate(NewOps.front(), SE);
938 if (Result.isNonZero())
939 S = SE.getAddExpr(NewOps);
940 return Result;
941 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
942 SmallVector<const SCEV *, 8> NewOps(AR->operands());
943 Immediate Result = ExtractImmediate(NewOps.front(), SE);
944 if (Result.isNonZero())
945 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
946 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
947                           SCEV::FlagAnyWrap);
948    return Result;
949 } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
950 if (EnableVScaleImmediates && M->getNumOperands() == 2) {
951 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
952 if (isa<SCEVVScale>(M->getOperand(1))) {
953 S = SE.getConstant(M->getType(), 0);
954 return Immediate::getScalable(C->getValue()->getSExtValue());
955 }
956 }
957 }
958 return Immediate::getZero();
959}
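// Worked example (illustrative, not from the upstream source): ExtractImmediate
// peels a constant addend off S and rewrites S in place, e.g. for S = (4 + %a)
// it returns Immediate::getFixed(4) and leaves S = %a; for S = (16 * vscale)
// (with -lsr-enable-vscale-immediates) it returns Immediate::getScalable(16)
// and leaves S = 0.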
960
961/// If S involves the addition of a GlobalValue address, return that symbol, and
962/// mutate S to point to a new SCEV with that value excluded.
963static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
964  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
965 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
966 S = SE.getConstant(GV->getType(), 0);
967 return GV;
968 }
969 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
970 SmallVector<const SCEV *, 8> NewOps(Add->operands());
971 GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
972 if (Result)
973 S = SE.getAddExpr(NewOps);
974 return Result;
975 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
976 SmallVector<const SCEV *, 8> NewOps(AR->operands());
977 GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
978 if (Result)
979 S = SE.getAddRecExpr(NewOps, AR->getLoop(),
980 // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
981                           SCEV::FlagAnyWrap);
982    return Result;
983 }
984 return nullptr;
985}
986
987/// Returns true if the specified instruction is using the specified value as an
988/// address.
989static bool isAddressUse(const TargetTransformInfo &TTI,
990                         Instruction *Inst, Value *OperandVal) {
991 bool isAddress = isa<LoadInst>(Inst);
992 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
993 if (SI->getPointerOperand() == OperandVal)
994 isAddress = true;
995 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
996 // Addressing modes can also be folded into prefetches and a variety
997 // of intrinsics.
998 switch (II->getIntrinsicID()) {
999 case Intrinsic::memset:
1000 case Intrinsic::prefetch:
1001 case Intrinsic::masked_load:
1002 if (II->getArgOperand(0) == OperandVal)
1003 isAddress = true;
1004 break;
1005 case Intrinsic::masked_store:
1006 if (II->getArgOperand(1) == OperandVal)
1007 isAddress = true;
1008 break;
1009 case Intrinsic::memmove:
1010 case Intrinsic::memcpy:
1011 if (II->getArgOperand(0) == OperandVal ||
1012 II->getArgOperand(1) == OperandVal)
1013 isAddress = true;
1014 break;
1015 default: {
1016 MemIntrinsicInfo IntrInfo;
1017 if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
1018 if (IntrInfo.PtrVal == OperandVal)
1019 isAddress = true;
1020 }
1021 }
1022 }
1023 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1024 if (RMW->getPointerOperand() == OperandVal)
1025 isAddress = true;
1026 } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1027 if (CmpX->getPointerOperand() == OperandVal)
1028 isAddress = true;
1029 }
1030 return isAddress;
1031}
1032
1033/// Return the type of the memory being accessed.
1034static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
1035 Instruction *Inst, Value *OperandVal) {
1036 MemAccessTy AccessTy = MemAccessTy::getUnknown(Inst->getContext());
1037
1038 // First get the type of memory being accessed.
1039 if (Type *Ty = Inst->getAccessType())
1040 AccessTy.MemTy = Ty;
1041
1042 // Then get the pointer address space.
1043 if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
1044 AccessTy.AddrSpace = SI->getPointerAddressSpace();
1045 } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
1046 AccessTy.AddrSpace = LI->getPointerAddressSpace();
1047 } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
1048 AccessTy.AddrSpace = RMW->getPointerAddressSpace();
1049 } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
1050 AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
1051 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
1052 switch (II->getIntrinsicID()) {
1053 case Intrinsic::prefetch:
1054 case Intrinsic::memset:
1055 AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
1056 AccessTy.MemTy = OperandVal->getType();
1057 break;
1058 case Intrinsic::memmove:
1059 case Intrinsic::memcpy:
1060 AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
1061 AccessTy.MemTy = OperandVal->getType();
1062 break;
1063 case Intrinsic::masked_load:
1064 AccessTy.AddrSpace =
1065 II->getArgOperand(0)->getType()->getPointerAddressSpace();
1066 break;
1067 case Intrinsic::masked_store:
1068 AccessTy.AddrSpace =
1069 II->getArgOperand(1)->getType()->getPointerAddressSpace();
1070 break;
1071 default: {
1072 MemIntrinsicInfo IntrInfo;
1073 if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
1074 AccessTy.AddrSpace
1075 = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
1076 }
1077
1078 break;
1079 }
1080 }
1081 }
1082
1083 return AccessTy;
1084}
1085
1086/// Return true if this AddRec is already a phi in its loop.
1087static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
1088 for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
1089 if (SE.isSCEVable(PN.getType()) &&
1090 (SE.getEffectiveSCEVType(PN.getType()) ==
1091 SE.getEffectiveSCEVType(AR->getType())) &&
1092 SE.getSCEV(&PN) == AR)
1093 return true;
1094 }
1095 return false;
1096}
1097
1098/// Check if expanding this expression is likely to incur significant cost. This
1099/// is tricky because SCEV doesn't track which expressions are actually computed
1100/// by the current IR.
1101///
1102/// We currently allow expansion of IV increments that involve adds,
1103/// multiplication by constants, and AddRecs from existing phis.
1104///
1105/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
1106/// obvious multiple of the UDivExpr.
1107static bool isHighCostExpansion(const SCEV *S,
1109 ScalarEvolution &SE) {
1110 // Zero/One operand expressions
1111 switch (S->getSCEVType()) {
1112 case scUnknown:
1113 case scConstant:
1114 case scVScale:
1115 return false;
1116 case scTruncate:
1117 return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
1118 Processed, SE);
1119 case scZeroExtend:
1120 return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
1121 Processed, SE);
1122 case scSignExtend:
1123 return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
1124 Processed, SE);
1125 default:
1126 break;
1127 }
1128
1129 if (!Processed.insert(S).second)
1130 return false;
1131
1132 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
1133 for (const SCEV *S : Add->operands()) {
1134 if (isHighCostExpansion(S, Processed, SE))
1135 return true;
1136 }
1137 return false;
1138 }
1139
1140 if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
1141 if (Mul->getNumOperands() == 2) {
1142 // Multiplication by a constant is ok
1143 if (isa<SCEVConstant>(Mul->getOperand(0)))
1144 return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
1145
1146 // If we have the value of one operand, check if an existing
1147 // multiplication already generates this expression.
1148 if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
1149 Value *UVal = U->getValue();
1150 for (User *UR : UVal->users()) {
1151 // If U is a constant, it may be used by a ConstantExpr.
1152 Instruction *UI = dyn_cast<Instruction>(UR);
1153 if (UI && UI->getOpcode() == Instruction::Mul &&
1154 SE.isSCEVable(UI->getType())) {
1155 return SE.getSCEV(UI) == Mul;
1156 }
1157 }
1158 }
1159 }
1160 }
1161
1162 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
1163 if (isExistingPhi(AR, SE))
1164 return false;
1165 }
1166
1167  // For now, consider any other type of expression (div/mul/min/max) high cost.
1168 return true;
1169}
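// Worked example (illustrative, not from the upstream source): an increment of
// (4 * %n) is considered cheap only if some existing mul instruction already
// computes it, i.e. SE.getSCEV of that instruction equals the expression;
// truncates/extends are looked through, while udiv, min and max expressions
// are always treated as high cost.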
1170
1171namespace {
1172
1173class LSRUse;
1174
1175} // end anonymous namespace
1176
1177/// Check if the addressing mode defined by \p F is completely
1178/// folded in \p LU at isel time.
1179/// This includes address-mode folding and special icmp tricks.
1180/// This function returns true if \p LU can accommodate what \p F
1181/// defines and up to 1 base + 1 scaled + offset.
1182/// In other words, if \p F has several base registers, this function may
1183/// still return true. Therefore, users still need to account for
1184/// additional base registers and/or unfolded offsets to derive an
1185/// accurate cost model.
1187 const LSRUse &LU, const Formula &F);
1188
1189// Get the cost of the scaling factor used in F for LU.
1191 const LSRUse &LU, const Formula &F,
1192 const Loop &L);
1193
1194namespace {
1195
1196/// This class is used to measure and compare candidate formulae.
1197class Cost {
1198 const Loop *L = nullptr;
1199 ScalarEvolution *SE = nullptr;
1200 const TargetTransformInfo *TTI = nullptr;
1201  TargetTransformInfo::LSRCost C;
1202  TTI::AddressingModeKind AMK = TTI::AMK_None;
1203
1204public:
1205 Cost() = delete;
1206 Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
1208 L(L), SE(&SE), TTI(&TTI), AMK(AMK) {
1209 C.Insns = 0;
1210 C.NumRegs = 0;
1211 C.AddRecCost = 0;
1212 C.NumIVMuls = 0;
1213 C.NumBaseAdds = 0;
1214 C.ImmCost = 0;
1215 C.SetupCost = 0;
1216 C.ScaleCost = 0;
1217 }
1218
1219 bool isLess(const Cost &Other) const;
1220
1221 void Lose();
1222
1223#ifndef NDEBUG
1224 // Once any of the metrics loses, they must all remain losers.
1225 bool isValid() {
1226 return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
1227 | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
1228 || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
1229 & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
1230 }
1231#endif
1232
1233 bool isLoser() {
1234 assert(isValid() && "invalid cost");
1235 return C.NumRegs == ~0u;
1236 }
1237
1238 void RateFormula(const Formula &F,
1240 const DenseSet<const SCEV *> &VisitedRegs,
1241 const LSRUse &LU,
1242 SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
1243
1244 void print(raw_ostream &OS) const;
1245 void dump() const;
1246
1247private:
1248 void RateRegister(const Formula &F, const SCEV *Reg,
1250 void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1253};
1254
1255/// An operand value in an instruction which is to be replaced with some
1256/// equivalent, possibly strength-reduced, replacement.
1257struct LSRFixup {
1258 /// The instruction which will be updated.
1259 Instruction *UserInst = nullptr;
1260
1261 /// The operand of the instruction which will be replaced. The operand may be
1262 /// used more than once; every instance will be replaced.
1263 Value *OperandValToReplace = nullptr;
1264
1265 /// If this user is to use the post-incremented value of an induction
1266 /// variable, this set is non-empty and holds the loops associated with the
1267 /// induction variable.
1268 PostIncLoopSet PostIncLoops;
1269
1270 /// A constant offset to be added to the LSRUse expression. This allows
1271 /// multiple fixups to share the same LSRUse with different offsets, for
1272 /// example in an unrolled loop.
1273 Immediate Offset = Immediate::getZero();
1274
1275 LSRFixup() = default;
1276
1277 bool isUseFullyOutsideLoop(const Loop *L) const;
1278
1279 void print(raw_ostream &OS) const;
1280 void dump() const;
1281};
1282
1283/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
1284/// SmallVectors of const SCEV*.
1285struct UniquifierDenseMapInfo {
1286 static SmallVector<const SCEV *, 4> getEmptyKey() {
1287    SmallVector<const SCEV *, 4> V;
1288    V.push_back(reinterpret_cast<const SCEV *>(-1));
1289 return V;
1290 }
1291
1292 static SmallVector<const SCEV *, 4> getTombstoneKey() {
1293    SmallVector<const SCEV *, 4> V;
1294    V.push_back(reinterpret_cast<const SCEV *>(-2));
1295 return V;
1296 }
1297
1298 static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
1299 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
1300 }
1301
1302 static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
1303                      const SmallVector<const SCEV *, 4> &RHS) {
1304    return LHS == RHS;
1305 }
1306};
1307
1308/// This class holds the state that LSR keeps for each use in IVUsers, as well
1309/// as uses invented by LSR itself. It includes information about what kinds of
1310/// things can be folded into the user, information about the user itself, and
1311/// information about how the use may be satisfied. TODO: Represent multiple
1312/// users of the same expression in common?
1313class LSRUse {
1314 DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
1315
1316public:
1317 /// An enum for a kind of use, indicating what types of scaled and immediate
1318 /// operands it might support.
1319 enum KindType {
1320 Basic, ///< A normal use, with no folding.
1321 Special, ///< A special case of basic, allowing -1 scales.
1322 Address, ///< An address use; folding according to TargetLowering
1323 ICmpZero ///< An equality icmp with both operands folded into one.
1324 // TODO: Add a generic icmp too?
1325 };
1326
1327 using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
1328
1329 KindType Kind;
1330 MemAccessTy AccessTy;
1331
1332 /// The list of operands which are to be replaced.
1333  SmallVector<LSRFixup, 8> Fixups;
1334
1335 /// Keep track of the min and max offsets of the fixups.
1336 Immediate MinOffset = Immediate::getFixedMax();
1337 Immediate MaxOffset = Immediate::getFixedMin();
1338
1339 /// This records whether all of the fixups using this LSRUse are outside of
1340 /// the loop, in which case some special-case heuristics may be used.
1341 bool AllFixupsOutsideLoop = true;
1342
1343 /// RigidFormula is set to true to guarantee that this use will be associated
1344 /// with a single formula--the one that initially matched. Some SCEV
1345 /// expressions cannot be expanded. This allows LSR to consider the registers
1346 /// used by those expressions without the need to expand them later after
1347 /// changing the formula.
1348 bool RigidFormula = false;
1349
1350 /// This records the widest use type for any fixup using this
1351 /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
1352 /// fixup widths to be equivalent, because the narrower one may be relying on
1353 /// the implicit truncation to truncate away bogus bits.
1354 Type *WidestFixupType = nullptr;
1355
1356 /// A list of ways to build a value that can satisfy this user. After the
1357 /// list is populated, one of these is selected heuristically and used to
1358 /// formulate a replacement for OperandValToReplace in UserInst.
1359 SmallVector<Formula, 12> Formulae;
1360
1361 /// The set of register candidates used by all formulae in this LSRUse.
1362  SmallPtrSet<const SCEV *, 4> Regs;
1363
1364 LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
1365
1366 LSRFixup &getNewFixup() {
1367 Fixups.push_back(LSRFixup());
1368 return Fixups.back();
1369 }
1370
1371 void pushFixup(LSRFixup &f) {
1372 Fixups.push_back(f);
1373 if (Immediate::isKnownGT(f.Offset, MaxOffset))
1374 MaxOffset = f.Offset;
1375 if (Immediate::isKnownLT(f.Offset, MinOffset))
1376 MinOffset = f.Offset;
1377 }
1378
1379 bool HasFormulaWithSameRegs(const Formula &F) const;
1380 float getNotSelectedProbability(const SCEV *Reg) const;
1381 bool InsertFormula(const Formula &F, const Loop &L);
1382 void DeleteFormula(Formula &F);
1383 void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
1384
1385 void print(raw_ostream &OS) const;
1386 void dump() const;
1387};
1388
1389} // end anonymous namespace
1390
1391static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1392                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
1393 GlobalValue *BaseGV, Immediate BaseOffset,
1394 bool HasBaseReg, int64_t Scale,
1395 Instruction *Fixup = nullptr);
1396
1397static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
1398 if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
1399 return 1;
1400 if (Depth == 0)
1401 return 0;
1402 if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
1403 return getSetupCost(S->getStart(), Depth - 1);
1404 if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
1405 return getSetupCost(S->getOperand(), Depth - 1);
1406 if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
1407 return std::accumulate(S->operands().begin(), S->operands().end(), 0,
1408 [&](unsigned i, const SCEV *Reg) {
1409 return i + getSetupCost(Reg, Depth - 1);
1410 });
1411 if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
1412 return getSetupCost(S->getLHS(), Depth - 1) +
1413 getSetupCost(S->getRHS(), Depth - 1);
1414 return 0;
1415}
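// Worked example (illustrative, not from the upstream source): for the addrec
// {%base,+,4}<%L>, getSetupCost recurses into the start value and returns 1,
// since %base is a SCEVUnknown; once Depth reaches 0 the recursion is cut off
// and deeper subexpressions contribute nothing, which keeps this heuristic
// bounded (see -lsr-setupcost-depth-limit).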
1416
1417/// Tally up interesting quantities from the given register.
1418void Cost::RateRegister(const Formula &F, const SCEV *Reg,
1419                        SmallPtrSetImpl<const SCEV *> &Regs) {
1420  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
1421 // If this is an addrec for another loop, it should be an invariant
1422 // with respect to L since L is the innermost loop (at least
1423 // for now LSR only handles innermost loops).
1424 if (AR->getLoop() != L) {
1425      // If the AddRec exists, consider its register free and leave it alone.
1426 if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
1427 return;
1428
1429 // It is bad to allow LSR for current loop to add induction variables
1430 // for its sibling loops.
1431 if (!AR->getLoop()->contains(L)) {
1432 Lose();
1433 return;
1434 }
1435
1436 // Otherwise, it will be an invariant with respect to Loop L.
1437 ++C.NumRegs;
1438 return;
1439 }
1440
1441 unsigned LoopCost = 1;
1442 if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
1443 TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
1444
1445 // If the step size matches the base offset, we could use pre-indexed
1446 // addressing.
1447 if (AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed()) {
1448 if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
1449 if (Step->getAPInt() == F.BaseOffset.getFixedValue())
1450 LoopCost = 0;
1451 } else if (AMK == TTI::AMK_PostIndexed) {
1452 const SCEV *LoopStep = AR->getStepRecurrence(*SE);
1453 if (isa<SCEVConstant>(LoopStep)) {
1454 const SCEV *LoopStart = AR->getStart();
1455 if (!isa<SCEVConstant>(LoopStart) &&
1456 SE->isLoopInvariant(LoopStart, L))
1457 LoopCost = 0;
1458 }
1459 }
1460 }
1461 C.AddRecCost += LoopCost;
1462
1463 // Add the step value register, if it needs one.
1464 // TODO: The non-affine case isn't precisely modeled here.
1465 if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
1466 if (!Regs.count(AR->getOperand(1))) {
1467 RateRegister(F, AR->getOperand(1), Regs);
1468 if (isLoser())
1469 return;
1470 }
1471 }
1472 }
1473 ++C.NumRegs;
1474
1475 // Rough heuristic; favor registers which don't require extra setup
1476 // instructions in the preheader.
1477 C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
1478  // Ensure we don't, even with the recursion limit, produce invalid costs.
1479 C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
1480
1481 C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
1482 SE->hasComputableLoopEvolution(Reg, L);
1483}
1484
1485/// Record this register in the set. If we haven't seen it before, rate
1486/// it. Optional LoserRegs provides a way to declare any formula that refers to
1487/// one of those regs an instant loser.
1488void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
1489                               SmallPtrSetImpl<const SCEV *> &Regs,
1490                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1491 if (LoserRegs && LoserRegs->count(Reg)) {
1492 Lose();
1493 return;
1494 }
1495 if (Regs.insert(Reg).second) {
1496 RateRegister(F, Reg, Regs);
1497 if (LoserRegs && isLoser())
1498 LoserRegs->insert(Reg);
1499 }
1500}
1501
1502void Cost::RateFormula(const Formula &F,
1504 const DenseSet<const SCEV *> &VisitedRegs,
1505 const LSRUse &LU,
1506 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
1507 if (isLoser())
1508 return;
1509 assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
1510 // Tally up the registers.
1511 unsigned PrevAddRecCost = C.AddRecCost;
1512 unsigned PrevNumRegs = C.NumRegs;
1513 unsigned PrevNumBaseAdds = C.NumBaseAdds;
1514 if (const SCEV *ScaledReg = F.ScaledReg) {
1515 if (VisitedRegs.count(ScaledReg)) {
1516 Lose();
1517 return;
1518 }
1519 RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
1520 if (isLoser())
1521 return;
1522 }
1523 for (const SCEV *BaseReg : F.BaseRegs) {
1524 if (VisitedRegs.count(BaseReg)) {
1525 Lose();
1526 return;
1527 }
1528 RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
1529 if (isLoser())
1530 return;
1531 }
1532
1533 // Determine how many (unfolded) adds we'll need inside the loop.
1534 size_t NumBaseParts = F.getNumRegs();
1535 if (NumBaseParts > 1)
1536 // Do not count the base and a possible second register if the target
1537    // can fold 2 registers.
1538 C.NumBaseAdds +=
1539 NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
1540 C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
1541
1542 // Accumulate non-free scaling amounts.
1543 C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
1544
1545 // Tally up the non-zero immediates.
1546 for (const LSRFixup &Fixup : LU.Fixups) {
1547 if (Fixup.Offset.isCompatibleImmediate(F.BaseOffset)) {
1548 Immediate Offset = Fixup.Offset.addUnsigned(F.BaseOffset);
1549 if (F.BaseGV)
1550 C.ImmCost += 64; // Handle symbolic values conservatively.
1551 // TODO: This should probably be the pointer size.
1552 else if (Offset.isNonZero())
1553 C.ImmCost +=
1554 APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
1555
1556 // Check with target if this offset with this instruction is
1557 // specifically not supported.
1558 if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
1559 !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1560 Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
1561 C.NumBaseAdds++;
1562 } else {
1563      // Incompatible immediate type; increase the cost to avoid using it.
1564 C.ImmCost += 2048;
1565 }
1566 }
1567
1568  // If we don't count instruction cost, exit here.
1569 if (!InsnsCost) {
1570 assert(isValid() && "invalid cost");
1571 return;
1572 }
1573
1574 // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
1575  // an additional instruction (at least a fill).
1576  // TODO: Need to distinguish register classes?
1577 unsigned TTIRegNum = TTI->getNumberOfRegisters(
1578 TTI->getRegisterClassForType(false, F.getType())) - 1;
1579 if (C.NumRegs > TTIRegNum) {
1580    // If the cost already exceeded TTIRegNum, then only newly added registers can add
1581 // new instructions.
1582 if (PrevNumRegs > TTIRegNum)
1583 C.Insns += (C.NumRegs - PrevNumRegs);
1584 else
1585 C.Insns += (C.NumRegs - TTIRegNum);
1586 }
1587
1588  // If an ICmpZero formula does not end with 0, it cannot be replaced by
1589 // just add or sub. We'll need to compare final result of AddRec.
1590 // That means we'll need an additional instruction. But if the target can
1591 // macro-fuse a compare with a branch, don't count this extra instruction.
1592 // For -10 + {0, +, 1}:
1593 // i = i + 1;
1594 // cmp i, 10
1595 //
1596 // For {-10, +, 1}:
1597 // i = i + 1;
1598 if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
1599 !TTI->canMacroFuseCmp())
1600 C.Insns++;
1601 // Each new AddRec adds 1 instruction to calculation.
1602 C.Insns += (C.AddRecCost - PrevAddRecCost);
1603
1604 // BaseAdds adds instructions for unfolded registers.
1605 if (LU.Kind != LSRUse::ICmpZero)
1606 C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
1607 assert(isValid() && "invalid cost");
1608}
1609
1610/// Set this cost to a losing value.
1611void Cost::Lose() {
1612 C.Insns = std::numeric_limits<unsigned>::max();
1613 C.NumRegs = std::numeric_limits<unsigned>::max();
1614 C.AddRecCost = std::numeric_limits<unsigned>::max();
1615 C.NumIVMuls = std::numeric_limits<unsigned>::max();
1616 C.NumBaseAdds = std::numeric_limits<unsigned>::max();
1617 C.ImmCost = std::numeric_limits<unsigned>::max();
1618 C.SetupCost = std::numeric_limits<unsigned>::max();
1619 C.ScaleCost = std::numeric_limits<unsigned>::max();
1620}
1621
1622/// Choose the lower cost.
1623bool Cost::isLess(const Cost &Other) const {
1624 if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
1625 C.Insns != Other.C.Insns)
1626 return C.Insns < Other.C.Insns;
1627 return TTI->isLSRCostLess(C, Other.C);
1628}
1629
1630#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1631void Cost::print(raw_ostream &OS) const {
1632 if (InsnsCost)
1633 OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
1634 OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
1635 if (C.AddRecCost != 0)
1636 OS << ", with addrec cost " << C.AddRecCost;
1637 if (C.NumIVMuls != 0)
1638 OS << ", plus " << C.NumIVMuls << " IV mul"
1639 << (C.NumIVMuls == 1 ? "" : "s");
1640 if (C.NumBaseAdds != 0)
1641 OS << ", plus " << C.NumBaseAdds << " base add"
1642 << (C.NumBaseAdds == 1 ? "" : "s");
1643 if (C.ScaleCost != 0)
1644 OS << ", plus " << C.ScaleCost << " scale cost";
1645 if (C.ImmCost != 0)
1646 OS << ", plus " << C.ImmCost << " imm cost";
1647 if (C.SetupCost != 0)
1648 OS << ", plus " << C.SetupCost << " setup cost";
1649}
1650
1651LLVM_DUMP_METHOD void Cost::dump() const {
1652 print(errs()); errs() << '\n';
1653}
1654#endif
1655
1656/// Test whether this fixup always uses its value outside of the given loop.
1657bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
1658 // PHI nodes use their value in their incoming blocks.
1659 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
1660 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
1661 if (PN->getIncomingValue(i) == OperandValToReplace &&
1662 L->contains(PN->getIncomingBlock(i)))
1663 return false;
1664 return true;
1665 }
1666
1667 return !L->contains(UserInst);
1668}
1669
1670#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1671void LSRFixup::print(raw_ostream &OS) const {
1672 OS << "UserInst=";
1673 // Store is common and interesting enough to be worth special-casing.
1674 if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
1675 OS << "store ";
1676 Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
1677 } else if (UserInst->getType()->isVoidTy())
1678 OS << UserInst->getOpcodeName();
1679 else
1680 UserInst->printAsOperand(OS, /*PrintType=*/false);
1681
1682 OS << ", OperandValToReplace=";
1683 OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
1684
1685 for (const Loop *PIL : PostIncLoops) {
1686 OS << ", PostIncLoop=";
1687 PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
1688 }
1689
1690 if (Offset.isNonZero())
1691 OS << ", Offset=" << Offset;
1692}
1693
1694LLVM_DUMP_METHOD void LSRFixup::dump() const {
1695 print(errs()); errs() << '\n';
1696}
1697#endif
1698
1699/// Test whether this use has a formula with the same registers as the given
1700/// formula.
1701bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
1702 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1703 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1704 // Unstable sort by host order ok, because this is only used for uniquifying.
1705 llvm::sort(Key);
1706 return Uniquifier.count(Key);
1707}
1708
1709/// Return the probability of selecting a formula that does not reference Reg.
1710float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
1711 unsigned FNum = 0;
1712 for (const Formula &F : Formulae)
1713 if (F.referencesReg(Reg))
1714 FNum++;
1715 return ((float)(Formulae.size() - FNum)) / Formulae.size();
1716}
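// Illustrative example (hypothetical numbers): if this use has 4 formulae and
// exactly 1 of them references Reg, getNotSelectedProbability(Reg) returns
// (4 - 1) / 4 = 0.75.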
1717
1718/// If the given formula has not yet been inserted, add it to the list, and
1719/// return true. Return false otherwise. The formula must be in canonical form.
1720bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
1721 assert(F.isCanonical(L) && "Invalid canonical representation");
1722
1723 if (!Formulae.empty() && RigidFormula)
1724 return false;
1725
1726 SmallVector<const SCEV *, 4> Key = F.BaseRegs;
1727 if (F.ScaledReg) Key.push_back(F.ScaledReg);
1728 // Unstable sort by host order ok, because this is only used for uniquifying.
1729 llvm::sort(Key);
1730
1731 if (!Uniquifier.insert(Key).second)
1732 return false;
1733
1734 // Using a register to hold the value of 0 is not profitable.
1735 assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
1736 "Zero allocated in a scaled register!");
1737#ifndef NDEBUG
1738 for (const SCEV *BaseReg : F.BaseRegs)
1739 assert(!BaseReg->isZero() && "Zero allocated in a base register!");
1740#endif
1741
1742 // Add the formula to the list.
1743 Formulae.push_back(F);
1744
1745 // Record registers now being used by this use.
1746 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1747 if (F.ScaledReg)
1748 Regs.insert(F.ScaledReg);
1749
1750 return true;
1751}
1752
1753/// Remove the given formula from this use's list.
1754void LSRUse::DeleteFormula(Formula &F) {
1755 if (&F != &Formulae.back())
1756 std::swap(F, Formulae.back());
1757 Formulae.pop_back();
1758}
1759
1760/// Recompute the Regs field, and update RegUses.
1761void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
1762 // Now that we've filtered out some formulae, recompute the Regs set.
1763 SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
1764 Regs.clear();
1765 for (const Formula &F : Formulae) {
1766 if (F.ScaledReg) Regs.insert(F.ScaledReg);
1767 Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
1768 }
1769
1770 // Update the RegTracker.
1771 for (const SCEV *S : OldRegs)
1772 if (!Regs.count(S))
1773 RegUses.dropRegister(S, LUIdx);
1774}
1775
1776#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1777void LSRUse::print(raw_ostream &OS) const {
1778 OS << "LSR Use: Kind=";
1779 switch (Kind) {
1780 case Basic: OS << "Basic"; break;
1781 case Special: OS << "Special"; break;
1782 case ICmpZero: OS << "ICmpZero"; break;
1783 case Address:
1784 OS << "Address of ";
1785 if (AccessTy.MemTy->isPointerTy())
1786 OS << "pointer"; // the full pointer type could be really verbose
1787 else {
1788 OS << *AccessTy.MemTy;
1789 }
1790
1791 OS << " in addrspace(" << AccessTy.AddrSpace << ')';
1792 }
1793
1794 OS << ", Offsets={";
1795 bool NeedComma = false;
1796 for (const LSRFixup &Fixup : Fixups) {
1797 if (NeedComma) OS << ',';
1798 OS << Fixup.Offset;
1799 NeedComma = true;
1800 }
1801 OS << '}';
1802
1803 if (AllFixupsOutsideLoop)
1804 OS << ", all-fixups-outside-loop";
1805
1806 if (WidestFixupType)
1807 OS << ", widest fixup type: " << *WidestFixupType;
1808}
1809
1810LLVM_DUMP_METHOD void LSRUse::dump() const {
1811 print(errs()); errs() << '\n';
1812}
1813#endif
1814
1815 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1816 LSRUse::KindType Kind, MemAccessTy AccessTy,
1817 GlobalValue *BaseGV, Immediate BaseOffset,
1818 bool HasBaseReg, int64_t Scale,
1819 Instruction *Fixup /* = nullptr */) {
1820 switch (Kind) {
1821 case LSRUse::Address: {
1822 int64_t FixedOffset =
1823 BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
1824 int64_t ScalableOffset =
1825 BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
1826 return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
1827 HasBaseReg, Scale, AccessTy.AddrSpace,
1828 Fixup, ScalableOffset);
1829 }
1830 case LSRUse::ICmpZero:
1831 // There's not even a target hook for querying whether it would be legal to
1832 // fold a GV into an ICmp.
1833 if (BaseGV)
1834 return false;
1835
1836 // ICmp only has two operands; don't allow more than two non-trivial parts.
1837 if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
1838 return false;
1839
1840 // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
1841 // putting the scaled register in the other operand of the icmp.
1842 if (Scale != 0 && Scale != -1)
1843 return false;
1844
1845 // If we have low-level target information, ask the target if it can fold an
1846 // integer immediate on an icmp.
1847 if (BaseOffset.isNonZero()) {
1848 // We don't have an interface to query whether the target supports
1849 // icmpzero against scalable quantities yet.
1850 if (BaseOffset.isScalable())
1851 return false;
1852
1853 // We have one of:
1854 // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
1855 // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
1856 // Offs is the ICmp immediate.
1857 if (Scale == 0)
1858 // The cast does the right thing with
1859 // std::numeric_limits<int64_t>::min().
1860 BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue());
1861 return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
1862 }
1863
1864 // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
1865 return true;
1866
1867 case LSRUse::Basic:
1868 // Only handle single-register values.
1869 return !BaseGV && Scale == 0 && BaseOffset.isZero();
1870
1871 case LSRUse::Special:
1872 // Special case Basic to handle -1 scales.
1873 return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
1874 }
1875
1876 llvm_unreachable("Invalid LSRUse Kind!");
1877}
1878
1879 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1880 Immediate MinOffset, Immediate MaxOffset,
1881 LSRUse::KindType Kind, MemAccessTy AccessTy,
1882 GlobalValue *BaseGV, Immediate BaseOffset,
1883 bool HasBaseReg, int64_t Scale) {
1884 if (BaseOffset.isNonZero() &&
1885 (BaseOffset.isScalable() != MinOffset.isScalable() ||
1886 BaseOffset.isScalable() != MaxOffset.isScalable()))
1887 return false;
1888 // Check for overflow.
1889 int64_t Base = BaseOffset.getKnownMinValue();
1890 int64_t Min = MinOffset.getKnownMinValue();
1891 int64_t Max = MaxOffset.getKnownMinValue();
1892 if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
1893 return false;
1894 MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
1895 if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
1896 return false;
1897 MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
1898
1899 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
1900 HasBaseReg, Scale) &&
1901 isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
1902 HasBaseReg, Scale);
1903}
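// Illustrative example (hypothetical numbers): with BaseOffset = 100,
// MinOffset = -16 and MaxOffset = 16, the folded offsets become 84 and 116,
// and both endpoints are then re-checked against the addressing mode. The
// signed-wrap test above rejects cases such as BaseOffset = INT64_MAX with
// MaxOffset = 1, where (uint64_t)Base + Max wraps around to a negative value.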
1904
1905 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1906 Immediate MinOffset, Immediate MaxOffset,
1907 LSRUse::KindType Kind, MemAccessTy AccessTy,
1908 const Formula &F, const Loop &L) {
1909 // For the purpose of isAMCompletelyFolded either having a canonical formula
1910 // or a scale not equal to zero is correct.
1911 // Problems may arise from non canonical formulae having a scale == 0.
1912 // Strictly speaking it would be best to just rely on canonical formulae.
1913 // However, when we generate the scaled formulae, we first check that the
1914 // scaling factor is profitable before computing the actual ScaledReg for
1915 // compile time's sake.
1916 assert((F.isCanonical(L) || F.Scale != 0));
1917 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1918 F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
1919}
1920
1921/// Test whether we know how to expand the current formula.
1922static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1923 Immediate MaxOffset, LSRUse::KindType Kind,
1924 MemAccessTy AccessTy, GlobalValue *BaseGV,
1925 Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
1926 // We know how to expand completely foldable formulae.
1927 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
1928 BaseOffset, HasBaseReg, Scale) ||
1929 // Or formulae that use a base register produced by a sum of base
1930 // registers.
1931 (Scale == 1 &&
1932 isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
1933 BaseGV, BaseOffset, true, 0));
1934}
1935
1936static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
1937 Immediate MaxOffset, LSRUse::KindType Kind,
1938 MemAccessTy AccessTy, const Formula &F) {
1939 return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
1940 F.BaseOffset, F.HasBaseReg, F.Scale);
1941}
1942
1943 static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
1944 Immediate Offset) {
1945 if (Offset.isScalable())
1946 return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
1947
1948 return TTI.isLegalAddImmediate(Offset.getFixedValue());
1949}
1950
1951 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
1952 const LSRUse &LU, const Formula &F) {
1953 // Target may want to look at the user instructions.
1954 if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
1955 for (const LSRFixup &Fixup : LU.Fixups)
1956 if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
1957 (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
1958 F.Scale, Fixup.UserInst))
1959 return false;
1960 return true;
1961 }
1962
1963 return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1964 LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
1965 F.Scale);
1966}
1967
1968 static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
1969 const LSRUse &LU, const Formula &F,
1970 const Loop &L) {
1971 if (!F.Scale)
1972 return 0;
1973
1974 // If the use is not completely folded in that instruction, we will have to
1975 // pay an extra cost only for scale != 1.
1976 if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
1977 LU.AccessTy, F, L))
1978 return F.Scale != 1;
1979
1980 switch (LU.Kind) {
1981 case LSRUse::Address: {
1982 // Check the scaling factor cost with both the min and max offsets.
1983 int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0;
1984 if (F.BaseOffset.isScalable()) {
1985 ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue();
1986 ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue();
1987 } else {
1988 FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue();
1989 FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue();
1990 }
1991 InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
1992 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin),
1993 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1994 InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
1995 LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax),
1996 F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
1997
1998 assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
1999 "Legal addressing mode has an illegal cost!");
2000 return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
2001 }
2002 case LSRUse::ICmpZero:
2003 case LSRUse::Basic:
2004 case LSRUse::Special:
2005 // The use is completely folded, i.e., everything is folded into the
2006 // instruction.
2007 return 0;
2008 }
2009
2010 llvm_unreachable("Invalid LSRUse Kind!");
2011}
2012
2013 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2014 LSRUse::KindType Kind, MemAccessTy AccessTy,
2015 GlobalValue *BaseGV, Immediate BaseOffset,
2016 bool HasBaseReg) {
2017 // Fast-path: zero is always foldable.
2018 if (BaseOffset.isZero() && !BaseGV)
2019 return true;
2020
2021 // Conservatively, create an address with an immediate and a
2022 // base and a scale.
2023 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2024
2025 // Canonicalize a scale of 1 to a base register if the formula doesn't
2026 // already have a base register.
2027 if (!HasBaseReg && Scale == 1) {
2028 Scale = 0;
2029 HasBaseReg = true;
2030 }
2031
2032 // FIXME: Try with + without a scale? Maybe based on TTI?
2033 // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
2034 // default for many architectures, not just AArch64 SVE. More investigation
2035 // needed later to determine if this should be used more widely than just
2036 // on scalable types.
2037 if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
2038 AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
2039 Scale = 0;
2040
2041 return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
2042 HasBaseReg, Scale);
2043}
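// Illustrative example (hypothetical use): for an Address use with a nonzero
// BaseOffset and no base register yet, the conservative query above is made
// for "basereg + offset" (the scale of 1 is canonicalized away and HasBaseReg
// is set), rather than for the offset alone.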
2044
2045 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
2046 ScalarEvolution &SE, Immediate MinOffset,
2047 Immediate MaxOffset, LSRUse::KindType Kind,
2048 MemAccessTy AccessTy, const SCEV *S,
2049 bool HasBaseReg) {
2050 // Fast-path: zero is always foldable.
2051 if (S->isZero()) return true;
2052
2053 // Conservatively, create an address with an immediate and a
2054 // base and a scale.
2055 Immediate BaseOffset = ExtractImmediate(S, SE);
2056 GlobalValue *BaseGV = ExtractSymbol(S, SE);
2057
2058 // If there's anything else involved, it's not foldable.
2059 if (!S->isZero()) return false;
2060
2061 // Fast-path: zero is always foldable.
2062 if (BaseOffset.isZero() && !BaseGV)
2063 return true;
2064
2065 if (BaseOffset.isScalable())
2066 return false;
2067
2068 // Conservatively, create an address with an immediate and a
2069 // base and a scale.
2070 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
2071
2072 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
2073 BaseOffset, HasBaseReg, Scale);
2074}
2075
2076namespace {
2077
2078/// An individual increment in a Chain of IV increments. Relate an IV user to
2079/// an expression that computes the IV it uses from the IV used by the previous
2080/// link in the Chain.
2081///
2082/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
2083/// original IVOperand. The head of the chain's IVOperand is only valid during
2084/// chain collection, before LSR replaces IV users. During chain generation,
2085/// IncExpr can be used to find the new IVOperand that computes the same
2086/// expression.
2087struct IVInc {
2088 Instruction *UserInst;
2089 Value* IVOperand;
2090 const SCEV *IncExpr;
2091
2092 IVInc(Instruction *U, Value *O, const SCEV *E)
2093 : UserInst(U), IVOperand(O), IncExpr(E) {}
2094};
2095
2096// The list of IV increments in program order. We typically add the head of a
2097// chain without finding subsequent links.
2098struct IVChain {
2099 SmallVector<IVInc, 1> Incs;
2100 const SCEV *ExprBase = nullptr;
2101
2102 IVChain() = default;
2103 IVChain(const IVInc &Head, const SCEV *Base)
2104 : Incs(1, Head), ExprBase(Base) {}
2105
2106 using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
2107
2108 // Return the first increment in the chain.
2109 const_iterator begin() const {
2110 assert(!Incs.empty());
2111 return std::next(Incs.begin());
2112 }
2113 const_iterator end() const {
2114 return Incs.end();
2115 }
2116
2117 // Returns true if this chain contains any increments.
2118 bool hasIncs() const { return Incs.size() >= 2; }
2119
2120 // Add an IVInc to the end of this chain.
2121 void add(const IVInc &X) { Incs.push_back(X); }
2122
2123 // Returns the last UserInst in the chain.
2124 Instruction *tailUserInst() const { return Incs.back().UserInst; }
2125
2126 // Returns true if IncExpr can be profitably added to this chain.
2127 bool isProfitableIncrement(const SCEV *OperExpr,
2128 const SCEV *IncExpr,
2129 ScalarEvolution &SE);
2130};
2131
2132/// Helper for CollectChains to track multiple IV increment uses. Distinguish
2133/// between FarUsers that definitely cross IV increments and NearUsers that may
2134/// be used between IV increments.
2135struct ChainUsers {
2136 SmallPtrSet<Instruction*, 4> FarUsers;
2137 SmallPtrSet<Instruction*, 4> NearUsers;
2138};
2139
2140/// This class holds state for the main loop strength reduction logic.
2141class LSRInstance {
2142 IVUsers &IU;
2143 ScalarEvolution &SE;
2144 DominatorTree &DT;
2145 LoopInfo &LI;
2146 AssumptionCache &AC;
2147 TargetLibraryInfo &TLI;
2148 const TargetTransformInfo &TTI;
2149 Loop *const L;
2150 MemorySSAUpdater *MSSAU;
2152 mutable SCEVExpander Rewriter;
2153 bool Changed = false;
2154
2155 /// This is the insert position at which the current loop's induction variable
2156 /// increment should be placed. In simple loops, this is the latch block's
2157 /// terminator. But in more complicated cases, this is a position which will
2158 /// dominate all the in-loop post-increment users.
2159 Instruction *IVIncInsertPos = nullptr;
2160
2161 /// Interesting factors between use strides.
2162 ///
2163 /// We explicitly use a SetVector which contains a SmallSet, instead of the
2164 /// default, a SmallDenseSet, because we need to use the full range of
2165 /// int64_ts, and there's currently no good way of doing that with
2166 /// SmallDenseSet.
2167 SmallSetVector<int64_t, 8> Factors;
2168
2169 /// The cost of the current SCEV, the best solution by LSR will be dropped if
2170 /// the solution is not profitable.
2171 Cost BaselineCost;
2172
2173 /// Interesting use types, to facilitate truncation reuse.
2174 SmallSetVector<Type *, 4> Types;
2175
2176 /// The list of interesting uses.
2177 SmallVector<LSRUse, 16> Uses;
2178
2179 /// Track which uses use which register candidates.
2180 RegUseTracker RegUses;
2181
2182 // Limit the number of chains to avoid quadratic behavior. We don't expect to
2183 // have more than a few IV increment chains in a loop. Missing a Chain falls
2184 // back to normal LSR behavior for those uses.
2185 static const unsigned MaxChains = 8;
2186
2187 /// IV users can form a chain of IV increments.
2188 SmallVector<IVChain, 8> IVChainVec;
2189
2190 /// IV users that belong to profitable IVChains.
2191 SmallPtrSet<Use*, MaxChains> IVIncSet;
2192
2193 /// Induction variables that were generated and inserted by the SCEV Expander.
2194 SmallVector<llvm::WeakVH, 2> ScalarEvolutionIVs;
2195
2196 void OptimizeShadowIV();
2197 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2198 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2199 void OptimizeLoopTermCond();
2200
2201 void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
2202 SmallVectorImpl<ChainUsers> &ChainUsersVec);
2203 void FinalizeChain(IVChain &Chain);
2204 void CollectChains();
2205 void GenerateIVChain(const IVChain &Chain,
2206 SmallVectorImpl<WeakTrackingVH> &DeadInsts);
2207
2208 void CollectInterestingTypesAndFactors();
2209 void CollectFixupsAndInitialFormulae();
2210
2211 // Support for sharing of LSRUses between LSRFixups.
2212 using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
2213 UseMapTy UseMap;
2214
2215 bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg,
2216 LSRUse::KindType Kind, MemAccessTy AccessTy);
2217
2218 std::pair<size_t, Immediate> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
2219 MemAccessTy AccessTy);
2220
2221 void DeleteUse(LSRUse &LU, size_t LUIdx);
2222
2223 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
2224
2225 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2226 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
2227 void CountRegisters(const Formula &F, size_t LUIdx);
2228 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
2229
2230 void CollectLoopInvariantFixupsAndFormulae();
2231
2232 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
2233 unsigned Depth = 0);
2234
2235 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
2236 const Formula &Base, unsigned Depth,
2237 size_t Idx, bool IsScaledReg = false);
2238 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
2239 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2240 const Formula &Base, size_t Idx,
2241 bool IsScaledReg = false);
2242 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2243 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
2244 const Formula &Base,
2245 const SmallVectorImpl<Immediate> &Worklist,
2246 size_t Idx, bool IsScaledReg = false);
2247 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
2248 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2249 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
2250 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
2251 void GenerateCrossUseConstantOffsets();
2252 void GenerateAllReuseFormulae();
2253
2254 void FilterOutUndesirableDedicatedRegisters();
2255
2256 size_t EstimateSearchSpaceComplexity() const;
2257 void NarrowSearchSpaceByDetectingSupersets();
2258 void NarrowSearchSpaceByCollapsingUnrolledCode();
2259 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
2260 void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
2261 void NarrowSearchSpaceByFilterPostInc();
2262 void NarrowSearchSpaceByDeletingCostlyFormulas();
2263 void NarrowSearchSpaceByPickingWinnerRegs();
2264 void NarrowSearchSpaceUsingHeuristics();
2265
2266 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
2267 Cost &SolutionCost,
2268 SmallVectorImpl<const Formula *> &Workspace,
2269 const Cost &CurCost,
2270 const SmallPtrSet<const SCEV *, 16> &CurRegs,
2271 DenseSet<const SCEV *> &VisitedRegs) const;
2272 void Solve(SmallVectorImpl<const Formula *> &Solution) const;
2273
2274 BasicBlock::iterator
2275 HoistInsertPosition(BasicBlock::iterator IP,
2276 const SmallVectorImpl<Instruction *> &Inputs) const;
2277 BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
2278 const LSRFixup &LF,
2279 const LSRUse &LU) const;
2280
2281 Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2282 BasicBlock::iterator IP,
2283 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2284 void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
2285 const Formula &F,
2286 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2287 void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
2288 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
2289 void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
2290
2291public:
2292 LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
2293 LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
2294 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
2295
2296 bool getChanged() const { return Changed; }
2297 const SmallVectorImpl<WeakVH> &getScalarEvolutionIVs() const {
2298 return ScalarEvolutionIVs;
2299 }
2300
2301 void print_factors_and_types(raw_ostream &OS) const;
2302 void print_fixups(raw_ostream &OS) const;
2303 void print_uses(raw_ostream &OS) const;
2304 void print(raw_ostream &OS) const;
2305 void dump() const;
2306};
2307
2308} // end anonymous namespace
2309
2310/// If IV is used in an int-to-float cast inside the loop then try to eliminate
2311/// the cast operation.
2312void LSRInstance::OptimizeShadowIV() {
2313 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2314 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2315 return;
2316
2317 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
2318 UI != E; /* empty */) {
2319 IVUsers::const_iterator CandidateUI = UI;
2320 ++UI;
2321 Instruction *ShadowUse = CandidateUI->getUser();
2322 Type *DestTy = nullptr;
2323 bool IsSigned = false;
2324
2325 /* If shadow use is an int->float cast then insert a second IV
2326 to eliminate this cast.
2327
2328 for (unsigned i = 0; i < n; ++i)
2329 foo((double)i);
2330
2331 is transformed into
2332
2333 double d = 0.0;
2334 for (unsigned i = 0; i < n; ++i, ++d)
2335 foo(d);
2336 */
2337 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
2338 IsSigned = false;
2339 DestTy = UCast->getDestTy();
2340 }
2341 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
2342 IsSigned = true;
2343 DestTy = SCast->getDestTy();
2344 }
2345 if (!DestTy) continue;
2346
2347 // If target does not support DestTy natively then do not apply
2348 // this transformation.
2349 if (!TTI.isTypeLegal(DestTy)) continue;
2350
2351 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
2352 if (!PH) continue;
2353 if (PH->getNumIncomingValues() != 2) continue;
2354
2355 // If the calculation in integers overflows, the result in FP type will
2356 // differ. So we can only do this transformation if we are guaranteed not to
2357 // encounter overflowing values.
2358 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
2359 if (!AR) continue;
2360 if (IsSigned && !AR->hasNoSignedWrap()) continue;
2361 if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
2362
2363 Type *SrcTy = PH->getType();
2364 int Mantissa = DestTy->getFPMantissaWidth();
2365 if (Mantissa == -1) continue;
2366 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
2367 continue;
2368
2369 unsigned Entry, Latch;
2370 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
2371 Entry = 0;
2372 Latch = 1;
2373 } else {
2374 Entry = 1;
2375 Latch = 0;
2376 }
2377
2378 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
2379 if (!Init) continue;
2380 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
2381 (double)Init->getSExtValue() :
2382 (double)Init->getZExtValue());
2383
2384 BinaryOperator *Incr =
2385 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
2386 if (!Incr) continue;
2387 if (Incr->getOpcode() != Instruction::Add
2388 && Incr->getOpcode() != Instruction::Sub)
2389 continue;
2390
2391 /* Initialize new IV, double d = 0.0 in above example. */
2392 ConstantInt *C = nullptr;
2393 if (Incr->getOperand(0) == PH)
2394 C = dyn_cast<ConstantInt>(Incr->getOperand(1));
2395 else if (Incr->getOperand(1) == PH)
2396 C = dyn_cast<ConstantInt>(Incr->getOperand(0));
2397 else
2398 continue;
2399
2400 if (!C) continue;
2401
2402 // Ignore negative constants, as the code below doesn't handle them
2403 // correctly. TODO: Remove this restriction.
2404 if (!C->getValue().isStrictlyPositive())
2405 continue;
2406
2407 /* Add new PHINode. */
2408 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH->getIterator());
2409 NewPH->setDebugLoc(PH->getDebugLoc());
2410
2411 /* create new increment. '++d' in above example. */
2412 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
2413 BinaryOperator *NewIncr = BinaryOperator::Create(
2414 Incr->getOpcode() == Instruction::Add ? Instruction::FAdd
2415 : Instruction::FSub,
2416 NewPH, CFP, "IV.S.next.", Incr->getIterator());
2417 NewIncr->setDebugLoc(Incr->getDebugLoc());
2418
2419 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
2420 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
2421
2422 /* Remove cast operation */
2423 ShadowUse->replaceAllUsesWith(NewPH);
2424 ShadowUse->eraseFromParent();
2425 Changed = true;
2426 break;
2427 }
2428}
2429
2430/// If Cond has an operand that is an expression of an IV, set the IV user and
2431/// stride information and return true, otherwise return false.
2432bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2433 for (IVStrideUse &U : IU)
2434 if (U.getUser() == Cond) {
2435 // NOTE: we could handle setcc instructions with multiple uses here, but
2436 // InstCombine does it as well for simple uses, it's not clear that it
2437 // occurs enough in real life to handle.
2438 CondUse = &U;
2439 return true;
2440 }
2441 return false;
2442}
2443
2444/// Rewrite the loop's terminating condition if it uses a max computation.
2445///
2446/// This is a narrow solution to a specific, but acute, problem. For loops
2447/// like this:
2448///
2449/// i = 0;
2450/// do {
2451/// p[i] = 0.0;
2452/// } while (++i < n);
2453///
2454/// the trip count isn't just 'n', because 'n' might not be positive. And
2455/// unfortunately this can come up even for loops where the user didn't use
2456/// a C do-while loop. For example, seemingly well-behaved top-test loops
2457/// will commonly be lowered like this:
2458///
2459/// if (n > 0) {
2460/// i = 0;
2461/// do {
2462/// p[i] = 0.0;
2463/// } while (++i < n);
2464/// }
2465///
2466/// and then it's possible for subsequent optimization to obscure the if
2467/// test in such a way that indvars can't find it.
2468///
2469/// When indvars can't find the if test in loops like this, it creates a
2470/// max expression, which allows it to give the loop a canonical
2471/// induction variable:
2472///
2473/// i = 0;
2474/// max = n < 1 ? 1 : n;
2475/// do {
2476/// p[i] = 0.0;
2477/// } while (++i != max);
2478///
2479/// Canonical induction variables are necessary because the loop passes
2480/// are designed around them. The most obvious example of this is the
2481/// LoopInfo analysis, which doesn't remember trip count values. It
2482/// expects to be able to rediscover the trip count each time it is
2483/// needed, and it does this using a simple analysis that only succeeds if
2484/// the loop has a canonical induction variable.
2485///
2486/// However, when it comes time to generate code, the maximum operation
2487/// can be quite costly, especially if it's inside of an outer loop.
2488///
2489/// This function solves this problem by detecting this type of loop and
2490/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
2491/// the instructions for the maximum computation.
2492ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2493 // Check that the loop matches the pattern we're looking for.
2494 if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
2495 Cond->getPredicate() != CmpInst::ICMP_NE)
2496 return Cond;
2497
2498 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
2499 if (!Sel || !Sel->hasOneUse()) return Cond;
2500
2501 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2502 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
2503 return Cond;
2504 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
2505
2506 // Add one to the backedge-taken count to get the trip count.
2507 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
2508 if (IterationCount != SE.getSCEV(Sel)) return Cond;
2509
2510 // Check for a max calculation that matches the pattern. There's no check
2511 // for ICMP_ULE here because the comparison would be with zero, which
2512 // isn't interesting.
2513 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
2514 const SCEVNAryExpr *Max = nullptr;
2515 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
2516 Pred = ICmpInst::ICMP_SLE;
2517 Max = S;
2518 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
2519 Pred = ICmpInst::ICMP_SLT;
2520 Max = S;
2521 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
2522 Pred = ICmpInst::ICMP_ULT;
2523 Max = U;
2524 } else {
2525 // No match; bail.
2526 return Cond;
2527 }
2528
2529 // To handle a max with more than two operands, this optimization would
2530 // require additional checking and setup.
2531 if (Max->getNumOperands() != 2)
2532 return Cond;
2533
2534 const SCEV *MaxLHS = Max->getOperand(0);
2535 const SCEV *MaxRHS = Max->getOperand(1);
2536
2537 // ScalarEvolution canonicalizes constants to the left. For < and >, look
2538 // for a comparison with 1. For <= and >=, a comparison with zero.
2539 if (!MaxLHS ||
2540 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
2541 return Cond;
2542
2543 // Check the relevant induction variable for conformance to
2544 // the pattern.
2545 const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
2546 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
2547 if (!AR || !AR->isAffine() ||
2548 AR->getStart() != One ||
2549 AR->getStepRecurrence(SE) != One)
2550 return Cond;
2551
2552 assert(AR->getLoop() == L &&
2553 "Loop condition operand is an addrec in a different loop!");
2554
2555 // Check the right operand of the select, and remember it, as it will
2556 // be used in the new comparison instruction.
2557 Value *NewRHS = nullptr;
2558 if (ICmpInst::isTrueWhenEqual(Pred)) {
2559 // Look for n+1, and grab n.
2560 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
2561 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2562 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2563 NewRHS = BO->getOperand(0);
2564 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
2565 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
2566 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
2567 NewRHS = BO->getOperand(0);
2568 if (!NewRHS)
2569 return Cond;
2570 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
2571 NewRHS = Sel->getOperand(1);
2572 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
2573 NewRHS = Sel->getOperand(2);
2574 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
2575 NewRHS = SU->getValue();
2576 else
2577 // Max doesn't match expected pattern.
2578 return Cond;
2579
2580 // Determine the new comparison opcode. It may be signed or unsigned,
2581 // and the original comparison may be either equality or inequality.
2582 if (Cond->getPredicate() == CmpInst::ICMP_EQ)
2583 Pred = CmpInst::getInversePredicate(Pred);
2584
2585 // Ok, everything looks ok to change the condition into an SLT or SGE and
2586 // delete the max calculation.
2587 ICmpInst *NewCond = new ICmpInst(Cond->getIterator(), Pred,
2588 Cond->getOperand(0), NewRHS, "scmp");
2589
2590 // Delete the max calculation instructions.
2591 NewCond->setDebugLoc(Cond->getDebugLoc());
2592 Cond->replaceAllUsesWith(NewCond);
2593 CondUse->setUser(NewCond);
2594 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
2595 Cond->eraseFromParent();
2596 Sel->eraseFromParent();
2597 if (Cmp->use_empty())
2598 Cmp->eraseFromParent();
2599 return NewCond;
2600}
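// Illustrative sketch (hypothetical IR, names invented): for a loop whose
// trip count was canonicalized through a max, e.g.
//   %max = select i1 %cmp, i64 1, i64 %n
//   ...
//   %exitcond = icmp ne i64 %i.next, %max
// the rewrite above produces
//   %scmp = icmp slt i64 %i.next, %n
// and deletes the select (and its feeding compare, once it has no other
// uses).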
2601
2602/// Change loop terminating condition to use the postinc iv when possible.
2603void
2604LSRInstance::OptimizeLoopTermCond() {
2605 SmallPtrSet<Instruction *, 4> PostIncs;
2606
2607 // We need a different set of heuristics for rotated and non-rotated loops.
2608 // If a loop is rotated then the latch is also the backedge, so inserting
2609 // post-inc expressions just before the latch is ideal. To reduce live ranges
2610 // it also makes sense to rewrite terminating conditions to use post-inc
2611 // expressions.
2612 //
2613 // If the loop is not rotated then the latch is not a backedge; the latch
2614 // check is done in the loop head. Adding post-inc expressions before the
2615 // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
2616 // in the loop body. In this case we do *not* want to use post-inc expressions
2617 // in the latch check, and we want to insert post-inc expressions before
2618 // the backedge.
2619 BasicBlock *LatchBlock = L->getLoopLatch();
2620 SmallVector<BasicBlock*, 8> ExitingBlocks;
2621 L->getExitingBlocks(ExitingBlocks);
2622 if (!llvm::is_contained(ExitingBlocks, LatchBlock)) {
2623 // The backedge doesn't exit the loop; treat this as a head-tested loop.
2624 IVIncInsertPos = LatchBlock->getTerminator();
2625 return;
2626 }
2627
2628 // Otherwise treat this as a rotated loop.
2629 for (BasicBlock *ExitingBlock : ExitingBlocks) {
2630 // Get the terminating condition for the loop if possible. If we
2631 // can, we want to change it to use a post-incremented version of its
2632 // induction variable, to allow coalescing the live ranges for the IV into
2633 // one register value.
2634
2635 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2636 if (!TermBr)
2637 continue;
2638 // FIXME: Overly conservative, termination condition could be an 'or' etc..
2639 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2640 continue;
2641
2642 // Search IVUsesByStride to find Cond's IVUse if there is one.
2643 IVStrideUse *CondUse = nullptr;
2644 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
2645 if (!FindIVUserForCond(Cond, CondUse))
2646 continue;
2647
2648 // If the trip count is computed in terms of a max (due to ScalarEvolution
2649 // being unable to find a sufficient guard, for example), change the loop
2650 // comparison to use SLT or ULT instead of NE.
2651 // One consequence of doing this now is that it disrupts the count-down
2652 // optimization. That's not always a bad thing though, because in such
2653 // cases it may still be worthwhile to avoid a max.
2654 Cond = OptimizeMax(Cond, CondUse);
2655
2656 // If this exiting block dominates the latch block, it may also use
2657 // the post-inc value if it won't be shared with other uses.
2658 // Check for dominance.
2659 if (!DT.dominates(ExitingBlock, LatchBlock))
2660 continue;
2661
2662 // Conservatively avoid trying to use the post-inc value in non-latch
2663 // exits if there may be pre-inc users in intervening blocks.
2664 if (LatchBlock != ExitingBlock)
2665 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
2666 // Test if the use is reachable from the exiting block. This dominator
2667 // query is a conservative approximation of reachability.
2668 if (&*UI != CondUse &&
2669 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
2670 // Conservatively assume there may be reuse if the quotient of their
2671 // strides could be a legal scale.
2672 const SCEV *A = IU.getStride(*CondUse, L);
2673 const SCEV *B = IU.getStride(*UI, L);
2674 if (!A || !B) continue;
2675 if (SE.getTypeSizeInBits(A->getType()) !=
2676 SE.getTypeSizeInBits(B->getType())) {
2677 if (SE.getTypeSizeInBits(A->getType()) >
2678 SE.getTypeSizeInBits(B->getType()))
2679 B = SE.getSignExtendExpr(B, A->getType());
2680 else
2681 A = SE.getSignExtendExpr(A, B->getType());
2682 }
2683 if (const SCEVConstant *D =
2684 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
2685 const ConstantInt *C = D->getValue();
2686 // Stride of one or negative one can have reuse with non-addresses.
2687 if (C->isOne() || C->isMinusOne())
2688 goto decline_post_inc;
2689 // Avoid weird situations.
2690 if (C->getValue().getSignificantBits() >= 64 ||
2691 C->getValue().isMinSignedValue())
2692 goto decline_post_inc;
2693 // Check for possible scaled-address reuse.
2694 if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
2695 MemAccessTy AccessTy = getAccessType(
2696 TTI, UI->getUser(), UI->getOperandValToReplace());
2697 int64_t Scale = C->getSExtValue();
2698 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2699 /*BaseOffset=*/0,
2700 /*HasBaseReg=*/true, Scale,
2701 AccessTy.AddrSpace))
2702 goto decline_post_inc;
2703 Scale = -Scale;
2704 if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
2705 /*BaseOffset=*/0,
2706 /*HasBaseReg=*/true, Scale,
2707 AccessTy.AddrSpace))
2708 goto decline_post_inc;
2709 }
2710 }
2711 }
2712
2713 LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
2714 << *Cond << '\n');
2715
2716 // It's possible for the setcc instruction to be anywhere in the loop, and
2717 // possible for it to have multiple users. If it is not immediately before
2718 // the exiting block branch, move it.
2719 if (Cond->getNextNonDebugInstruction() != TermBr) {
2720 if (Cond->hasOneUse()) {
2721 Cond->moveBefore(TermBr);
2722 } else {
2723 // Clone the terminating condition and insert into the loopend.
2724 ICmpInst *OldCond = Cond;
2725 Cond = cast<ICmpInst>(Cond->clone());
2726 Cond->setName(L->getHeader()->getName() + ".termcond");
2727 Cond->insertInto(ExitingBlock, TermBr->getIterator());
2728
2729 // Clone the IVUse, as the old use still exists!
2730 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
2731 TermBr->replaceUsesOfWith(OldCond, Cond);
2732 }
2733 }
2734
2735 // If we get to here, we know that we can transform the setcc instruction to
2736 // use the post-incremented version of the IV, allowing us to coalesce the
2737 // live ranges for the IV correctly.
2738 CondUse->transformToPostInc(L);
2739 Changed = true;
2740
2741 PostIncs.insert(Cond);
2742 decline_post_inc:;
2743 }
2744
2745 // Determine an insertion point for the loop induction variable increment. It
2746 // must dominate all the post-inc comparisons we just set up, and it must
2747 // dominate the loop latch edge.
2748 IVIncInsertPos = L->getLoopLatch()->getTerminator();
2749 for (Instruction *Inst : PostIncs)
2750 IVIncInsertPos = DT.findNearestCommonDominator(IVIncInsertPos, Inst);
2751}
2752
2753/// Determine if the given use can accommodate a fixup at the given offset and
2754/// other details. If so, update the use and return true.
2755bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset,
2756 bool HasBaseReg, LSRUse::KindType Kind,
2757 MemAccessTy AccessTy) {
2758 Immediate NewMinOffset = LU.MinOffset;
2759 Immediate NewMaxOffset = LU.MaxOffset;
2760 MemAccessTy NewAccessTy = AccessTy;
2761
2762 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
2763 // something conservative, however this can pessimize in the case that one of
2764 // the uses will have all its uses outside the loop, for example.
2765 if (LU.Kind != Kind)
2766 return false;
2767
2768 // Check for a mismatched access type, and fall back conservatively as needed.
2769 // TODO: Be less conservative when the type is similar and can use the same
2770 // addressing modes.
2771 if (Kind == LSRUse::Address) {
2772 if (AccessTy.MemTy != LU.AccessTy.MemTy) {
2773 NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
2774 AccessTy.AddrSpace);
2775 }
2776 }
2777
2778 // Conservatively assume HasBaseReg is true for now.
2779 if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) {
2780 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2781 LU.MaxOffset - NewOffset, HasBaseReg))
2782 return false;
2783 NewMinOffset = NewOffset;
2784 } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) {
2785 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
2786 NewOffset - LU.MinOffset, HasBaseReg))
2787 return false;
2788 NewMaxOffset = NewOffset;
2789 }
2790
2791 // FIXME: We should be able to handle some level of scalable offset support
2792 // for 'void', but in order to get basic support up and running this is
2793 // being left out.
2794 if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() &&
2795 (NewMinOffset.isScalable() || NewMaxOffset.isScalable()))
2796 return false;
2797
2798 // Update the use.
2799 LU.MinOffset = NewMinOffset;
2800 LU.MaxOffset = NewMaxOffset;
2801 LU.AccessTy = NewAccessTy;
2802 return true;
2803}
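// Illustrative example (hypothetical numbers): if this use currently spans
// offsets [0, 64] and a new fixup arrives at offset -8, the use is only
// widened to [-8, 64] when an immediate of 64 - (-8) = 72 is still considered
// foldable by the target; otherwise reconcileNewOffset returns false and the
// caller creates a separate use.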
2804
2805/// Return an LSRUse index and an offset value for a fixup which needs the given
2806/// expression, with the given kind and optional access type. Either reuse an
2807/// existing use or create a new one, as needed.
2808std::pair<size_t, Immediate> LSRInstance::getUse(const SCEV *&Expr,
2809 LSRUse::KindType Kind,
2810 MemAccessTy AccessTy) {
2811 const SCEV *Copy = Expr;
2812 Immediate Offset = ExtractImmediate(Expr, SE);
2813
2814 // Basic uses can't accept any offset, for example.
2815 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
2816 Offset, /*HasBaseReg=*/ true)) {
2817 Expr = Copy;
2818 Offset = Immediate::getFixed(0);
2819 }
2820
2821 std::pair<UseMapTy::iterator, bool> P =
2822 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
2823 if (!P.second) {
2824 // A use already existed with this base.
2825 size_t LUIdx = P.first->second;
2826 LSRUse &LU = Uses[LUIdx];
2827 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
2828 // Reuse this use.
2829 return std::make_pair(LUIdx, Offset);
2830 }
2831
2832 // Create a new use.
2833 size_t LUIdx = Uses.size();
2834 P.first->second = LUIdx;
2835 Uses.push_back(LSRUse(Kind, AccessTy));
2836 LSRUse &LU = Uses[LUIdx];
2837
2838 LU.MinOffset = Offset;
2839 LU.MaxOffset = Offset;
2840 return std::make_pair(LUIdx, Offset);
2841}
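// Illustrative example (hypothetical SCEVs): for an address expression
// (16 + %base), the immediate 16 is split off as the fixup's Offset and the
// use is keyed on (%base, Kind), so a later fixup at (24 + %base) can share
// the same LSRUse with Offset = 24, provided reconcileNewOffset accepts the
// widened offset range.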
2842
2843/// Delete the given use from the Uses list.
2844void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
2845 if (&LU != &Uses.back())
2846 std::swap(LU, Uses.back());
2847 Uses.pop_back();
2848
2849 // Update RegUses.
2850 RegUses.swapAndDropUse(LUIdx, Uses.size());
2851}
2852
2853/// Look for a use distinct from OrigLU which has a formula with the same
2854/// registers as the given formula.
2855LSRUse *
2856LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
2857 const LSRUse &OrigLU) {
2858 // Search all uses for the formula. This could be more clever.
2859 for (LSRUse &LU : Uses) {
2860 // Check whether this use is close enough to OrigLU, to see whether it's
2861 // worthwhile looking through its formulae.
2862 // Ignore ICmpZero uses because they may contain formulae generated by
2863 // GenerateICmpZeroScales, in which case adding fixup offsets may
2864 // be invalid.
2865 if (&LU != &OrigLU &&
2866 LU.Kind != LSRUse::ICmpZero &&
2867 LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
2868 LU.WidestFixupType == OrigLU.WidestFixupType &&
2869 LU.HasFormulaWithSameRegs(OrigF)) {
2870 // Scan through this use's formulae.
2871 for (const Formula &F : LU.Formulae) {
2872 // Check to see if this formula has the same registers and symbols
2873 // as OrigF.
2874 if (F.BaseRegs == OrigF.BaseRegs &&
2875 F.ScaledReg == OrigF.ScaledReg &&
2876 F.BaseGV == OrigF.BaseGV &&
2877 F.Scale == OrigF.Scale &&
2878 F.UnfoldedOffset == OrigF.UnfoldedOffset) {
2879 if (F.BaseOffset.isZero())
2880 return &LU;
2881 // This is the formula where all the registers and symbols matched;
2882 // there aren't going to be any others. Since we declined it, we
2883 // can skip the rest of the formulae and proceed to the next LSRUse.
2884 break;
2885 }
2886 }
2887 }
2888 }
2889
2890 // Nothing looked good.
2891 return nullptr;
2892}
2893
2894void LSRInstance::CollectInterestingTypesAndFactors() {
2895 SmallSetVector<const SCEV *, 4> Strides;
2896
2897 // Collect interesting types and strides.
2898 SmallVector<const SCEV *, 4> Worklist;
2899 for (const IVStrideUse &U : IU) {
2900 const SCEV *Expr = IU.getExpr(U);
2901 if (!Expr)
2902 continue;
2903
2904 // Collect interesting types.
2905 Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
2906
2907 // Add strides for mentioned loops.
2908 Worklist.push_back(Expr);
2909 do {
2910 const SCEV *S = Worklist.pop_back_val();
2911 if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
2912 if (AR->getLoop() == L)
2913 Strides.insert(AR->getStepRecurrence(SE));
2914 Worklist.push_back(AR->getStart());
2915 } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
2916 append_range(Worklist, Add->operands());
2917 }
2918 } while (!Worklist.empty());
2919 }
2920
2921 // Compute interesting factors from the set of interesting strides.
2922 for (SmallSetVector<const SCEV *, 4>::const_iterator
2923 I = Strides.begin(), E = Strides.end(); I != E; ++I)
2924 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
2925 std::next(I); NewStrideIter != E; ++NewStrideIter) {
2926 const SCEV *OldStride = *I;
2927 const SCEV *NewStride = *NewStrideIter;
2928
2929 if (SE.getTypeSizeInBits(OldStride->getType()) !=
2930 SE.getTypeSizeInBits(NewStride->getType())) {
2931 if (SE.getTypeSizeInBits(OldStride->getType()) >
2932 SE.getTypeSizeInBits(NewStride->getType()))
2933 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
2934 else
2935 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
2936 }
2937 if (const SCEVConstant *Factor =
2938 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
2939 SE, true))) {
2940 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2941 Factors.insert(Factor->getAPInt().getSExtValue());
2942 } else if (const SCEVConstant *Factor =
2943 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
2944 NewStride,
2945 SE, true))) {
2946 if (Factor->getAPInt().getSignificantBits() <= 64 && !Factor->isZero())
2947 Factors.insert(Factor->getAPInt().getSExtValue());
2948 }
2949 }
2950
2951 // If all uses use the same type, don't bother looking for truncation-based
2952 // reuse.
2953 if (Types.size() == 1)
2954 Types.clear();
2955
2956 LLVM_DEBUG(print_factors_and_types(dbgs()));
2957}
2958
2959/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
2960/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
2961/// IVStrideUses, we could partially skip this.
2962static User::op_iterator
2963 findIVOperand(User::op_iterator OI, User::op_iterator OE,
2964 Loop *L, ScalarEvolution &SE) {
2965 for(; OI != OE; ++OI) {
2966 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
2967 if (!SE.isSCEVable(Oper->getType()))
2968 continue;
2969
2970 if (const SCEVAddRecExpr *AR =
2971 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
2972 if (AR->getLoop() == L)
2973 break;
2974 }
2975 }
2976 }
2977 return OI;
2978}
2979
2980/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
2981/// a convenient helper.
2982 static Value *getWideOperand(Value *Oper) {
2983 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
2984 return Trunc->getOperand(0);
2985 return Oper;
2986}
2987
2988/// Return an approximation of this SCEV expression's "base", or NULL for any
2989/// constant. Returning the expression itself is conservative. Returning a
2990/// deeper subexpression is more precise and valid as long as it isn't less
2991/// complex than another subexpression. For expressions involving multiple
2992/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
2993/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
2994/// IVInc==b-a.
2995///
2996/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
2997/// SCEVUnknown, we simply return the rightmost SCEV operand.
2998static const SCEV *getExprBase(const SCEV *S) {
2999 switch (S->getSCEVType()) {
3000 default: // including scUnknown.
3001 return S;
3002 case scConstant:
3003 case scVScale:
3004 return nullptr;
3005 case scTruncate:
3006 return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
3007 case scZeroExtend:
3008 return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
3009 case scSignExtend:
3010 return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
3011 case scAddExpr: {
3012 // Skip over scaled operands (scMulExpr) to follow add operands as long as
3013 // there's nothing more complex.
3014 // FIXME: not sure if we want to recognize negation.
3015 const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
3016 for (const SCEV *SubExpr : reverse(Add->operands())) {
3017 if (SubExpr->getSCEVType() == scAddExpr)
3018 return getExprBase(SubExpr);
3019
3020 if (SubExpr->getSCEVType() != scMulExpr)
3021 return SubExpr;
3022 }
3023 return S; // all operands are scaled, be conservative.
3024 }
3025 case scAddRecExpr:
3026 return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
3027 }
3028 llvm_unreachable("Unknown SCEV kind!");
3029}
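// Illustrative example (hypothetical SCEVs): getExprBase((16 + %a)) is %a,
// since constants sort to the left and the rightmost operand is the
// SCEVUnknown; getExprBase({%a,+,8}<%L>) likewise resolves to %a via the
// addrec's start. Operands rooted at &a[i] and &b[i] therefore get different
// bases, which keeps chains from forming across distinct objects.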
3030
3031/// Return true if the chain increment is profitable to expand into a loop
3032/// invariant value, which may require its own register. A profitable chain
3033/// increment will be an offset relative to the same base. We allow such offsets
3034 /// to potentially be used as chain increments as long as they are not obviously
3035 /// expensive to expand using real instructions.
3036bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
3037 const SCEV *IncExpr,
3038 ScalarEvolution &SE) {
3039 // Aggressively form chains when -stress-ivchain.
3040 if (StressIVChain)
3041 return true;
3042
3043 // Do not replace a constant offset from IV head with a nonconstant IV
3044 // increment.
3045 if (!isa<SCEVConstant>(IncExpr)) {
3046 const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
3047 if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
3048 return false;
3049 }
3050
3051 SmallPtrSet<const SCEV*, 8> Processed;
3052 return !isHighCostExpansion(IncExpr, Processed, SE);
3053}
3054
3055/// Return true if the number of registers needed for the chain is estimated to
3056/// be less than the number required for the individual IV users. First prohibit
3057/// any IV users that keep the IV live across increments (the Users set should
3058/// be empty). Next count the number and type of increments in the chain.
3059///
3060/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
3061 /// effectively use postinc addressing modes. Only consider it profitable if the
3062/// increments can be computed in fewer registers when chained.
3063///
3064/// TODO: Consider IVInc free if it's already used in another chains.
3065static bool isProfitableChain(IVChain &Chain,
3066 SmallPtrSetImpl<Instruction*> &Users,
3067 ScalarEvolution &SE,
3068 const TargetTransformInfo &TTI) {
3069 if (StressIVChain)
3070 return true;
3071
3072 if (!Chain.hasIncs())
3073 return false;
3074
3075 if (!Users.empty()) {
3076 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
3077 for (Instruction *Inst
3078 : Users) { dbgs() << " " << *Inst << "\n"; });
3079 return false;
3080 }
3081 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3082
3083 // The chain itself may require a register, so initialize cost to 1.
3084 int cost = 1;
3085
3086 // A complete chain likely eliminates the need for keeping the original IV in
3087 // a register. LSR does not currently know how to form a complete chain unless
3088 // the header phi already exists.
3089 if (isa<PHINode>(Chain.tailUserInst())
3090 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
3091 --cost;
3092 }
3093 const SCEV *LastIncExpr = nullptr;
3094 unsigned NumConstIncrements = 0;
3095 unsigned NumVarIncrements = 0;
3096 unsigned NumReusedIncrements = 0;
3097
3098 if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
3099 return true;
3100
3101 for (const IVInc &Inc : Chain) {
3102 if (TTI.isProfitableLSRChainElement(Inc.UserInst))
3103 return true;
3104 if (Inc.IncExpr->isZero())
3105 continue;
3106
3107 // Incrementing by zero or some constant is neutral. We assume constants can
3108 // be folded into an addressing mode or an add's immediate operand.
3109 if (isa<SCEVConstant>(Inc.IncExpr)) {
3110 ++NumConstIncrements;
3111 continue;
3112 }
3113
3114 if (Inc.IncExpr == LastIncExpr)
3115 ++NumReusedIncrements;
3116 else
3117 ++NumVarIncrements;
3118
3119 LastIncExpr = Inc.IncExpr;
3120 }
3121 // An IV chain with a single increment is handled by LSR's postinc
3122 // uses. However, a chain with multiple increments requires keeping the IV's
3123 // value live longer than it needs to be if chained.
3124 if (NumConstIncrements > 1)
3125 --cost;
3126
3127 // Materializing increment expressions in the preheader that didn't exist in
3128 // the original code may cost a register. For example, sign-extended array
3129 // indices can produce ridiculous increments like this:
3130 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
3131 cost += NumVarIncrements;
3132
3133 // Reusing variable increments likely saves a register to hold the multiple of
3134 // the stride.
3135 cost -= NumReusedIncrements;
3136
3137 LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
3138 << "\n");
3139
3140 return cost < 0;
3141}
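// Illustrative example (hypothetical chain): a chain whose tail is the header
// phi (with SCEV equal to the head's IncExpr) and which contains two constant
// increments starts at cost = 1, drops to 0 for the phi, and to -1 because
// NumConstIncrements > 1; since -1 < 0 the chain is kept. A chain of distinct
// loop-variant increments instead gains +1 per increment and is usually
// rejected.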
3142
3143/// Add this IV user to an existing chain or make it the head of a new chain.
3144void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
3145 SmallVectorImpl<ChainUsers> &ChainUsersVec) {
3146 // When IVs are used as types of varying widths, they are generally converted
3147 // to a wider type with some uses remaining narrow under a (free) trunc.
3148 Value *const NextIV = getWideOperand(IVOper);
3149 const SCEV *const OperExpr = SE.getSCEV(NextIV);
3150 const SCEV *const OperExprBase = getExprBase(OperExpr);
3151
3152 // Visit all existing chains. Check if its IVOper can be computed as a
3153 // profitable loop invariant increment from the last link in the Chain.
3154 unsigned ChainIdx = 0, NChains = IVChainVec.size();
3155 const SCEV *LastIncExpr = nullptr;
3156 for (; ChainIdx < NChains; ++ChainIdx) {
3157 IVChain &Chain = IVChainVec[ChainIdx];
3158
3159 // Prune the solution space aggressively by checking that both IV operands
3160 // are expressions that operate on the same unscaled SCEVUnknown. This
3161 // "base" will be canceled by the subsequent getMinusSCEV call. Checking
3162 // first avoids creating extra SCEV expressions.
3163 if (!StressIVChain && Chain.ExprBase != OperExprBase)
3164 continue;
3165
3166 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
3167 if (PrevIV->getType() != NextIV->getType())
3168 continue;
3169
3170 // A phi node terminates a chain.
3171 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
3172 continue;
3173
3174 // The increment must be loop-invariant so it can be kept in a register.
3175 const SCEV *PrevExpr = SE.getSCEV(PrevIV);
3176 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
3177 if (isa<SCEVCouldNotCompute>(IncExpr) || !SE.isLoopInvariant(IncExpr, L))
3178 continue;
3179
3180 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
3181 LastIncExpr = IncExpr;
3182 break;
3183 }
3184 }
3185 // If we haven't found a chain, create a new one, unless we hit the max. Don't
3186 // bother for phi nodes, because they must be last in the chain.
3187 if (ChainIdx == NChains) {
3188 if (isa<PHINode>(UserInst))
3189 return;
3190 if (NChains >= MaxChains && !StressIVChain) {
3191 LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
3192 return;
3193 }
3194 LastIncExpr = OperExpr;
3195 // IVUsers may have skipped over sign/zero extensions. We don't currently
3196 // attempt to form chains involving extensions unless they can be hoisted
3197 // into this loop's AddRec.
3198 if (!isa<SCEVAddRecExpr>(LastIncExpr))
3199 return;
3200 ++NChains;
3201 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
3202 OperExprBase));
3203 ChainUsersVec.resize(NChains);
3204 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
3205 << ") IV=" << *LastIncExpr << "\n");
3206 } else {
3207 LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
3208 << ") IV+" << *LastIncExpr << "\n");
3209 // Add this IV user to the end of the chain.
3210 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
3211 }
3212 IVChain &Chain = IVChainVec[ChainIdx];
3213
3214 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
3215 // This chain's NearUsers become FarUsers.
3216 if (!LastIncExpr->isZero()) {
3217 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
3218 NearUsers.end());
3219 NearUsers.clear();
3220 }
3221
3222 // All other uses of IVOperand become near uses of the chain.
3223 // We currently ignore intermediate values within SCEV expressions, assuming
3224 // they will eventually be used by the current chain, or can be computed
3225 // from one of the chain increments. To be more precise we could
3226 // transitively follow its user and only add leaf IV users to the set.
3227 for (User *U : IVOper->users()) {
3228 Instruction *OtherUse = dyn_cast<Instruction>(U);
3229 if (!OtherUse)
3230 continue;
3231 // Uses in the chain will no longer be uses if the chain is formed.
3232 // Include the head of the chain in this iteration (not Chain.begin()).
3233 IVChain::const_iterator IncIter = Chain.Incs.begin();
3234 IVChain::const_iterator IncEnd = Chain.Incs.end();
3235 for (; IncIter != IncEnd; ++IncIter) {
3236 if (IncIter->UserInst == OtherUse)
3237 break;
3238 }
3239 if (IncIter != IncEnd)
3240 continue;
3241
3242 if (SE.isSCEVable(OtherUse->getType())
3243 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
3244 && IU.isIVUserOrOperand(OtherUse)) {
3245 continue;
3246 }
3247 NearUsers.insert(OtherUse);
3248 }
3249
3250 // Since this user is part of the chain, it's no longer considered a use
3251 // of the chain.
3252 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
3253}
3254
3255/// Populate the vector of Chains.
3256///
3257/// This decreases ILP at the architecture level. Targets with ample registers,
3258/// multiple memory ports, and no register renaming probably don't want
3259/// this. However, such targets should probably disable LSR altogether.
3260///
3261/// The job of LSR is to make a reasonable choice of induction variables across
3262/// the loop. Subsequent passes can easily "unchain" computation exposing more
3263/// ILP *within the loop* if the target wants it.
3264///
3265/// Finding the best IV chain is potentially a scheduling problem. Since LSR
3266/// will not reorder memory operations, it will recognize this as a chain, but
3267/// will generate redundant IV increments. Ideally this would be corrected later
3268/// by a smart scheduler:
3269/// = A[i]
3270/// = A[i+x]
3271/// A[i] =
3272/// A[i+x] =
3273///
3274/// TODO: Walk the entire domtree within this loop, not just the path to the
3275/// loop latch. This will discover chains on side paths, but requires
3276/// maintaining multiple copies of the Chains state.
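/// For instance (illustrative only), without chaining LSR may keep separate
/// induction variables for A[i] and A[i+x], each with its own increment; with a
/// chain, the A[i+x] accesses are materialized from the A[i] value plus the
/// loop-invariant x, so only one IV increment remains in the loop.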
3277void LSRInstance::CollectChains() {
3278 LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
3279 SmallVector<ChainUsers, 8> ChainUsersVec;
3280
3281 SmallVector<BasicBlock *, 8> LatchPath;
3282 BasicBlock *LoopHeader = L->getHeader();
3283 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
3284 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
3285 LatchPath.push_back(Rung->getBlock());
3286 }
3287 LatchPath.push_back(LoopHeader);
3288
3289 // Walk the instruction stream from the loop header to the loop latch.
3290 for (BasicBlock *BB : reverse(LatchPath)) {
3291 for (Instruction &I : *BB) {
3292 // Skip instructions that weren't seen by IVUsers analysis.
3293 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
3294 continue;
3295
3296 // Ignore users that are part of a SCEV expression. This way we only
3297 // consider leaf IV Users. This effectively rediscovers a portion of
3298 // IVUsers analysis but in program order this time.
3299 if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
3300 continue;
3301
3302 // Remove this instruction from any NearUsers set it may be in.
3303 for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
3304 ChainIdx < NChains; ++ChainIdx) {
3305 ChainUsersVec[ChainIdx].NearUsers.erase(&I);
3306 }
3307 // Search for operands that can be chained.
3308 SmallPtrSet<Instruction*, 4> UniqueOperands;
3309 User::op_iterator IVOpEnd = I.op_end();
3310 User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
3311 while (IVOpIter != IVOpEnd) {
3312 Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
3313 if (UniqueOperands.insert(IVOpInst).second)
3314 ChainInstruction(&I, IVOpInst, ChainUsersVec);
3315 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3316 }
3317 } // Continue walking down the instructions.
3318 } // Continue walking down the domtree.
3319 // Visit phi backedges to determine if the chain can generate the IV postinc.
3320 for (PHINode &PN : L->getHeader()->phis()) {
3321 if (!SE.isSCEVable(PN.getType()))
3322 continue;
3323
3324 Instruction *IncV =
3325 dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
3326 if (IncV)
3327 ChainInstruction(&PN, IncV, ChainUsersVec);
3328 }
3329 // Remove any unprofitable chains.
3330 unsigned ChainIdx = 0;
3331 for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
3332 UsersIdx < NChains; ++UsersIdx) {
3333 if (!isProfitableChain(IVChainVec[UsersIdx],
3334 ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
3335 continue;
3336 // Preserve the chain at UsesIdx.
3337 if (ChainIdx != UsersIdx)
3338 IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
3339 FinalizeChain(IVChainVec[ChainIdx]);
3340 ++ChainIdx;
3341 }
3342 IVChainVec.resize(ChainIdx);
3343}
3344
3345void LSRInstance::FinalizeChain(IVChain &Chain) {
3346 assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
3347 LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
3348
3349 for (const IVInc &Inc : Chain) {
3350 LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
3351 auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
3352 assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
3353 IVIncSet.insert(UseI);
3354 }
3355}
3356
3357/// Return true if the IVInc can be folded into an addressing mode.
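/// For example (illustrative), a constant increment of 16 on a load user may
/// fold into a reg+imm addressing mode, and an increment of (vscale * 4) may
/// fold as a scalable offset where the target supports one; other increment
/// shapes are rejected.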
3358static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
3359 Value *Operand, const TargetTransformInfo &TTI) {
3360 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
3361 Immediate IncOffset = Immediate::getZero();
3362 if (IncConst) {
3363 if (IncConst && IncConst->getAPInt().getSignificantBits() > 64)
3364 return false;
3365 IncOffset = Immediate::getFixed(IncConst->getValue()->getSExtValue());
3366 } else {
3367 // Look for mul(vscale, constant), to detect a scalable offset.
3368 auto *IncVScale = dyn_cast<SCEVMulExpr>(IncExpr);
3369 if (!IncVScale || IncVScale->getNumOperands() != 2 ||
3370 !isa<SCEVVScale>(IncVScale->getOperand(1)))
3371 return false;
3372 auto *Scale = dyn_cast<SCEVConstant>(IncVScale->getOperand(0));
3373 if (!Scale || Scale->getType()->getScalarSizeInBits() > 64)
3374 return false;
3375 IncOffset = Immediate::getScalable(Scale->getValue()->getSExtValue());
3376 }
3377
3378 if (!isAddressUse(TTI, UserInst, Operand))
3379 return false;
3380
3381 MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
3382 if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
3383 IncOffset, /*HasBaseReg=*/false))
3384 return false;
3385
3386 return true;
3387}
3388
3389/// Generate an add or subtract for each IVInc in a chain to materialize the IV
3390 /// user's operand from the previous IV user's operand.
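/// For example (illustrative), for address uses at p, p+4 and p+12 in one
/// chain, the second use is rewritten as the first value plus 4 and the third
/// as the second plus 8, rather than recomputing p+12 from the original IV;
/// when the leftover offset can be folded into the user's addressing mode, no
/// separate add is emitted for that link.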
3391 void LSRInstance::GenerateIVChain(const IVChain &Chain,
3392 SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
3393 // Find the new IVOperand for the head of the chain. It may have been replaced
3394 // by LSR.
3395 const IVInc &Head = Chain.Incs[0];
3396 User::op_iterator IVOpEnd = Head.UserInst->op_end();
3397 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
3398 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
3399 IVOpEnd, L, SE);
3400 Value *IVSrc = nullptr;
3401 while (IVOpIter != IVOpEnd) {
3402 IVSrc = getWideOperand(*IVOpIter);
3403
3404 // If this operand computes the expression that the chain needs, we may use
3405 // it. (Check this after setting IVSrc which is used below.)
3406 //
3407 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
3408 // narrow for the chain, so we can no longer use it. We do allow using a
3409 // wider phi, assuming the LSR checked for free truncation. In that case we
3410 // should already have a truncate on this operand such that
3411 // getSCEV(IVSrc) == IncExpr.
3412 if (SE.getSCEV(*IVOpIter) == Head.IncExpr
3413 || SE.getSCEV(IVSrc) == Head.IncExpr) {
3414 break;
3415 }
3416 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
3417 }
3418 if (IVOpIter == IVOpEnd) {
3419 // Gracefully give up on this chain.
3420 LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
3421 return;
3422 }
3423 assert(IVSrc && "Failed to find IV chain source");
3424
3425 LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
3426 Type *IVTy = IVSrc->getType();
3427 Type *IntTy = SE.getEffectiveSCEVType(IVTy);
3428 const SCEV *LeftOverExpr = nullptr;
3429 const SCEV *Accum = SE.getZero(IntTy);
3430 SmallVector<std::pair<const SCEV *, Value *>> Bases;
3431 Bases.emplace_back(Accum, IVSrc);
3432
3433 for (const IVInc &Inc : Chain) {
3434 Instruction *InsertPt = Inc.UserInst;
3435 if (isa<PHINode>(InsertPt))
3436 InsertPt = L->getLoopLatch()->getTerminator();
3437
3438 // IVOper will replace the current IV User's operand. IVSrc is the IV
3439 // value currently held in a register.
3440 Value *IVOper = IVSrc;
3441 if (!Inc.IncExpr->isZero()) {
3442 // IncExpr was the result of subtraction of two narrow values, so must
3443 // be signed.
3444 const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
3445 Accum = SE.getAddExpr(Accum, IncExpr);
3446 LeftOverExpr = LeftOverExpr ?
3447 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
3448 }
3449
3450 // Look through each base to see if any can produce a nice addressing mode.
3451 bool FoundBase = false;
3452 for (auto [MapScev, MapIVOper] : reverse(Bases)) {
3453 const SCEV *Remainder = SE.getMinusSCEV(Accum, MapScev);
3454 if (canFoldIVIncExpr(Remainder, Inc.UserInst, Inc.IVOperand, TTI)) {
3455 if (!Remainder->isZero()) {
3456 Rewriter.clearPostInc();
3457 Value *IncV = Rewriter.expandCodeFor(Remainder, IntTy, InsertPt);
3458 const SCEV *IVOperExpr =
3459 SE.getAddExpr(SE.getUnknown(MapIVOper), SE.getUnknown(IncV));
3460 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3461 } else {
3462 IVOper = MapIVOper;
3463 }
3464
3465 FoundBase = true;
3466 break;
3467 }
3468 }
3469 if (!FoundBase && LeftOverExpr && !LeftOverExpr->isZero()) {
3470 // Expand the IV increment.
3471 Rewriter.clearPostInc();
3472 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
3473 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
3474 SE.getUnknown(IncV));
3475 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
3476
3477 // If an IV increment can't be folded, use it as the next IV value.
3478 if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
3479 assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
3480 Bases.emplace_back(Accum, IVOper);
3481 IVSrc = IVOper;
3482 LeftOverExpr = nullptr;
3483 }
3484 }
3485 Type *OperTy = Inc.IVOperand->getType();
3486 if (IVTy != OperTy) {
3487 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
3488 "cannot extend a chained IV");
3489 IRBuilder<> Builder(InsertPt);
3490 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
3491 }
3492 Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
3493 if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
3494 DeadInsts.emplace_back(OperandIsInstr);
3495 }
3496 // If LSR created a new, wider phi, we may also replace its postinc. We only
3497 // do this if we also found a wide value for the head of the chain.
3498 if (isa<PHINode>(Chain.tailUserInst())) {
3499 for (PHINode &Phi : L->getHeader()->phis()) {
3500 if (Phi.getType() != IVSrc->getType())
3501 continue;
3502 Instruction *PostIncV = dyn_cast<Instruction>(
3503 Phi.getIncomingValueForBlock(L->getLoopLatch()));
3504 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
3505 continue;
3506 Value *IVOper = IVSrc;
3507 Type *PostIncTy = PostIncV->getType();
3508 if (IVTy != PostIncTy) {
3509 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
3510 IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
3511 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
3512 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
3513 }
3514 Phi.replaceUsesOfWith(PostIncV, IVOper);
3515 DeadInsts.emplace_back(PostIncV);
3516 }
3517 }
3518}
3519
3520void LSRInstance::CollectFixupsAndInitialFormulae() {
3521 BranchInst *ExitBranch = nullptr;
3522 bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
3523
3524 // For calculating baseline cost
3525 SmallPtrSet<const SCEV *, 16> Regs;
3526 DenseSet<const SCEV *> VisitedRegs;
3527 DenseSet<size_t> VisitedLSRUse;
3528
3529 for (const IVStrideUse &U : IU) {
3530 Instruction *UserInst = U.getUser();
3531 // Skip IV users that are part of profitable IV Chains.
3532 User::op_iterator UseI =
3533 find(UserInst->operands(), U.getOperandValToReplace());
3534 assert(UseI != UserInst->op_end() && "cannot find IV operand");
3535 if (IVIncSet.count(UseI)) {
3536 LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
3537 continue;
3538 }
3539
3540 LSRUse::KindType Kind = LSRUse::Basic;
3541 MemAccessTy AccessTy;
3542 if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
3543 Kind = LSRUse::Address;
3544 AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
3545 }
3546
3547 const SCEV *S = IU.getExpr(U);
3548 if (!S)
3549 continue;
3550 PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
3551
3552 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
3553 // (N - i == 0), and this allows (N - i) to be the expression that we work
3554 // with rather than just N or i, so we can consider the register
3555 // requirements for both N and i at the same time. Limiting this code to
3556 // equality icmps is not a problem because all interesting loops use
3557 // equality icmps, thanks to IndVarSimplify.
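// Sketch with hypothetical values: for "icmp eq i64 %i.next, %n" where the
// use's expression S is {1,+,1}<%L> and %n is loop-invariant, this sets
// Kind = ICmpZero and S = %n - {1,+,1}<%L> = {(-1 + %n),+,-1}<%L>, so the
// solver now reasons about a single expression compared against zero.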
3558 if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
3559 // If CI can be saved in some target, like replaced inside hardware loop
3560 // in PowerPC, no need to generate initial formulae for it.
3561 if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
3562 continue;
3563 if (CI->isEquality()) {
3564 // Swap the operands if needed to put the OperandValToReplace on the
3565 // left, for consistency.
3566 Value *NV = CI->getOperand(1);
3567 if (NV == U.getOperandValToReplace()) {
3568 CI->setOperand(1, CI->getOperand(0));
3569 CI->setOperand(0, NV);
3570 NV = CI->getOperand(1);
3571 Changed = true;
3572 }
3573
3574 // x == y --> x - y == 0
3575 const SCEV *N = SE.getSCEV(NV);
3576 if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
3577 (!NV->getType()->isPointerTy() ||
3578 SE.getPointerBase(N) == SE.getPointerBase(S))) {
3579 // S is normalized, so normalize N before folding it into S
3580 // to keep the result normalized.
3581 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3582 if (!N)
3583 continue;
3584 Kind = LSRUse::ICmpZero;
3585 S = SE.getMinusSCEV(N, S);
3586 } else if (L->isLoopInvariant(NV) &&
3587 (!isa<Instruction>(NV) ||
3588 DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
3589 !NV->getType()->isPointerTy()) {
3590 // If we can't generally expand the expression (e.g. it contains
3591 // a divide), but it is already at a loop invariant point before the
3592 // loop, wrap it in an unknown (to prevent the expander from trying
3593 // to re-expand in a potentially unsafe way.) The restriction to
3594 // integer types is required because the unknown hides the base, and
3595 // SCEV can't compute the difference of two unknown pointers.
3596 N = SE.getUnknown(NV);
3597 N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
3598 if (!N)
3599 continue;
3600 Kind = LSRUse::ICmpZero;
3601 S = SE.getMinusSCEV(N, S);
3602 assert(!isa<SCEVCouldNotCompute>(S));
3603 }
3604
3605 // -1 and the negations of all interesting strides (except the negation
3606 // of -1) are now also interesting.
3607 for (size_t i = 0, e = Factors.size(); i != e; ++i)
3608 if (Factors[i] != -1)
3609 Factors.insert(-(uint64_t)Factors[i]);
3610 Factors.insert(-1);
3611 }
3612 }
3613
3614 // Get or create an LSRUse.
3615 std::pair<size_t, Immediate> P = getUse(S, Kind, AccessTy);
3616 size_t LUIdx = P.first;
3617 Immediate Offset = P.second;
3618 LSRUse &LU = Uses[LUIdx];
3619
3620 // Record the fixup.
3621 LSRFixup &LF = LU.getNewFixup();
3622 LF.UserInst = UserInst;
3623 LF.OperandValToReplace = U.getOperandValToReplace();
3624 LF.PostIncLoops = TmpPostIncLoops;
3625 LF.Offset = Offset;
3626 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3627
3628 // Create SCEV as Formula for calculating baseline cost
3629 if (!VisitedLSRUse.count(LUIdx) && !LF.isUseFullyOutsideLoop(L)) {
3630 Formula F;
3631 F.initialMatch(S, L, SE);
3632 BaselineCost.RateFormula(F, Regs, VisitedRegs, LU);
3633 VisitedLSRUse.insert(LUIdx);
3634 }
3635
3636 if (!LU.WidestFixupType ||
3637 SE.getTypeSizeInBits(LU.WidestFixupType) <
3638 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3639 LU.WidestFixupType = LF.OperandValToReplace->getType();
3640
3641 // If this is the first use of this LSRUse, give it a formula.
3642 if (LU.Formulae.empty()) {
3643 InsertInitialFormula(S, LU, LUIdx);
3644 CountRegisters(LU.Formulae.back(), LUIdx);
3645 }
3646 }
3647
3648 LLVM_DEBUG(print_fixups(dbgs()));
3649}
3650
3651/// Insert a formula for the given expression into the given use, separating out
3652/// loop-variant portions from loop-invariant and loop-computable portions.
3653void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
3654 size_t LUIdx) {
3655 // Mark uses whose expressions cannot be expanded.
3656 if (!Rewriter.isSafeToExpand(S))
3657 LU.RigidFormula = true;
3658
3659 Formula F;
3660 F.initialMatch(S, L, SE);
3661 bool Inserted = InsertFormula(LU, LUIdx, F);
3662 assert(Inserted && "Initial formula already exists!"); (void)Inserted;
3663}
3664
3665/// Insert a simple single-register formula for the given expression into the
3666/// given use.
3667void
3668LSRInstance::InsertSupplementalFormula(const SCEV *S,
3669 LSRUse &LU, size_t LUIdx) {
3670 Formula F;
3671 F.BaseRegs.push_back(S);
3672 F.HasBaseReg = true;
3673 bool Inserted = InsertFormula(LU, LUIdx, F);
3674 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
3675}
3676
3677/// Note which registers are used by the given formula, updating RegUses.
3678void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
3679 if (F.ScaledReg)
3680 RegUses.countRegister(F.ScaledReg, LUIdx);
3681 for (const SCEV *BaseReg : F.BaseRegs)
3682 RegUses.countRegister(BaseReg, LUIdx);
3683}
3684
3685/// If the given formula has not yet been inserted, add it to the list, and
3686/// return true. Return false otherwise.
3687bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
3688 // Do not insert formula that we will not be able to expand.
3689 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
3690 "Formula is illegal");
3691
3692 if (!LU.InsertFormula(F, *L))
3693 return false;
3694
3695 CountRegisters(F, LUIdx);
3696 return true;
3697}
3698
3699/// Check for other uses of loop-invariant values which we're tracking. These
3700/// other uses will pin these values in registers, making them less profitable
3701/// for elimination.
3702/// TODO: This currently misses non-constant addrec step registers.
3703/// TODO: Should this give more weight to users inside the loop?
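/// For example (illustrative), if a loop-invariant value %n appears in the exit
/// compare and is also stored to memory after the loop, the outside store pins
/// %n in a register; recording such uses lets the cost model see that
/// eliminating %n from in-loop formulae saves less than it otherwise would.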
3704void
3705LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
3706 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
3707 SmallPtrSet<const SCEV *, 32> Visited;
3708
3709 // Don't collect outside uses if we are favoring postinc - the instructions in
3710 // the loop are more important than the ones outside of it.
3711 if (AMK == TTI::AMK_PostIndexed)
3712 return;
3713
3714 while (!Worklist.empty()) {
3715 const SCEV *S = Worklist.pop_back_val();
3716
3717 // Don't process the same SCEV twice
3718 if (!Visited.insert(S).second)
3719 continue;
3720
3721 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
3722 append_range(Worklist, N->operands());
3723 else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
3724 Worklist.push_back(C->getOperand());
3725 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
3726 Worklist.push_back(D->getLHS());
3727 Worklist.push_back(D->getRHS());
3728 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
3729 const Value *V = US->getValue();
3730 if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
3731 // Look for instructions defined outside the loop.
3732 if (L->contains(Inst)) continue;
3733 } else if (isa<Constant>(V))
3734 // Constants can be re-materialized.
3735 continue;
3736 for (const Use &U : V->uses()) {
3737 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
3738 // Ignore non-instructions.
3739 if (!UserInst)
3740 continue;
3741 // Don't bother if the instruction is an EHPad.
3742 if (UserInst->isEHPad())
3743 continue;
3744 // Ignore instructions in other functions (as can happen with
3745 // Constants).
3746 if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
3747 continue;
3748 // Ignore instructions not dominated by the loop.
3749 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
3750 UserInst->getParent() :
3751 cast<PHINode>(UserInst)->getIncomingBlock(
3752 PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3753 if (!DT.dominates(L->getHeader(), UseBB))
3754 continue;
3755 // Don't bother if the instruction is in a BB which ends in an EHPad.
3756 if (UseBB->getTerminator()->isEHPad())
3757 continue;
3758
3759 // Ignore cases in which the currently-examined value could come from
3760 // a basic block terminated with an EHPad. This checks all incoming
3761 // blocks of the phi node since it is possible that the same incoming
3762 // value comes from multiple basic blocks, only some of which may end
3763 // in an EHPad. If any of them do, a subsequent rewrite attempt by this
3764 // pass would try to insert instructions into an EHPad, hitting an
3765 // assertion.
3766 if (isa<PHINode>(UserInst)) {
3767 const auto *PhiNode = cast<PHINode>(UserInst);
3768 bool HasIncompatibleEHPTerminatedBlock = false;
3769 llvm::Value *ExpectedValue = U;
3770 for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
3771 if (PhiNode->getIncomingValue(I) == ExpectedValue) {
3772 if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
3773 HasIncompatibleEHPTerminatedBlock = true;
3774 break;
3775 }
3776 }
3777 }
3778 if (HasIncompatibleEHPTerminatedBlock) {
3779 continue;
3780 }
3781 }
3782
3783 // Don't bother rewriting PHIs in catchswitch blocks.
3784 if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
3785 continue;
3786 // Ignore uses which are part of other SCEV expressions, to avoid
3787 // analyzing them multiple times.
3788 if (SE.isSCEVable(UserInst->getType())) {
3789 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
3790 // If the user is a no-op, look through to its uses.
3791 if (!isa<SCEVUnknown>(UserS))
3792 continue;
3793 if (UserS == US) {
3794 Worklist.push_back(
3795 SE.getUnknown(const_cast<Instruction *>(UserInst)));
3796 continue;
3797 }
3798 }
3799 // Ignore icmp instructions which are already being analyzed.
3800 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
3801 unsigned OtherIdx = !U.getOperandNo();
3802 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
3803 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
3804 continue;
3805 }
3806
3807 std::pair<size_t, Immediate> P =
3808 getUse(S, LSRUse::Basic, MemAccessTy());
3809 size_t LUIdx = P.first;
3810 Immediate Offset = P.second;
3811 LSRUse &LU = Uses[LUIdx];
3812 LSRFixup &LF = LU.getNewFixup();
3813 LF.UserInst = const_cast<Instruction *>(UserInst);
3814 LF.OperandValToReplace = U;
3815 LF.Offset = Offset;
3816 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
3817 if (!LU.WidestFixupType ||
3818 SE.getTypeSizeInBits(LU.WidestFixupType) <
3819 SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
3820 LU.WidestFixupType = LF.OperandValToReplace->getType();
3821 InsertSupplementalFormula(US, LU, LUIdx);
3822 CountRegisters(LU.Formulae.back(), Uses.size() - 1);
3823 break;
3824 }
3825 }
3826 }
3827}
3828
3829/// Split S into subexpressions which can be pulled out into separate
3830/// registers. If C is non-null, multiply each subexpression by C.
3831///
3832/// Return remainder expression after factoring the subexpressions captured by
3833/// Ops. If Ops is complete, return NULL.
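/// For example (illustrative), with C null, S = ((4 + %a) + {0,+,1}<%L>) is
/// split into Ops = { 4, %a, {0,+,1}<%L> } and null is returned; if C were 2,
/// each captured piece would instead be pushed multiplied by 2.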
3834static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
3835 SmallVectorImpl<const SCEV *> &Ops,
3836 const Loop *L,
3837 ScalarEvolution &SE,
3838 unsigned Depth = 0) {
3839 // Arbitrarily cap recursion to protect compile time.
3840 if (Depth >= 3)
3841 return S;
3842
3843 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
3844 // Break out add operands.
3845 for (const SCEV *S : Add->operands()) {
3846 const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
3847 if (Remainder)
3848 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3849 }
3850 return nullptr;
3851 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
3852 // Split a non-zero base out of an addrec.
3853 if (AR->getStart()->isZero() || !AR->isAffine())
3854 return S;
3855
3856 const SCEV *Remainder = CollectSubexprs(AR->getStart(),
3857 C, Ops, L, SE, Depth+1);
3858 // Split the non-zero AddRec unless it is part of a nested recurrence that
3859 // does not pertain to this loop.
3860 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
3861 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
3862 Remainder = nullptr;
3863 }
3864 if (Remainder != AR->getStart()) {
3865 if (!Remainder)
3866 Remainder = SE.getConstant(AR->getType(), 0);
3867 return SE.getAddRecExpr(Remainder,
3868 AR->getStepRecurrence(SE),
3869 AR->getLoop(),
3870 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
3871 SCEV::FlagAnyWrap);
3872 }
3873 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
3874 // Break (C * (a + b + c)) into C*a + C*b + C*c.
3875 if (Mul->getNumOperands() != 2)
3876 return S;
3877 if (const SCEVConstant *Op0 =
3878 dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
3879 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
3880 const SCEV *Remainder =
3881 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
3882 if (Remainder)
3883 Ops.push_back(SE.getMulExpr(C, Remainder));
3884 return nullptr;
3885 }
3886 }
3887 return S;
3888}
3889
3890/// Return true if the SCEV represents a value that may end up as a
3891/// post-increment operation.
3892 static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
3893 LSRUse &LU, const SCEV *S, const Loop *L,
3894 ScalarEvolution &SE) {
3895 if (LU.Kind != LSRUse::Address ||
3896 !LU.AccessTy.getType()->isIntOrIntVectorTy())
3897 return false;
3898 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
3899 if (!AR)
3900 return false;
3901 const SCEV *LoopStep = AR->getStepRecurrence(SE);
3902 if (!isa<SCEVConstant>(LoopStep))
3903 return false;
3904 // Check if a post-indexed load/store can be used.
3905 if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
3906 TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
3907 const SCEV *LoopStart = AR->getStart();
3908 if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
3909 return true;
3910 }
3911 return false;
3912}
3913
3914/// Helper function for LSRInstance::GenerateReassociations.
3915void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
3916 const Formula &Base,
3917 unsigned Depth, size_t Idx,
3918 bool IsScaledReg) {
3919 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
3920 // Don't generate reassociations for the base register of a value that
3921 // may generate a post-increment operator. The reason is that the
3922 // reassociations cause extra base+register formula to be created,
3923 // and possibly chosen, but the post-increment is more efficient.
3924 if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
3925 return;
3926 SmallVector<const SCEV *, 8> AddOps;
3927 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
3928 if (Remainder)
3929 AddOps.push_back(Remainder);
3930
3931 if (AddOps.size() == 1)
3932 return;
3933
3934 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
3935 JE = AddOps.end();
3936 J != JE; ++J) {
3937 // Loop-variant "unknown" values are uninteresting; we won't be able to
3938 // do anything meaningful with them.
3939 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
3940 continue;
3941
3942 // Don't pull a constant into a register if the constant could be folded
3943 // into an immediate field.
3944 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3945 LU.AccessTy, *J, Base.getNumRegs() > 1))
3946 continue;
3947
3948 // Collect all operands except *J.
3949 SmallVector<const SCEV *, 8> InnerAddOps(
3950 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
3951 InnerAddOps.append(std::next(J),
3952 ((const SmallVector<const SCEV *, 8> &)AddOps).end());
3953
3954 // Don't leave just a constant behind in a register if the constant could
3955 // be folded into an immediate field.
3956 if (InnerAddOps.size() == 1 &&
3957 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
3958 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
3959 continue;
3960
3961 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
3962 if (InnerSum->isZero())
3963 continue;
3964 Formula F = Base;
3965
3966 if (F.UnfoldedOffset.isNonZero() && F.UnfoldedOffset.isScalable())
3967 continue;
3968
3969 // Add the remaining pieces of the add back into the new formula.
3970 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
3971 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
3972 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3973 InnerSumSC->getValue()->getZExtValue())) {
3974 F.UnfoldedOffset =
3975 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3976 InnerSumSC->getValue()->getZExtValue());
3977 if (IsScaledReg)
3978 F.ScaledReg = nullptr;
3979 else
3980 F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
3981 } else if (IsScaledReg)
3982 F.ScaledReg = InnerSum;
3983 else
3984 F.BaseRegs[Idx] = InnerSum;
3985
3986 // Add J as its own register, or an unfolded immediate.
3987 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
3988 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
3989 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() +
3990 SC->getValue()->getZExtValue()))
3991 F.UnfoldedOffset =
3992 Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
3993 SC->getValue()->getZExtValue());
3994 else
3995 F.BaseRegs.push_back(*J);
3996 // We may have changed the number of registers in the base regs; adjust the
3997 // formula accordingly.
3998 F.canonicalize(*L);
3999
4000 if (InsertFormula(LU, LUIdx, F))
4001 // If that formula hadn't been seen before, recurse to find more like
4002 // it.
4003 // Add a check on Log16(AddOps.size()), i.e. Log2_32(AddOps.size()) >> 2,
4004 // because Depth alone is not enough to bound compile time.
4005 // This means that every time AddOps.size() is greater than 16^x we will add
4006 // x to Depth.
4007 GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
4008 Depth + 1 + (Log2_32(AddOps.size()) >> 2));
4009 }
4010}
4011
4012/// Split out subexpressions from adds and the bases of addrecs.
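/// For example (illustrative), a single base register {(16 + %a),+,4}<%L> can
/// be re-expressed as the pair of registers {16,+,4}<%L> and %a (or, when the
/// target can fold it, with the 16 moved into an unfolded immediate), giving
/// the solver more register/immediate splits to evaluate.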
4013void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
4014 Formula Base, unsigned Depth) {
4015 assert(Base.isCanonical(*L) && "Input must be in the canonical form");
4016 // Arbitrarily cap recursion to protect compile time.
4017 if (Depth >= 3)
4018 return;
4019
4020 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4021 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
4022
4023 if (Base.Scale == 1)
4024 GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
4025 /* Idx */ -1, /* IsScaledReg */ true);
4026}
4027
4028/// Generate a formula consisting of all of the loop-dominating registers added
4029/// into a single register.
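/// For example (illustrative), a formula with registers {0,+,4}<%L>, %a and %b,
/// where %a and %b are defined outside the loop, can be rewritten to use
/// {0,+,4}<%L> and a single combined register (%a + %b), reducing in-loop
/// register pressure at the price of one add outside the loop.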
4030void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
4031 Formula Base) {
4032 // This method is only interesting on a plurality of registers.
4033 if (Base.BaseRegs.size() + (Base.Scale == 1) +
4034 (Base.UnfoldedOffset.isNonZero()) <=
4035 1)
4036 return;
4037
4038 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
4039 // processing the formula.
4040 Base.unscale();
4041 SmallVector<const SCEV *, 4> Ops;
4042 Formula NewBase = Base;
4043 NewBase.BaseRegs.clear();
4044 Type *CombinedIntegerType = nullptr;
4045 for (const SCEV *BaseReg : Base.BaseRegs) {
4046 if (SE.properlyDominates(BaseReg, L->getHeader()) &&
4047 !SE.hasComputableLoopEvolution(BaseReg, L)) {
4048 if (!CombinedIntegerType)
4049 CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
4050 Ops.push_back(BaseReg);
4051 }
4052 else
4053 NewBase.BaseRegs.push_back(BaseReg);
4054 }
4055
4056 // If no register is relevant, we're done.
4057 if (Ops.size() == 0)
4058 return;
4059
4060 // Utility function for generating the required variants of the combined
4061 // registers.
4062 auto GenerateFormula = [&](const SCEV *Sum) {
4063 Formula F = NewBase;
4064
4065 // TODO: If Sum is zero, it probably means ScalarEvolution missed an
4066 // opportunity to fold something. For now, just ignore such cases
4067 // rather than proceed with zero in a register.
4068 if (Sum->isZero())
4069 return;
4070
4071 F.BaseRegs.push_back(Sum);
4072 F.canonicalize(*L);
4073 (void)InsertFormula(LU, LUIdx, F);
4074 };
4075
4076 // If we collected at least two registers, generate a formula combining them.
4077 if (Ops.size() > 1) {
4078 SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
4079 GenerateFormula(SE.getAddExpr(OpsCopy));
4080 }
4081
4082 // If we have an unfolded offset, generate a formula combining it with the
4083 // registers collected.
4084 if (NewBase.UnfoldedOffset.isNonZero() && NewBase.UnfoldedOffset.isFixed()) {
4085 assert(CombinedIntegerType && "Missing a type for the unfolded offset");
4086 Ops.push_back(SE.getConstant(CombinedIntegerType,
4087 NewBase.UnfoldedOffset.getFixedValue(), true));
4088 NewBase.UnfoldedOffset = Immediate::getFixed(0);
4089 GenerateFormula(SE.getAddExpr(Ops));
4090 }
4091}
4092
4093/// Helper function for LSRInstance::GenerateSymbolicOffsets.
4094void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
4095 const Formula &Base, size_t Idx,
4096 bool IsScaledReg) {
4097 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4098 GlobalValue *GV = ExtractSymbol(G, SE);
4099 if (G->isZero() || !GV)
4100 return;
4101 Formula F = Base;
4102 F.BaseGV = GV;
4103 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4104 return;
4105 if (IsScaledReg)
4106 F.ScaledReg = G;
4107 else
4108 F.BaseRegs[Idx] = G;
4109 (void)InsertFormula(LU, LUIdx, F);
4110}
4111
4112/// Generate reuse formulae using symbolic offsets.
4113void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
4114 Formula Base) {
4115 // We can't add a symbolic offset if the address already contains one.
4116 if (Base.BaseGV) return;
4117
4118 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4119 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
4120 if (Base.Scale == 1)
4121 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
4122 /* IsScaledReg */ true);
4123}
4124
4125/// Helper function for LSRInstance::GenerateConstantOffsets.
4126void LSRInstance::GenerateConstantOffsetsImpl(
4127 LSRUse &LU, unsigned LUIdx, const Formula &Base,
4128 const SmallVectorImpl<Immediate> &Worklist, size_t Idx, bool IsScaledReg) {
4129
4130 auto GenerateOffset = [&](const SCEV *G, Immediate Offset) {
4131 Formula F = Base;
4132 if (!Base.BaseOffset.isCompatibleImmediate(Offset))
4133 return;
4134 F.BaseOffset = Base.BaseOffset.subUnsigned(Offset);
4135
4136 if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) {
4137 // Add the offset to the base register.
4138 const SCEV *NewOffset = Offset.getSCEV(SE, G->getType());
4139 const SCEV *NewG = SE.getAddExpr(NewOffset, G);
4140 // If it cancelled out, drop the base register, otherwise update it.
4141 if (NewG->isZero()) {
4142 if (IsScaledReg) {
4143 F.Scale = 0;
4144 F.ScaledReg = nullptr;
4145 } else
4146 F.deleteBaseReg(F.BaseRegs[Idx]);
4147 F.canonicalize(*L);
4148 } else if (IsScaledReg)
4149 F.ScaledReg = NewG;
4150 else
4151 F.BaseRegs[Idx] = NewG;
4152
4153 (void)InsertFormula(LU, LUIdx, F);
4154 }
4155 };
4156
4157 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
4158
4159 // With constant offsets and constant steps, we can generate pre-inc
4160 // accesses by having the offset equal the step. So, for access #0 with a
4161 // step of 8, we generate a G - 8 base which would require the first access
4162 // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
4163 // for itself and hopefully becomes the base for other accesses. This means
4164 // means that a single pre-indexed access can be generated to become the new
4165 // base pointer for each iteration of the loop, resulting in no extra add/sub
4166 // instructions for pointer updating.
4167 if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
4168 if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
4169 if (auto *StepRec =
4170 dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
4171 const APInt &StepInt = StepRec->getAPInt();
4172 int64_t Step = StepInt.isNegative() ?
4173 StepInt.getSExtValue() : StepInt.getZExtValue();
4174
4175 for (Immediate Offset : Worklist) {
4176 if (Offset.isFixed()) {
4177 Offset = Immediate::getFixed(Offset.getFixedValue() - Step);
4178 GenerateOffset(G, Offset);
4179 }
4180 }
4181 }
4182 }
4183 }
4184 for (Immediate Offset : Worklist)
4185 GenerateOffset(G, Offset);
4186
4187 Immediate Imm = ExtractImmediate(G, SE);
4188 if (G->isZero() || Imm.isZero() ||
4189 !Base.BaseOffset.isCompatibleImmediate(Imm))
4190 return;
4191 Formula F = Base;
4192 F.BaseOffset = F.BaseOffset.addUnsigned(Imm);
4193 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
4194 return;
4195 if (IsScaledReg) {
4196 F.ScaledReg = G;
4197 } else {
4198 F.BaseRegs[Idx] = G;
4199 // We may generate a non-canonical Formula if G is a recurrence register
4200 // of the current loop while F.ScaledReg is not.
4201 F.canonicalize(*L);
4202 }
4203 (void)InsertFormula(LU, LUIdx, F);
4204}
4205
4206 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
4207void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
4208 Formula Base) {
4209 // TODO: For now, just add the min and max offset, because it usually isn't
4210 // worthwhile looking at everything in between.
4211 SmallVector<Immediate, 2> Worklist;
4212 Worklist.push_back(LU.MinOffset);
4213 if (LU.MaxOffset != LU.MinOffset)
4214 Worklist.push_back(LU.MaxOffset);
4215
4216 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
4217 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
4218 if (Base.Scale == 1)
4219 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
4220 /* IsScaledReg */ true);
4221}
4222
4223/// For ICmpZero, check to see if we can scale up the comparison. For example, x
4224/// == y -> x*c == y*c.
4225void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
4226 Formula Base) {
4227 if (LU.Kind != LSRUse::ICmpZero) return;
4228
4229 // Determine the integer type for the base formula.
4230 Type *IntTy = Base.getType();
4231 if (!IntTy) return;
4232 if (SE.getTypeSizeInBits(IntTy) > 64) return;
4233
4234 // Don't do this if there is more than one offset.
4235 if (LU.MinOffset != LU.MaxOffset) return;
4236
4237 // Check if transformation is valid. It is illegal to multiply pointer.
4238 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4239 return;
4240 for (const SCEV *BaseReg : Base.BaseRegs)
4241 if (BaseReg->getType()->isPointerTy())
4242 return;
4243 assert(!Base.BaseGV && "ICmpZero use is not legal!");
4244
4245 // Check each interesting stride.
4246 for (int64_t Factor : Factors) {
4247 // Check that Factor can be represented by IntTy
4248 if (!ConstantInt::isValueValidForType(IntTy, Factor))
4249 continue;
4250 // Check that the multiplication doesn't overflow.
4251 if (Base.BaseOffset.isMin() && Factor == -1)
4252 continue;
4253 // Not supporting scalable immediates.
4254 if (Base.BaseOffset.isNonZero() && Base.BaseOffset.isScalable())
4255 continue;
4256 Immediate NewBaseOffset = Base.BaseOffset.mulUnsigned(Factor);
4257 assert(Factor != 0 && "Zero factor not expected!");
4258 if (NewBaseOffset.getFixedValue() / Factor !=
4259 Base.BaseOffset.getFixedValue())
4260 continue;
4261 // If the offset will be truncated at this use, check that it is in bounds.
4262 if (!IntTy->isPointerTy() &&
4263 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue()))
4264 continue;
4265
4266 // Check that multiplying with the use offset doesn't overflow.
4267 Immediate Offset = LU.MinOffset;
4268 if (Offset.isMin() && Factor == -1)
4269 continue;
4270 Offset = Offset.mulUnsigned(Factor);
4271 if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())
4272 continue;
4273 // If the offset will be truncated at this use, check that it is in bounds.
4274 if (!IntTy->isPointerTy() &&
4275 !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue()))
4276 continue;
4277
4278 Formula F = Base;
4279 F.BaseOffset = NewBaseOffset;
4280
4281 // Check that this scale is legal.
4282 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
4283 continue;
4284
4285 // Compensate for the use having MinOffset built into it.
4286 F.BaseOffset = F.BaseOffset.addUnsigned(Offset).subUnsigned(LU.MinOffset);
4287
4288 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4289
4290 // Check that multiplying with each base register doesn't overflow.
4291 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
4292 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
4293 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
4294 goto next;
4295 }
4296
4297 // Check that multiplying with the scaled register doesn't overflow.
4298 if (F.ScaledReg) {
4299 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
4300 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
4301 continue;
4302 }
4303
4304 // Check that multiplying with the unfolded offset doesn't overflow.
4305 if (F.UnfoldedOffset.isNonZero()) {
4306 if (F.UnfoldedOffset.isMin() && Factor == -1)
4307 continue;
4308 F.UnfoldedOffset = F.UnfoldedOffset.mulUnsigned(Factor);
4309 if (F.UnfoldedOffset.getFixedValue() / Factor !=
4310 Base.UnfoldedOffset.getFixedValue())
4311 continue;
4312 // If the offset will be truncated, check that it is in bounds.
4313 if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType(
4314 IntTy, F.UnfoldedOffset.getFixedValue()))
4315 continue;
4316 }
4317
4318 // If we make it here and it's legal, add it.
4319 (void)InsertFormula(LU, LUIdx, F);
4320 next:;
4321 }
4322}
4323
4324/// Generate stride factor reuse formulae by making use of scaled-offset address
4325/// modes, for example.
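/// For example (illustrative), with an interesting factor of 4, a base register
/// {0,+,4}<%L> can be divided through and re-expressed as 4 * {0,+,1}<%L>, so a
/// reg + 4*index addressing mode can absorb the scaling.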
4326void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
4327 // Determine the integer type for the base formula.
4328 Type *IntTy = Base.getType();
4329 if (!IntTy) return;
4330
4331 // If this Formula already has a scaled register, we can't add another one.
4332 // Try to unscale the formula to generate a better scale.
4333 if (Base.Scale != 0 && !Base.unscale())
4334 return;
4335
4336 assert(Base.Scale == 0 && "unscale did not do its job!");
4337
4338 // Check each interesting stride.
4339 for (int64_t Factor : Factors) {
4340 Base.Scale = Factor;
4341 Base.HasBaseReg = Base.BaseRegs.size() > 1;
4342 // Check whether this scale is going to be legal.
4343 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4344 Base)) {
4345 // As a special-case, handle special out-of-loop Basic users specially.
4346 // TODO: Reconsider this special case.
4347 if (LU.Kind == LSRUse::Basic &&
4348 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
4349 LU.AccessTy, Base) &&
4350 LU.AllFixupsOutsideLoop)
4351 LU.Kind = LSRUse::Special;
4352 else
4353 continue;
4354 }
4355 // For an ICmpZero, negating a solitary base register won't lead to
4356 // new solutions.
4357 if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg &&
4358 Base.BaseOffset.isZero() && !Base.BaseGV)
4359 continue;
4360 // For each addrec base reg, if its loop is current loop, apply the scale.
4361 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
4362 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
4363 if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
4364 const SCEV *FactorS = SE.getConstant(IntTy, Factor);
4365 if (FactorS->isZero())
4366 continue;
4367 // Divide out the factor, ignoring high bits, since we'll be
4368 // scaling the value back up in the end.
4369 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true))
4370 if (!Quotient->isZero()) {
4371 // TODO: This could be optimized to avoid all the copying.
4372 Formula F = Base;
4373 F.ScaledReg = Quotient;
4374 F.deleteBaseReg(F.BaseRegs[i]);
4375 // The canonical representation of 1*reg is reg, which is already in
4376 // Base. In that case, do not try to insert the formula, it will be
4377 // rejected anyway.
4378 if (F.Scale == 1 && (F.BaseRegs.empty() ||
4379 (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
4380 continue;
4381 // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
4382 // non-canonical Formula whose ScaledReg's loop is not L.
4383 if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
4384 F.canonicalize(*L);
4385 (void)InsertFormula(LU, LUIdx, F);
4386 }
4387 }
4388 }
4389 }
4390}
4391
4392/// Extend/Truncate \p Expr to \p ToTy considering post-inc uses in \p Loops.
4393/// For all PostIncLoopSets in \p Loops, first de-normalize \p Expr, then
4394/// perform the extension/truncate and normalize again, as the normalized form
4395/// can result in folds that are not valid in the post-inc use contexts. The
4396/// expressions for all PostIncLoopSets must match, otherwise return nullptr.
4397static const SCEV *
4398 getAnyExtendConsideringPostIncUses(ArrayRef<PostIncLoopSet> Loops,
4399 const SCEV *Expr, Type *ToTy,
4400 ScalarEvolution &SE) {
4401 const SCEV *Result = nullptr;
4402 for (auto &L : Loops) {
4403 auto *DenormExpr = denormalizeForPostIncUse(Expr, L, SE);
4404 const SCEV *NewDenormExpr = SE.getAnyExtendExpr(DenormExpr, ToTy);
4405 const SCEV *New = normalizeForPostIncUse(NewDenormExpr, L, SE);
4406 if (!New || (Result && New != Result))
4407 return nullptr;
4408 Result = New;
4409 }
4410
4411 assert(Result && "failed to create expression");
4412 return Result;
4413}
4414
4415/// Generate reuse formulae from different IV types.
4416void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
4417 // Don't bother truncating symbolic values.
4418 if (Base.BaseGV) return;
4419
4420 // Determine the integer type for the base formula.
4421 Type *DstTy = Base.getType();
4422 if (!DstTy) return;
4423 if (DstTy->isPointerTy())
4424 return;
4425
4426 // It is invalid to extend a pointer type so exit early if ScaledReg or
4427 // any of the BaseRegs are pointers.
4428 if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
4429 return;
4430 if (any_of(Base.BaseRegs,
4431 [](const SCEV *S) { return S->getType()->isPointerTy(); }))
4432 return;
4433
4434 SmallVector<PostIncLoopSet> Loops;
4435 for (auto &LF : LU.Fixups)
4436 Loops.push_back(LF.PostIncLoops);
4437
4438 for (Type *SrcTy : Types) {
4439 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
4440 Formula F = Base;
4441
4442 // Sometimes SCEV is able to prove zero during ext transform. It may
4443 // happen if SCEV did not do all possible transforms while creating the
4444 // initial node (maybe due to depth limitations), but it can do them while
4445 // taking ext.
4446 if (F.ScaledReg) {
4447 const SCEV *NewScaledReg =
4448 getAnyExtendConsideringPostIncUses(Loops, F.ScaledReg, SrcTy, SE);
4449 if (!NewScaledReg || NewScaledReg->isZero())
4450 continue;
4451 F.ScaledReg = NewScaledReg;
4452 }
4453 bool HasZeroBaseReg = false;
4454 for (const SCEV *&BaseReg : F.BaseRegs) {
4455 const SCEV *NewBaseReg =
4456 getAnyExtendConsideringPostIncUses(Loops, BaseReg, SrcTy, SE);
4457 if (!NewBaseReg || NewBaseReg->isZero()) {
4458 HasZeroBaseReg = true;
4459 break;
4460 }
4461 BaseReg = NewBaseReg;
4462 }
4463 if (HasZeroBaseReg)
4464 continue;
4465
4466 // TODO: This assumes we've done basic processing on all uses and
4467 // have an idea what the register usage is.
4468 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
4469 continue;
4470
4471 F.canonicalize(*L);
4472 (void)InsertFormula(LU, LUIdx, F);
4473 }
4474 }
4475}
4476
4477namespace {
4478
4479/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
4480/// modifications so that the search phase doesn't have to worry about the data
4481/// structures moving underneath it.
4482struct WorkItem {
4483 size_t LUIdx;
4484 Immediate Imm;
4485 const SCEV *OrigReg;
4486
4487 WorkItem(size_t LI, Immediate I, const SCEV *R)
4488 : LUIdx(LI), Imm(I), OrigReg(R) {}
4489
4490 void print(raw_ostream &OS) const;
4491 void dump() const;
4492};
4493
4494} // end anonymous namespace
4495
4496#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4497void WorkItem::print(raw_ostream &OS) const {
4498 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
4499 << " , add offset " << Imm;
4500}
4501
4502LLVM_DUMP_METHOD void WorkItem::dump() const {
4503 print(errs()); errs() << '\n';
4504}
4505#endif
4506
4507/// Look for registers which are a constant distance apart and try to form reuse
4508/// opportunities between them.
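/// For example (illustrative), if one use needs reg({0,+,4}<%L>) and another
/// needs reg({16,+,4}<%L>), the two registers differ only by the constant 16,
/// so the second use can be given a formula based on reg({0,+,4}<%L>) plus an
/// immediate offset of 16 and both uses can share one register.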
4509void LSRInstance::GenerateCrossUseConstantOffsets() {
4510 // Group the registers by their value without any added constant offset.
4511 using ImmMapTy = std::map<Immediate, const SCEV *, KeyOrderTargetImmediate>;
4512
4513 DenseMap<const SCEV *, ImmMapTy> Map;
4514 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
4515 SmallVector<const SCEV *, 8> Sequence;
4516 for (const SCEV *Use : RegUses) {
4517 const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
4518 Immediate Imm = ExtractImmediate(Reg, SE);
4519 auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
4520 if (Pair.second)
4521 Sequence.push_back(Reg);
4522 Pair.first->second.insert(std::make_pair(Imm, Use));
4523 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
4524 }
4525
4526 // Now examine each set of registers with the same base value. Build up
4527 // a list of work to do and do the work in a separate step so that we're
4528 // not adding formulae and register counts while we're searching.
4529 SmallVector<WorkItem, 32> WorkItems;
4530 SmallSet<std::pair<size_t, Immediate>, 32, KeyOrderSizeTAndImmediate>
4531 UniqueItems;
4532 for (const SCEV *Reg : Sequence) {
4533 const ImmMapTy &Imms = Map.find(Reg)->second;
4534
4535 // It's not worthwhile looking for reuse if there's only one offset.
4536 if (Imms.size() == 1)
4537 continue;
4538
4539 LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
4540 for (const auto &Entry
4541 : Imms) dbgs()
4542 << ' ' << Entry.first;
4543 dbgs() << '\n');
4544
4545 // Examine each offset.
4546 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
4547 J != JE; ++J) {
4548 const SCEV *OrigReg = J->second;
4549
4550 Immediate JImm = J->first;
4551 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
4552
4553 if (!isa<SCEVConstant>(OrigReg) &&
4554 UsedByIndicesMap[Reg].count() == 1) {
4555 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4556 << '\n');
4557 continue;
4558 }
4559
4560 // Conservatively examine offsets between this orig reg and a few selected
4561 // other orig regs.
4562 Immediate First = Imms.begin()->first;
4563 Immediate Last = std::prev(Imms.end())->first;
4564 if (!First.isCompatibleImmediate(Last)) {
4565 LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
4566 << "\n");
4567 continue;
4568 }
4569 // Only scalable if both terms are scalable, or if one is scalable and
4570 // the other is 0.
4571 bool Scalable = First.isScalable() || Last.isScalable();
4572 int64_t FI = First.getKnownMinValue();
4573 int64_t LI = Last.getKnownMinValue();
4574 // Compute (First + Last) / 2 without overflow using the fact that
4575 // First + Last = 2 * (First & Last) + (First ^ Last).
4576 int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1);
4577 // If the result is negative and FI is odd and LI even (or vice versa),
4578 // we rounded towards -inf. Add 1 in that case, to round towards 0.
4579 Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63));
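// Worked example (hypothetical values): FI = -7, LI = 2 gives (FI & LI) = 0
// and (FI ^ LI) = -5, so Avg = 0 + (-5 >> 1) = -3; since Avg is negative and
// the parities differ, the correction adds ((FI ^ LI) & 1) = 1, giving -2,
// i.e. (-7 + 2) / 2 rounded towards zero.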
4580 ImmMapTy::const_iterator OtherImms[] = {
4581 Imms.begin(), std::prev(Imms.end()),
4582 Imms.lower_bound(Immediate::get(Avg, Scalable))};
4583 for (const auto &M : OtherImms) {
4584 if (M == J || M == JE) continue;
4585 if (!JImm.isCompatibleImmediate(M->first))
4586 continue;
4587
4588 // Compute the difference between the two.
4589 Immediate Imm = JImm.subUnsigned(M->first);
4590 for (unsigned LUIdx : UsedByIndices.set_bits())
4591 // Make a memo of this use, offset, and register tuple.
4592 if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
4593 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
4594 }
4595 }
4596 }
4597
4598 Map.clear();
4599 Sequence.clear();
4600 UsedByIndicesMap.clear();
4601 UniqueItems.clear();
4602
4603 // Now iterate through the worklist and add new formulae.
4604 for (const WorkItem &WI : WorkItems) {
4605 size_t LUIdx = WI.LUIdx;
4606 LSRUse &LU = Uses[LUIdx];
4607 Immediate Imm = WI.Imm;
4608 const SCEV *OrigReg = WI.OrigReg;
4609
4610 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
4611 const SCEV *NegImmS = Imm.getNegativeSCEV(SE, IntTy);
4612 unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
4613
4614 // TODO: Use a more targeted data structure.
4615 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
4616 Formula F = LU.Formulae[L];
4617 // FIXME: The code for the scaled and unscaled registers looks
4618 // very similar but slightly different. Investigate if they
4619 // could be merged. That way, we would not have to unscale the
4620 // Formula.
4621 F.unscale();
4622 // Use the immediate in the scaled register.
4623 if (F.ScaledReg == OrigReg) {
4624 if (!F.BaseOffset.isCompatibleImmediate(Imm))
4625 continue;
4626 Immediate Offset = F.BaseOffset.addUnsigned(Imm.mulUnsigned(F.Scale));
4627 // Don't create 50 + reg(-50).
4628 const SCEV *S = Offset.getNegativeSCEV(SE, IntTy);
4629 if (F.referencesReg(S))
4630 continue;
4631 Formula NewF = F;
4632 NewF.BaseOffset = Offset;
4633 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
4634 NewF))
4635 continue;
4636 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
4637
4638 // If the new scale is a constant in a register, and adding the constant
4639 // value to the immediate would produce a value closer to zero than the
4640 // immediate itself, then the formula isn't worthwhile.
4641 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) {
4642 // FIXME: Do we need to do something for scalable immediates here?
4643 // A scalable SCEV won't be constant, but we might still have
4644 // something in the offset? Bail out for now to be safe.
4645 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4646 continue;
4647 if (C->getValue()->isNegative() !=
4648 (NewF.BaseOffset.isLessThanZero()) &&
4649 (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
4650 .ule(std::abs(NewF.BaseOffset.getFixedValue())))
4651 continue;
4652 }
4653
4654 // OK, looks good.
4655 NewF.canonicalize(*this->L);
4656 (void)InsertFormula(LU, LUIdx, NewF);
4657 } else {
4658 // Use the immediate in a base register.
4659 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
4660 const SCEV *BaseReg = F.BaseRegs[N];
4661 if (BaseReg != OrigReg)
4662 continue;
4663 Formula NewF = F;
4664 if (!NewF.BaseOffset.isCompatibleImmediate(Imm) ||
4665 !NewF.UnfoldedOffset.isCompatibleImmediate(Imm) ||
4666 !NewF.BaseOffset.isCompatibleImmediate(NewF.UnfoldedOffset))
4667 continue;
4668 NewF.BaseOffset = NewF.BaseOffset.addUnsigned(Imm);
4669 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
4670 LU.Kind, LU.AccessTy, NewF)) {
4671 if (AMK == TTI::AMK_PostIndexed &&
4672 mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
4673 continue;
4674 Immediate NewUnfoldedOffset = NewF.UnfoldedOffset.addUnsigned(Imm);
4675 if (!isLegalAddImmediate(TTI, NewUnfoldedOffset))
4676 continue;
4677 NewF = F;
4678 NewF.UnfoldedOffset = NewUnfoldedOffset;
4679 }
4680 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
4681
4682 // If the new formula has a constant in a register, and adding the
4683 // constant value to the immediate would produce a value closer to
4684 // zero than the immediate itself, then the formula isn't worthwhile.
4685 for (const SCEV *NewReg : NewF.BaseRegs)
4686 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg)) {
4687 if (NewF.BaseOffset.isNonZero() && NewF.BaseOffset.isScalable())
4688 goto skip_formula;
4689 if ((C->getAPInt() + NewF.BaseOffset.getFixedValue())
4690 .abs()
4691 .slt(std::abs(NewF.BaseOffset.getFixedValue())) &&
4692 (C->getAPInt() + NewF.BaseOffset.getFixedValue())
4693 .countr_zero() >=
4694 (unsigned)llvm::countr_zero<uint64_t>(
4695 NewF.BaseOffset.getFixedValue()))
4696 goto skip_formula;
4697 }
4698
4699 // Ok, looks good.
4700 NewF.canonicalize(*this->L);
4701 (void)InsertFormula(LU, LUIdx, NewF);
4702 break;
4703 skip_formula:;
4704 }
4705 }
4706 }
4707 }
4708}
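// Illustrative sketch (not from the original source; names hypothetical): if
// reg(%a) is referenced with constant offsets 0 and 8 by two different uses,
// the scan above records a WorkItem for the difference of 8, and the rewrite
// loop then adds formulae in which the register is replaced by the base
// shifted by that difference, with the 8 folded into the formula's immediate
// offset. If the target can fold that immediate into its addressing mode, the
// two uses can share a single register for the common base.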
4709
4710/// Generate formulae for each use.
4711void
4712LSRInstance::GenerateAllReuseFormulae() {
4713 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
4714 // queries are more precise.
4715 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4716 LSRUse &LU = Uses[LUIdx];
4717 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4718 GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
4719 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4720 GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
4721 }
4722 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4723 LSRUse &LU = Uses[LUIdx];
4724 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4725 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
4726 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4727 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
4728 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4729 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
4730 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4731 GenerateScales(LU, LUIdx, LU.Formulae[i]);
4732 }
4733 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4734 LSRUse &LU = Uses[LUIdx];
4735 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
4736 GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
4737 }
4738
4739 GenerateCrossUseConstantOffsets();
4740
4741 LLVM_DEBUG(dbgs() << "\n"
4742 "After generating reuse formulae:\n";
4743 print_uses(dbgs()));
4744}
4745
4746/// If there are multiple formulae with the same set of registers used
4747/// by other uses, pick the best one and delete the others.
4748void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
4749 DenseSet<const SCEV *> VisitedRegs;
4750 SmallPtrSet<const SCEV *, 16> Regs;
4751 SmallPtrSet<const SCEV *, 16> LoserRegs;
4752#ifndef NDEBUG
4753 bool ChangedFormulae = false;
4754#endif
4755
4756 // Collect the best formula for each unique set of shared registers. This
4757 // is reset for each use.
4758 using BestFormulaeTy =
4759 DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
4760
4761 BestFormulaeTy BestFormulae;
4762
4763 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4764 LSRUse &LU = Uses[LUIdx];
4765 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
4766 dbgs() << '\n');
4767
4768 bool Any = false;
4769 for (size_t FIdx = 0, NumForms = LU.Formulae.size();
4770 FIdx != NumForms; ++FIdx) {
4771 Formula &F = LU.Formulae[FIdx];
4772
4773 // Some formulas are instant losers. For example, they may depend on
4774 // nonexistent AddRecs from other loops. These need to be filtered
4775 // immediately, otherwise heuristics could choose them over others leading
4776 // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
4777 // avoids the need to recompute this information across formulae using the
4778 // same bad AddRec. Passing LoserRegs is also essential unless we remove
4779 // the corresponding bad register from the Regs set.
4780 Cost CostF(L, SE, TTI, AMK);
4781 Regs.clear();
4782 CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
4783 if (CostF.isLoser()) {
4784 // During initial formula generation, undesirable formulae are generated
4785 // by uses within other loops that have some non-trivial address mode or
4786 // use the postinc form of the IV. LSR needs to provide these formulae
4787 // as the basis of rediscovering the desired formula that uses an AddRec
4788 // corresponding to the existing phi. Once all formulae have been
4789 // generated, these initial losers may be pruned.
4790 LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
4791 dbgs() << "\n");
4792 }
4793 else {
4794 SmallVector<const SCEV *, 4> Key;
4795 for (const SCEV *Reg : F.BaseRegs) {
4796 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
4797 Key.push_back(Reg);
4798 }
4799 if (F.ScaledReg &&
4800 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
4801 Key.push_back(F.ScaledReg);
4802 // Unstable sort by host order ok, because this is only used for
4803 // uniquifying.
4804 llvm::sort(Key);
4805
4806 std::pair<BestFormulaeTy::const_iterator, bool> P =
4807 BestFormulae.insert(std::make_pair(Key, FIdx));
4808 if (P.second)
4809 continue;
4810
4811 Formula &Best = LU.Formulae[P.first->second];
4812
4813 Cost CostBest(L, SE, TTI, AMK);
4814 Regs.clear();
4815 CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
4816 if (CostF.isLess(CostBest))
4817 std::swap(F, Best);
4818 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
4819 dbgs() << "\n"
4820 " in favor of formula ";
4821 Best.print(dbgs()); dbgs() << '\n');
4822 }
4823#ifndef NDEBUG
4824 ChangedFormulae = true;
4825#endif
4826 LU.DeleteFormula(F);
4827 --FIdx;
4828 --NumForms;
4829 Any = true;
4830 }
4831
4832 // Now that we've filtered out some formulae, recompute the Regs set.
4833 if (Any)
4834 LU.RecomputeRegs(LUIdx, RegUses);
4835
4836 // Reset this to prepare for the next use.
4837 BestFormulae.clear();
4838 }
4839
4840 LLVM_DEBUG(if (ChangedFormulae) {
4841 dbgs() << "\n"
4842 "After filtering out undesirable candidates:\n";
4843 print_uses(dbgs());
4844 });
4845}
4846
4847/// Estimate the worst-case number of solutions the solver might have to
4848/// consider. It almost never considers this many solutions because it prunes the
4849/// search space, but the pruning isn't always sufficient.
4850size_t LSRInstance::EstimateSearchSpaceComplexity() const {
4851 size_t Power = 1;
4852 for (const LSRUse &LU : Uses) {
4853 size_t FSize = LU.Formulae.size();
4854 if (FSize >= ComplexityLimit) {
4855 Power = ComplexityLimit;
4856 break;
4857 }
4858 Power *= FSize;
4859 if (Power >= ComplexityLimit)
4860 break;
4861 }
4862 return Power;
4863}
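// Illustrative example (not part of the original source): with three uses
// holding 4, 5 and 6 formulae, the worst case is 4 * 5 * 6 = 120 candidate
// solutions. If any single use already has ComplexityLimit or more formulae,
// the estimate is pinned to ComplexityLimit, and the running product stops
// growing once it crosses the limit.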
4864
4865/// When one formula uses a superset of the registers of another formula, it
4866/// won't help reduce register pressure (though it may not necessarily hurt
4867/// register pressure); remove it to simplify the system.
4868void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
4869 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
4870 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
4871
4872 LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
4873 "which use a superset of registers used by other "
4874 "formulae.\n");
4875
4876 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4877 LSRUse &LU = Uses[LUIdx];
4878 bool Any = false;
4879 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
4880 Formula &F = LU.Formulae[i];
4881 if (F.BaseOffset.isNonZero() && F.BaseOffset.isScalable())
4882 continue;
4883 // Look for a formula with a constant or GV in a register. If the use
4884 // also has a formula with that same value in an immediate field,
4885 // delete the one that uses a register.
4886 for (SmallVectorImpl<const SCEV *>::const_iterator
4887 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
4888 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
4889 Formula NewF = F;
4890 //FIXME: Formulas should store bitwidth to do wrapping properly.
4891 // See PR41034.
4892 NewF.BaseOffset =
4893 Immediate::getFixed(NewF.BaseOffset.getFixedValue() +
4894 (uint64_t)C->getValue()->getSExtValue());
4895 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4896 (I - F.BaseRegs.begin()));
4897 if (LU.HasFormulaWithSameRegs(NewF)) {
4898 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4899 dbgs() << '\n');
4900 LU.DeleteFormula(F);
4901 --i;
4902 --e;
4903 Any = true;
4904 break;
4905 }
4906 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
4907 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
4908 if (!F.BaseGV) {
4909 Formula NewF = F;
4910 NewF.BaseGV = GV;
4911 NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
4912 (I - F.BaseRegs.begin()));
4913 if (LU.HasFormulaWithSameRegs(NewF)) {
4914 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
4915 dbgs() << '\n');
4916 LU.DeleteFormula(F);
4917 --i;
4918 --e;
4919 Any = true;
4920 break;
4921 }
4922 }
4923 }
4924 }
4925 }
4926 if (Any)
4927 LU.RecomputeRegs(LUIdx, RegUses);
4928 }
4929
4930 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4931 }
4932}
4933
4934/// When there are many registers for expressions like A, A+1, A+2, etc.,
4935/// allocate a single register for them.
4936void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
4937 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
4938 return;
4939
4940 LLVM_DEBUG(
4941 dbgs() << "The search space is too complex.\n"
4942 "Narrowing the search space by assuming that uses separated "
4943 "by a constant offset will use the same registers.\n");
4944
4945 // This is especially useful for unrolled loops.
4946
4947 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
4948 LSRUse &LU = Uses[LUIdx];
4949 for (const Formula &F : LU.Formulae) {
4950 if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
4951 continue;
4952
4953 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
4954 if (!LUThatHas)
4955 continue;
4956
4957 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
4958 LU.Kind, LU.AccessTy))
4959 continue;
4960
4961 LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
4962
4963 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
4964
4965 // Transfer the fixups of LU to LUThatHas.
4966 for (LSRFixup &Fixup : LU.Fixups) {
4967 Fixup.Offset += F.BaseOffset;
4968 LUThatHas->pushFixup(Fixup);
4969 LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
4970 }
4971
4972 // Delete formulae from the new use which are no longer legal.
4973 bool Any = false;
4974 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
4975 Formula &F = LUThatHas->Formulae[i];
4976 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
4977 LUThatHas->Kind, LUThatHas->AccessTy, F)) {
4978 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
4979 LUThatHas->DeleteFormula(F);
4980 --i;
4981 --e;
4982 Any = true;
4983 }
4984 }
4985
4986 if (Any)
4987 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
4988
4989 // Delete the old use.
4990 DeleteUse(LU, LUIdx);
4991 --LUIdx;
4992 --NumUses;
4993 break;
4994 }
4995 }
4996
4997 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
4998}
4999
5000/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
5001/// we've done more filtering, as it may be able to find more formulae to
5002/// eliminate.
5003void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
5004 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5005 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5006
5007 LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
5008 "undesirable dedicated registers.\n");
5009
5010 FilterOutUndesirableDedicatedRegisters();
5011
5012 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5013 }
5014}
5015
5016/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
5017/// pick the best one and delete the others.
5018/// This narrowing heuristic keeps as many formulae with distinct
5019/// Scale and ScaledReg pairs as possible while narrowing the search space.
5020/// The benefit is that a better solution is more likely to be found in a
5021/// set of formulae with varied Scale and ScaledReg combinations than in
5022/// a set where they are all the same. The winner-register-picking
5023/// heuristic tends to keep the formulae with the same Scale and
5024/// ScaledReg and filter out the others, and we want to avoid that if possible.
5025void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
5026 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5027 return;
5028
5029 LLVM_DEBUG(
5030 dbgs() << "The search space is too complex.\n"
5031 "Narrowing the search space by choosing the best Formula "
5032 "from the Formulae with the same Scale and ScaledReg.\n");
5033
5034 // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
5035 using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
5036
5037 BestFormulaeTy BestFormulae;
5038#ifndef NDEBUG
5039 bool ChangedFormulae = false;
5040#endif
5041 DenseSet<const SCEV *> VisitedRegs;
5042 SmallPtrSet<const SCEV *, 16> Regs;
5043
5044 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5045 LSRUse &LU = Uses[LUIdx];
5046 LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
5047 dbgs() << '\n');
5048
5049 // Return true if Formula FA is better than Formula FB.
5050 auto IsBetterThan = [&](Formula &FA, Formula &FB) {
5051 // First we will try to choose the Formula with fewer new registers.
5052 // For a register used by current Formula, the more the register is
5053 // shared among LSRUses, the less we increase the register number
5054 // counter of the formula.
5055 size_t FARegNum = 0;
5056 for (const SCEV *Reg : FA.BaseRegs) {
5057 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5058 FARegNum += (NumUses - UsedByIndices.count() + 1);
5059 }
5060 size_t FBRegNum = 0;
5061 for (const SCEV *Reg : FB.BaseRegs) {
5062 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
5063 FBRegNum += (NumUses - UsedByIndices.count() + 1);
5064 }
5065 if (FARegNum != FBRegNum)
5066 return FARegNum < FBRegNum;
5067
5068 // If the new register numbers are the same, choose the Formula with
5069 // less Cost.
5070 Cost CostFA(L, SE, TTI, AMK);
5071 Cost CostFB(L, SE, TTI, AMK);
5072 Regs.clear();
5073 CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
5074 Regs.clear();
5075 CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
5076 return CostFA.isLess(CostFB);
5077 };
5078
5079 bool Any = false;
5080 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5081 ++FIdx) {
5082 Formula &F = LU.Formulae[FIdx];
5083 if (!F.ScaledReg)
5084 continue;
5085 auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
5086 if (P.second)
5087 continue;
5088
5089 Formula &Best = LU.Formulae[P.first->second];
5090 if (IsBetterThan(F, Best))
5091 std::swap(F, Best);
5092 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5093 dbgs() << "\n"
5094 " in favor of formula ";
5095 Best.print(dbgs()); dbgs() << '\n');
5096#ifndef NDEBUG
5097 ChangedFormulae = true;
5098#endif
5099 LU.DeleteFormula(F);
5100 --FIdx;
5101 --NumForms;
5102 Any = true;
5103 }
5104 if (Any)
5105 LU.RecomputeRegs(LUIdx, RegUses);
5106
5107 // Reset this to prepare for the next use.
5108 BestFormulae.clear();
5109 }
5110
5111 LLVM_DEBUG(if (ChangedFormulae) {
5112 dbgs() << "\n"
5113 "After filtering out undesirable candidates:\n";
5114 print_uses(dbgs());
5115 });
5116}
5117
5118/// If we are over the complexity limit, filter the formulae of any
5119/// post-inc-preferring address uses down to those using the fewest registers.
5120void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
5121 if (AMK != TTI::AMK_PostIndexed)
5122 return;
5123 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5124 return;
5125
5126 LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
5127 "Narrowing the search space by choosing the lowest "
5128 "register Formula for PostInc Uses.\n");
5129
5130 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5131 LSRUse &LU = Uses[LUIdx];
5132
5133 if (LU.Kind != LSRUse::Address)
5134 continue;
5135 if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
5136 !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
5137 continue;
5138
5139 size_t MinRegs = std::numeric_limits<size_t>::max();
5140 for (const Formula &F : LU.Formulae)
5141 MinRegs = std::min(F.getNumRegs(), MinRegs);
5142
5143 bool Any = false;
5144 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
5145 ++FIdx) {
5146 Formula &F = LU.Formulae[FIdx];
5147 if (F.getNumRegs() > MinRegs) {
5148 LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
5149 dbgs() << "\n");
5150 LU.DeleteFormula(F);
5151 --FIdx;
5152 --NumForms;
5153 Any = true;
5154 }
5155 }
5156 if (Any)
5157 LU.RecomputeRegs(LUIdx, RegUses);
5158
5159 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5160 break;
5161 }
5162
5163 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5164}
5165
5166/// This function deletes formulas with a high expected register count.
5167/// Assuming we don't know the value of each formula (all inefficient
5168/// ones have already been deleted), compute the probability of not
5169/// being selected for each register.
5170/// For example,
5171/// Use1:
5172/// reg(a) + reg({0,+,1})
5173/// reg(a) + reg({-1,+,1}) + 1
5174/// reg({a,+,1})
5175/// Use2:
5176/// reg(b) + reg({0,+,1})
5177/// reg(b) + reg({-1,+,1}) + 1
5178/// reg({b,+,1})
5179/// Use3:
5180/// reg(c) + reg(b) + reg({0,+,1})
5181/// reg(c) + reg({b,+,1})
5182///
5183/// Probability of not selecting
5184/// Use1 Use2 Use3
5185/// reg(a) (1/3) * 1 * 1
5186/// reg(b) 1 * (1/3) * (1/2)
5187/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
5188/// reg({-1,+,1}) (2/3) * (2/3) * 1
5189/// reg({a,+,1}) (2/3) * 1 * 1
5190/// reg({b,+,1}) 1 * (2/3) * (2/3)
5191/// reg(c) 1 * 1 * 0
5192///
5193/// Now compute the expected register count for each formula.
5194/// Note that for each use we exclude that use's own not-selected probability.
5195/// For example, for Use1 the probability for reg(a) would be just 1 * 1
5196/// (excluding the probability 1/3 of not selecting for Use1).
5197/// Use1:
5198/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
5199/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
5200/// reg({a,+,1}) 1
5201/// Use2:
5202/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
5203/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
5204/// reg({b,+,1}) 2/3
5205/// Use3:
5206/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
5207/// reg(c) + reg({b,+,1}) 1 + 2/3
5208void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
5209 if (EstimateSearchSpaceComplexity() < ComplexityLimit)
5210 return;
5211 // Ok, we have too many formulae on our hands to conveniently handle.
5212 // Use a rough heuristic to thin out the list.
5213
5214 // Set of Regs which will be 100% used in the final solution.
5215 // Used in each formula of a solution (in example above this is reg(c)).
5216 // We can skip them in calculations.
5217 SmallPtrSet<const SCEV *, 4> UniqRegs;
5218 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5219
5220 // Map each register to the probability of it not being selected.
5221 DenseMap <const SCEV *, float> RegNumMap;
5222 for (const SCEV *Reg : RegUses) {
5223 if (UniqRegs.count(Reg))
5224 continue;
5225 float PNotSel = 1;
5226 for (const LSRUse &LU : Uses) {
5227 if (!LU.Regs.count(Reg))
5228 continue;
5229 float P = LU.getNotSelectedProbability(Reg);
5230 if (P != 0.0)
5231 PNotSel *= P;
5232 else
5233 UniqRegs.insert(Reg);
5234 }
5235 RegNumMap.insert(std::make_pair(Reg, PNotSel));
5236 }
5237
5238 LLVM_DEBUG(
5239 dbgs() << "Narrowing the search space by deleting costly formulas\n");
5240
5241 // Delete formulas whose expected register count is high.
5242 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5243 LSRUse &LU = Uses[LUIdx];
5244 // If nothing to delete - continue.
5245 if (LU.Formulae.size() < 2)
5246 continue;
5247 // This is a temporary solution to test performance. Float should be
5248 // replaced with a rounding-independent type (based on integers) to avoid
5249 // different results for different target builds.
5250 float FMinRegNum = LU.Formulae[0].getNumRegs();
5251 float FMinARegNum = LU.Formulae[0].getNumRegs();
5252 size_t MinIdx = 0;
5253 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5254 Formula &F = LU.Formulae[i];
5255 float FRegNum = 0;
5256 float FARegNum = 0;
5257 for (const SCEV *BaseReg : F.BaseRegs) {
5258 if (UniqRegs.count(BaseReg))
5259 continue;
5260 FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5261 if (isa<SCEVAddRecExpr>(BaseReg))
5262 FARegNum +=
5263 RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
5264 }
5265 if (const SCEV *ScaledReg = F.ScaledReg) {
5266 if (!UniqRegs.count(ScaledReg)) {
5267 FRegNum +=
5268 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5269 if (isa<SCEVAddRecExpr>(ScaledReg))
5270 FARegNum +=
5271 RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
5272 }
5273 }
5274 if (FMinRegNum > FRegNum ||
5275 (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
5276 FMinRegNum = FRegNum;
5277 FMinARegNum = FARegNum;
5278 MinIdx = i;
5279 }
5280 }
5281 LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
5282 dbgs() << " with min reg num " << FMinRegNum << '\n');
5283 if (MinIdx != 0)
5284 std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
5285 while (LU.Formulae.size() != 1) {
5286 LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
5287 dbgs() << '\n');
5288 LU.Formulae.pop_back();
5289 }
5290 LU.RecomputeRegs(LUIdx, RegUses);
5291 assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
5292 Formula &F = LU.Formulae[0];
5293 LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
5294 // When we choose the formula, the regs become unique.
5295 UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
5296 if (F.ScaledReg)
5297 UniqRegs.insert(F.ScaledReg);
5298 }
5299 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5300}
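// Rough restatement of the heuristic above (not from the original source):
// each register R in a formula contributes RegNumMap[R] divided by this use's
// own not-selected probability, i.e. the probability that no other use would
// have materialized R anyway. Summing these contributions approximates the
// number of extra registers the formula would add, and only the formula with
// the smallest such sum (ties broken by the addrec-only sum FMinARegNum) is
// kept for the use.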
5301
5302// Check if Best and Reg are SCEVs separated by a constant amount C, and if so
5303// whether the addressing offset +C would be legal where the negative offset -C
5304// is not.
5305static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI,
5306 ScalarEvolution &SE, const SCEV *Best,
5307 const SCEV *Reg,
5308 MemAccessTy AccessType) {
5309 if (Best->getType() != Reg->getType() ||
5310 (isa<SCEVAddRecExpr>(Best) && isa<SCEVAddRecExpr>(Reg) &&
5311 cast<SCEVAddRecExpr>(Best)->getLoop() !=
5312 cast<SCEVAddRecExpr>(Reg)->getLoop()))
5313 return false;
5314 const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Best, Reg));
5315 if (!Diff)
5316 return false;
5317
5318 return TTI.isLegalAddressingMode(
5319 AccessType.MemTy, /*BaseGV=*/nullptr,
5320 /*BaseOffset=*/Diff->getAPInt().getSExtValue(),
5321 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace) &&
5322 !TTI.isLegalAddressingMode(
5323 AccessType.MemTy, /*BaseGV=*/nullptr,
5324 /*BaseOffset=*/-Diff->getAPInt().getSExtValue(),
5325 /*HasBaseReg=*/true, /*Scale=*/0, AccessType.AddrSpace);
5326}
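// Illustrative example (not from the original source): suppose Best is
// {x+16,+,1} and Reg is {x,+,1}, so Diff = 16, and the target folds
// [reg + 16] but rejects [reg - 16]. The function returns true, and the
// winner-register heuristic below then prefers the {x,+,1} base, since the
// offset accesses can be reached from it with a legal positive offset.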
5327
5328/// Pick a register which seems likely to be profitable, and then in any use
5329/// which has any reference to that register, delete all formulae which do not
5330/// reference that register.
5331void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
5332 // With all other options exhausted, loop until the system is simple
5333 // enough to handle.
5334 SmallPtrSet<const SCEV *, 4> Taken;
5335 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
5336 // Ok, we have too many formulae on our hands to conveniently handle.
5337 // Use a rough heuristic to thin out the list.
5338 LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
5339
5340 // Pick the register which is used by the most LSRUses, which is likely
5341 // to be a good reuse register candidate.
5342 const SCEV *Best = nullptr;
5343 unsigned BestNum = 0;
5344 for (const SCEV *Reg : RegUses) {
5345 if (Taken.count(Reg))
5346 continue;
5347 if (!Best) {
5348 Best = Reg;
5349 BestNum = RegUses.getUsedByIndices(Reg).count();
5350 } else {
5351 unsigned Count = RegUses.getUsedByIndices(Reg).count();
5352 if (Count > BestNum) {
5353 Best = Reg;
5354 BestNum = Count;
5355 }
5356
5357 // If the scores are the same, but the Reg is simpler for the target
5358 // (for example {x,+,1} as opposed to {x+C,+,1}, where the target can
5359 // handle +C but not -C), opt for the simpler formula.
5360 if (Count == BestNum) {
5361 int LUIdx = RegUses.getUsedByIndices(Reg).find_first();
5362 if (LUIdx >= 0 && Uses[LUIdx].Kind == LSRUse::Address &&
5363 IsSimplerBaseSCEVForTarget(TTI, SE, Best, Reg,
5364 Uses[LUIdx].AccessTy)) {
5365 Best = Reg;
5366 BestNum = Count;
5367 }
5368 }
5369 }
5370 }
5371 assert(Best && "Failed to find best LSRUse candidate");
5372
5373 LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
5374 << " will yield profitable reuse.\n");
5375 Taken.insert(Best);
5376
5377 // In any use with formulae which references this register, delete formulae
5378 // which don't reference it.
5379 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
5380 LSRUse &LU = Uses[LUIdx];
5381 if (!LU.Regs.count(Best)) continue;
5382
5383 bool Any = false;
5384 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
5385 Formula &F = LU.Formulae[i];
5386 if (!F.referencesReg(Best)) {
5387 LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
5388 LU.DeleteFormula(F);
5389 --e;
5390 --i;
5391 Any = true;
5392 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
5393 continue;
5394 }
5395 }
5396
5397 if (Any)
5398 LU.RecomputeRegs(LUIdx, RegUses);
5399 }
5400
5401 LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
5402 }
5403}
5404
5405/// If there are an extraordinary number of formulae to choose from, use some
5406/// rough heuristics to prune down the number of formulae. This keeps the main
5407/// solver from taking an extraordinary amount of time in some worst-case
5408/// scenarios.
5409void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
5410 NarrowSearchSpaceByDetectingSupersets();
5411 NarrowSearchSpaceByCollapsingUnrolledCode();
5412 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
5413 if (FilterSameScaledReg)
5414 NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
5415 NarrowSearchSpaceByFilterPostInc();
5416 if (LSRExpNarrow)
5417 NarrowSearchSpaceByDeletingCostlyFormulas();
5418 else
5419 NarrowSearchSpaceByPickingWinnerRegs();
5420}
5421
5422/// This is the recursive solver.
5423void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
5424 Cost &SolutionCost,
5425 SmallVectorImpl<const Formula *> &Workspace,
5426 const Cost &CurCost,
5427 const SmallPtrSet<const SCEV *, 16> &CurRegs,
5428 DenseSet<const SCEV *> &VisitedRegs) const {
5429 // Some ideas:
5430 // - prune more:
5431 // - use more aggressive filtering
5432 // - sort the formula so that the most profitable solutions are found first
5433 // - sort the uses too
5434 // - search faster:
5435 // - don't compute a cost, and then compare. compare while computing a cost
5436 // and bail early.
5437 // - track register sets with SmallBitVector
5438
5439 const LSRUse &LU = Uses[Workspace.size()];
5440
5441 // If this use references any register that's already a part of the
5442 // in-progress solution, consider it a requirement that a formula must
5443 // reference that register in order to be considered. This prunes out
5444 // unprofitable searching.
5444 SmallSetVector<const SCEV *, 4> ReqRegs;
5446 for (const SCEV *S : CurRegs)
5447 if (LU.Regs.count(S))
5448 ReqRegs.insert(S);
5449
5450 SmallPtrSet<const SCEV *, 16> NewRegs;
5451 Cost NewCost(L, SE, TTI, AMK);
5452 for (const Formula &F : LU.Formulae) {
5453 // Ignore formulae which may not be ideal in terms of register reuse of
5454 // ReqRegs. The formula should use all required registers before
5455 // introducing new ones.
5456 // This can sometimes (notably when trying to favour postinc) lead to
5457 // sub-optimal decisions. In those cases it is best left to the cost
5458 // modelling to get right.
5459 if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
5460 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
5461 for (const SCEV *Reg : ReqRegs) {
5462 if ((F.ScaledReg && F.ScaledReg == Reg) ||
5463 is_contained(F.BaseRegs, Reg)) {
5464 --NumReqRegsToFind;
5465 if (NumReqRegsToFind == 0)
5466 break;
5467 }
5468 }
5469 if (NumReqRegsToFind != 0) {
5470 // If none of the formulae satisfied the required registers, then we could
5471 // clear ReqRegs and try again. Currently, we simply give up in this case.
5472 continue;
5473 }
5474 }
5475
5476 // Evaluate the cost of the current formula. If it's already worse than
5477 // the current best, prune the search at that point.
5478 NewCost = CurCost;
5479 NewRegs = CurRegs;
5480 NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
5481 if (NewCost.isLess(SolutionCost)) {
5482 Workspace.push_back(&F);
5483 if (Workspace.size() != Uses.size()) {
5484 SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
5485 NewRegs, VisitedRegs);
5486 if (F.getNumRegs() == 1 && Workspace.size() == 1)
5487 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
5488 } else {
5489 LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
5490 dbgs() << ".\nRegs:\n";
5491 for (const SCEV *S : NewRegs) dbgs()
5492 << "- " << *S << "\n";
5493 dbgs() << '\n');
5494
5495 SolutionCost = NewCost;
5496 Solution = Workspace;
5497 }
5498 Workspace.pop_back();
5499 }
5500 }
5501}
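// Note (not from the original source): this is a depth-first branch-and-bound
// over one formula choice per use. A formula is only explored further when the
// accumulated NewCost is still strictly less than the best complete solution
// found so far, so whole subtrees of the per-use choice tree are pruned as
// soon as a partial assignment stops being cheaper than SolutionCost.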
5502
5503/// Choose one formula from each use. Return the results in the given Solution
5504/// vector.
5505void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
5506 SmallVector<const Formula *, 8> Workspace;
5507 Cost SolutionCost(L, SE, TTI, AMK);
5508 SolutionCost.Lose();
5509 Cost CurCost(L, SE, TTI, AMK);
5510 SmallPtrSet<const SCEV *, 16> CurRegs;
5511 DenseSet<const SCEV *> VisitedRegs;
5512 Workspace.reserve(Uses.size());
5513
5514 // SolveRecurse does all the work.
5515 SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
5516 CurRegs, VisitedRegs);
5517 if (Solution.empty()) {
5518 LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
5519 return;
5520 }
5521
5522 // Ok, we've now made all our decisions.
5523 LLVM_DEBUG(dbgs() << "\n"
5524 "The chosen solution requires ";
5525 SolutionCost.print(dbgs()); dbgs() << ":\n";
5526 for (size_t i = 0, e = Uses.size(); i != e; ++i) {
5527 dbgs() << " ";
5528 Uses[i].print(dbgs());
5529 dbgs() << "\n"
5530 " ";
5531 Solution[i]->print(dbgs());
5532 dbgs() << '\n';
5533 });
5534
5535 assert(Solution.size() == Uses.size() && "Malformed solution!");
5536
5537 const bool EnableDropUnprofitableSolution = [&] {
5538 switch (AllowDropSolutionIfLessProfitable) {
5539 case cl::BOU_TRUE:
5540 return true;
5541 case cl::BOU_FALSE:
5542 return false;
5543 case cl::BOU_UNSET:
5544 return TTI.shouldDropLSRSolutionIfLessProfitable();
5545 }
5546 llvm_unreachable("Unhandled cl::boolOrDefault enum");
5547 }();
5548
5549 if (BaselineCost.isLess(SolutionCost)) {
5550 if (!EnableDropUnprofitableSolution)
5551 LLVM_DEBUG(
5552 dbgs() << "Baseline is more profitable than chosen solution, "
5553 "add option 'lsr-drop-solution' to drop LSR solution.\n");
5554 else {
5555 LLVM_DEBUG(dbgs() << "Baseline is more profitable than chosen "
5556 "solution, dropping LSR solution.\n";);
5557 Solution.clear();
5558 }
5559 }
5560}
5561
5562/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as far
5563/// as we can go while still being dominated by the input positions. This helps
5564/// canonicalize the insert position, which encourages sharing.
5565BasicBlock::iterator
5566LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
5567 const SmallVectorImpl<Instruction *> &Inputs)
5568 const {
5569 Instruction *Tentative = &*IP;
5570 while (true) {
5571 bool AllDominate = true;
5572 Instruction *BetterPos = nullptr;
5573 // Don't bother attempting to insert before a catchswitch; its basic block
5574 // cannot have other non-PHI instructions.
5575 if (isa<CatchSwitchInst>(Tentative))
5576 return IP;
5577
5578 for (Instruction *Inst : Inputs) {
5579 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
5580 AllDominate = false;
5581 break;
5582 }
5583 // Attempt to find an insert position in the middle of the block,
5584 // instead of at the end, so that it can be used for other expansions.
5585 if (Tentative->getParent() == Inst->getParent() &&
5586 (!BetterPos || !DT.dominates(Inst, BetterPos)))
5587 BetterPos = &*std::next(BasicBlock::iterator(Inst));
5588 }
5589 if (!AllDominate)
5590 break;
5591 if (BetterPos)
5592 IP = BetterPos->getIterator();
5593 else
5594 IP = Tentative->getIterator();
5595
5596 const Loop *IPLoop = LI.getLoopFor(IP->getParent());
5597 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
5598
5599 BasicBlock *IDom;
5600 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
5601 if (!Rung) return IP;
5602 Rung = Rung->getIDom();
5603 if (!Rung) return IP;
5604 IDom = Rung->getBlock();
5605
5606 // Don't climb into a loop though.
5607 const Loop *IDomLoop = LI.getLoopFor(IDom);
5608 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
5609 if (IDomDepth <= IPLoopDepth &&
5610 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
5611 break;
5612 }
5613
5614 Tentative = IDom->getTerminator();
5615 }
5616
5617 return IP;
5618}
5619
5620/// Determine an input position which will be dominated by the operands and
5621/// which will dominate the result.
5622BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
5623 BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
5624 // Collect some instructions which must be dominated by the
5625 // expanding replacement. These must be dominated by any operands that
5626 // will be required in the expansion.
5627 SmallVector<Instruction *, 4> Inputs;
5628 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
5629 Inputs.push_back(I);
5630 if (LU.Kind == LSRUse::ICmpZero)
5631 if (Instruction *I =
5632 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
5633 Inputs.push_back(I);
5634 if (LF.PostIncLoops.count(L)) {
5635 if (LF.isUseFullyOutsideLoop(L))
5636 Inputs.push_back(L->getLoopLatch()->getTerminator());
5637 else
5638 Inputs.push_back(IVIncInsertPos);
5639 }
5640 // The expansion must also be dominated by the increment positions of any
5641 // loops for which it is using post-inc mode.
5642 for (const Loop *PIL : LF.PostIncLoops) {
5643 if (PIL == L) continue;
5644
5645 // Be dominated by the loop exit.
5646 SmallVector<BasicBlock *, 4> ExitingBlocks;
5647 PIL->getExitingBlocks(ExitingBlocks);
5648 if (!ExitingBlocks.empty()) {
5649 BasicBlock *BB = ExitingBlocks[0];
5650 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
5651 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
5652 Inputs.push_back(BB->getTerminator());
5653 }
5654 }
5655
5656 assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
5657 && !isa<DbgInfoIntrinsic>(LowestIP) &&
5658 "Insertion point must be a normal instruction");
5659
5660 // Then, climb up the immediate dominator tree as far as we can go while
5661 // still being dominated by the input positions.
5662 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
5663
5664 // Don't insert instructions before PHI nodes.
5665 while (isa<PHINode>(IP)) ++IP;
5666
5667 // Ignore landingpad instructions.
5668 while (IP->isEHPad()) ++IP;
5669
5670 // Ignore debug intrinsics.
5671 while (isa<DbgInfoIntrinsic>(IP)) ++IP;
5672
5673 // Set IP below instructions recently inserted by SCEVExpander. This keeps the
5674 // IP consistent across expansions and allows the previously inserted
5675 // instructions to be reused by subsequent expansion.
5676 while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
5677 ++IP;
5678
5679 return IP;
5680}
5681
5682/// Emit instructions for the leading candidate expression for this LSRUse (this
5683/// is called "expanding").
5684Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
5685 const Formula &F, BasicBlock::iterator IP,
5686 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5687 if (LU.RigidFormula)
5688 return LF.OperandValToReplace;
5689
5690 // Determine an input position which will be dominated by the operands and
5691 // which will dominate the result.
5692 IP = AdjustInsertPositionForExpand(IP, LF, LU);
5693 Rewriter.setInsertPoint(&*IP);
5694
5695 // Inform the Rewriter if we have a post-increment use, so that it can
5696 // perform an advantageous expansion.
5697 Rewriter.setPostInc(LF.PostIncLoops);
5698
5699 // This is the type that the user actually needs.
5700 Type *OpTy = LF.OperandValToReplace->getType();
5701 // This will be the type that we'll initially expand to.
5702 Type *Ty = F.getType();
5703 if (!Ty)
5704 // No type known; just expand directly to the ultimate type.
5705 Ty = OpTy;
5706 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
5707 // Expand directly to the ultimate type if it's the right size.
5708 Ty = OpTy;
5709 // This is the type to do integer arithmetic in.
5710 Type *IntTy = SE.getEffectiveSCEVType(Ty);
5711
5712 // Build up a list of operands to add together to form the full base.
5713 SmallVector<const SCEV *, 8> Ops;
5714
5715 // Expand the BaseRegs portion.
5716 for (const SCEV *Reg : F.BaseRegs) {
5717 assert(!Reg->isZero() && "Zero allocated in a base register!");
5718
5719 // If we're expanding for a post-inc user, make the post-inc adjustment.
5720 Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
5721 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
5722 }
5723
5724 // Expand the ScaledReg portion.
5725 Value *ICmpScaledV = nullptr;
5726 if (F.Scale != 0) {
5727 const SCEV *ScaledS = F.ScaledReg;
5728
5729 // If we're expanding for a post-inc user, make the post-inc adjustment.
5730 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
5731 ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
5732
5733 if (LU.Kind == LSRUse::ICmpZero) {
5734 // Expand ScaleReg as if it was part of the base regs.
5735 if (F.Scale == 1)
5736 Ops.push_back(
5737 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
5738 else {
5739 // An interesting way of "folding" with an icmp is to use a negated
5740 // scale, which we'll implement by inserting it into the other operand
5741 // of the icmp.
5742 assert(F.Scale == -1 &&
5743 "The only scale supported by ICmpZero uses is -1!");
5744 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
5745 }
5746 } else {
5747 // Otherwise just expand the scaled register and an explicit scale,
5748 // which is expected to be matched as part of the address.
5749
5750 // Flush the operand list to suppress SCEVExpander hoisting address modes,
5751 // unless the addressing mode will not be folded.
5752 if (!Ops.empty() && LU.Kind == LSRUse::Address &&
5753 isAMCompletelyFolded(TTI, LU, F)) {
5754 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
5755 Ops.clear();
5756 Ops.push_back(SE.getUnknown(FullV));
5757 }
5758 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
5759 if (F.Scale != 1)
5760 ScaledS =
5761 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
5762 Ops.push_back(ScaledS);
5763 }
5764 }
5765
5766 // Expand the GV portion.
5767 if (F.BaseGV) {
5768 // Flush the operand list to suppress SCEVExpander hoisting.
5769 if (!Ops.empty()) {
5770 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), IntTy);
5771 Ops.clear();
5772 Ops.push_back(SE.getUnknown(FullV));
5773 }
5774 Ops.push_back(SE.getUnknown(F.BaseGV));
5775 }
5776
5777 // Flush the operand list to suppress SCEVExpander hoisting of both folded and
5778 // unfolded offsets. LSR assumes they both live next to their uses.
5779 if (!Ops.empty()) {
5780 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
5781 Ops.clear();
5782 Ops.push_back(SE.getUnknown(FullV));
5783 }
5784
5785 // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail
5786 // out at this point, or should we generate a SCEV adding together mixed
5787 // offsets?
5788 assert(F.BaseOffset.isCompatibleImmediate(LF.Offset) &&
5789 "Expanding mismatched offsets\n");
5790 // Expand the immediate portion.
5791 Immediate Offset = F.BaseOffset.addUnsigned(LF.Offset);
5792 if (Offset.isNonZero()) {
5793 if (LU.Kind == LSRUse::ICmpZero) {
5794 // The other interesting way of "folding" with an ICmpZero is to use a
5795 // negated immediate.
5796 if (!ICmpScaledV)
5797 ICmpScaledV =
5798 ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
5799 else {
5800 Ops.push_back(SE.getUnknown(ICmpScaledV));
5801 ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());
5802 }
5803 } else {
5804 // Just add the immediate values. These again are expected to be matched
5805 // as part of the address.
5806 Ops.push_back(Offset.getUnknownSCEV(SE, IntTy));
5807 }
5808 }
5809
5810 // Expand the unfolded offset portion.
5811 Immediate UnfoldedOffset = F.UnfoldedOffset;
5812 if (UnfoldedOffset.isNonZero()) {
5813 // Just add the immediate values.
5814 Ops.push_back(UnfoldedOffset.getUnknownSCEV(SE, IntTy));
5815 }
5816
5817 // Emit instructions summing all the operands.
5818 const SCEV *FullS = Ops.empty() ?
5819 SE.getConstant(IntTy, 0) :
5820 SE.getAddExpr(Ops);
5821 Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
5822
5823 // We're done expanding now, so reset the rewriter.
5824 Rewriter.clearPostInc();
5825
5826 // An ICmpZero Formula represents an ICmp which we're handling as a
5827 // comparison against zero. Now that we've expanded an expression for that
5828 // form, update the ICmp's other operand.
5829 if (LU.Kind == LSRUse::ICmpZero) {
5830 ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
5831 if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
5832 DeadInsts.emplace_back(OperandIsInstr);
5833 assert(!F.BaseGV && "ICmp does not support folding a global value and "
5834 "a scale at the same time!");
5835 if (F.Scale == -1) {
5836 if (ICmpScaledV->getType() != OpTy) {
5837 Instruction *Cast = CastInst::Create(
5838 CastInst::getCastOpcode(ICmpScaledV, false, OpTy, false),
5839 ICmpScaledV, OpTy, "tmp", CI->getIterator());
5840 ICmpScaledV = Cast;
5841 }
5842 CI->setOperand(1, ICmpScaledV);
5843 } else {
5844 // A scale of 1 means that the scale has been expanded as part of the
5845 // base regs.
5846 assert((F.Scale == 0 || F.Scale == 1) &&
5847 "ICmp does not support folding a global value and "
5848 "a scale at the same time!");
5849 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
5850 -(uint64_t)Offset.getFixedValue());
5851 if (C->getType() != OpTy) {
5852 C = ConstantFoldCastOperand(
5853 CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy,
5854 CI->getDataLayout());
5855 assert(C && "Cast of ConstantInt should have folded");
5856 }
5857
5858 CI->setOperand(1, C);
5859 }
5860 }
5861
5862 return FullV;
5863}
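// Illustrative walk-through (not from the original source; formula
// hypothetical): for an Address use whose chosen formula is
//   reg(%base) + 4*reg({0,+,1}<%loop>) + 16
// the code above expands %base, then the scaled addrec multiplied by 4
// (flushing the operand list first so SCEVExpander keeps the pieces next to
// the use when the addressing mode folds), and finally appends the constant
// 16, producing one value that Rewrite() substitutes for the original
// address operand.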
5864
5865/// Helper for Rewrite. PHI nodes are special because the use of their operands
5866/// effectively happens in their predecessor blocks, so the expression may need
5867/// to be expanded in multiple places.
5868void LSRInstance::RewriteForPHI(
5869 PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
5870 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
5871 DenseMap<BasicBlock *, Value *> Inserted;
5872
5873 // Inserting instructions in the loop and using them as PHI's input could
5874 // break LCSSA if the PHI's parent block is not a loop exit (i.e. the
5875 // corresponding incoming block is not loop exiting). So collect all such
5876 // instructions to form LCSSA for them later.
5877 SmallVector<Instruction *, 4> InsertedNonLCSSAInsts;
5878
5879 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
5880 if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
5881 bool needUpdateFixups = false;
5882 BasicBlock *BB = PN->getIncomingBlock(i);
5883
5884 // If this is a critical edge, split the edge so that we do not insert
5885 // the code on all predecessor/successor paths. We do this unless this
5886 // is the canonical backedge for this loop, which complicates post-inc
5887 // users.
5888 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
5889 !isa<IndirectBrInst>(BB->getTerminator()) &&
5890 !isa<CatchSwitchInst>(BB->getTerminator())) {
5891 BasicBlock *Parent = PN->getParent();
5892 Loop *PNLoop = LI.getLoopFor(Parent);
5893 if (!PNLoop || Parent != PNLoop->getHeader()) {
5894 // Split the critical edge.
5895 BasicBlock *NewBB = nullptr;
5896 if (!Parent->isLandingPad()) {
5897 NewBB =
5898 SplitCriticalEdge(BB, Parent,
5899 CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
5900 .setMergeIdenticalEdges()
5901 .setKeepOneInputPHIs());
5902 } else {
5903 SmallVector<BasicBlock *, 2> NewBBs;
5904 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
5905 SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DTU, &LI);
5906 NewBB = NewBBs[0];
5907 }
5908 // If NewBB==NULL, then SplitCriticalEdge refused to split because all
5909 // phi predecessors are identical. The simple thing to do is skip
5910 // splitting in this case rather than complicate the API.
5911 if (NewBB) {
5912 // If PN is outside of the loop and BB is in the loop, we want to
5913 // move the block to be immediately before the PHI block, not
5914 // immediately after BB.
5915 if (L->contains(BB) && !L->contains(PN))
5916 NewBB->moveBefore(PN->getParent());
5917
5918 // Splitting the edge can reduce the number of PHI entries we have.
5919 e = PN->getNumIncomingValues();
5920 BB = NewBB;
5921 i = PN->getBasicBlockIndex(BB);
5922
5923 needUpdateFixups = true;
5924 }
5925 }
5926 }
5927
5928 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
5929 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
5930 if (!Pair.second)
5931 PN->setIncomingValue(i, Pair.first->second);
5932 else {
5933 Value *FullV =
5934 Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
5935
5936 // If this is reuse-by-noop-cast, insert the noop cast.
5937 Type *OpTy = LF.OperandValToReplace->getType();
5938 if (FullV->getType() != OpTy)
5939 FullV = CastInst::Create(
5940 CastInst::getCastOpcode(FullV, false, OpTy, false), FullV,
5941 LF.OperandValToReplace->getType(), "tmp",
5942 BB->getTerminator()->getIterator());
5943
5944 // If the incoming block for this value is not in the loop, it means the
5945 // current PHI is not in a loop exit, so we must create a LCSSA PHI for
5946 // the inserted value.
5947 if (auto *I = dyn_cast<Instruction>(FullV))
5948 if (L->contains(I) && !L->contains(BB))
5949 InsertedNonLCSSAInsts.push_back(I);
5950
5951 PN->setIncomingValue(i, FullV);
5952 Pair.first->second = FullV;
5953 }
5954
5955 // If LSR splits a critical edge and the phi node has other pending
5956 // fixup operands, we need to update those pending fixups. Otherwise
5957 // formulae will not be implemented completely and some instructions
5958 // will not be eliminated.
5959 if (needUpdateFixups) {
5960 for (LSRUse &LU : Uses)
5961 for (LSRFixup &Fixup : LU.Fixups)
5962 // If fixup is supposed to rewrite some operand in the phi
5963 // that was just updated, it may be already moved to
5964 // another phi node. Such fixup requires update.
5965 if (Fixup.UserInst == PN) {
5966 // Check if the operand we try to replace still exists in the
5967 // original phi.
5968 bool foundInOriginalPHI = false;
5969 for (const auto &val : PN->incoming_values())
5970 if (val == Fixup.OperandValToReplace) {
5971 foundInOriginalPHI = true;
5972 break;
5973 }
5974
5975 // If the fixup operand was found in the original PHI, nothing to do.
5976 if (foundInOriginalPHI)
5977 continue;
5978
5979 // Otherwise it might have been moved to another PHI and require an update.
5980 // If the fixup operand is not found in any of the incoming blocks, that
5981 // means we have already rewritten it, and there is nothing to do.
5982 for (const auto &Block : PN->blocks())
5983 for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
5984 ++I) {
5985 PHINode *NewPN = cast<PHINode>(I);
5986 for (const auto &val : NewPN->incoming_values())
5987 if (val == Fixup.OperandValToReplace)
5988 Fixup.UserInst = NewPN;
5989 }
5990 }
5991 }
5992 }
5993
5994 formLCSSAForInstructions(InsertedNonLCSSAInsts, DT, LI, &SE);
5995}
5996
5997/// Emit instructions for the leading candidate expression for this LSRUse (this
5998/// is called "expanding"), and update the UserInst to reference the newly
5999/// expanded value.
6000void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
6001 const Formula &F,
6002 SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
6003 // First, find an insertion point that dominates UserInst. For PHI nodes,
6004 // find the nearest block which dominates all the relevant uses.
6005 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
6006 RewriteForPHI(PN, LU, LF, F, DeadInsts);
6007 } else {
6008 Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
6009
6010 // If this is reuse-by-noop-cast, insert the noop cast.
6011 Type *OpTy = LF.OperandValToReplace->getType();
6012 if (FullV->getType() != OpTy) {
6013 Instruction *Cast =
6014 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
6015 FullV, OpTy, "tmp", LF.UserInst->getIterator());
6016 FullV = Cast;
6017 }
6018
6019 // Update the user. ICmpZero is handled specially here (for now) because
6020 // Expand may have updated one of the operands of the icmp already, and
6021 // its new value may happen to be equal to LF.OperandValToReplace, in
6022 // which case doing replaceUsesOfWith leads to replacing both operands
6023 // with the same value. TODO: Reorganize this.
6024 if (LU.Kind == LSRUse::ICmpZero)
6025 LF.UserInst->setOperand(0, FullV);
6026 else
6027 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
6028 }
6029
6030 if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
6031 DeadInsts.emplace_back(OperandIsInstr);
6032}
6033
6034// Try to hoist the IVInc to the loop header if all IVInc users are in
6035// the loop header. This helps the backend generate a post-indexed load/store
6036// when the latch block is different from the loop header block.
6037static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
6038 const LSRUse &LU, Instruction *IVIncInsertPos,
6039 Loop *L) {
6040 if (LU.Kind != LSRUse::Address)
6041 return false;
6042
6043 // For now this code does the conservative optimization and only works for
6044 // the header block. Later we can hoist the IVInc to a block that
6045 // post-dominates all users.
6046 BasicBlock *LHeader = L->getHeader();
6047 if (IVIncInsertPos->getParent() == LHeader)
6048 return false;
6049
6050 if (!Fixup.OperandValToReplace ||
6051 any_of(Fixup.OperandValToReplace->users(), [&LHeader](User *U) {
6052 Instruction *UI = cast<Instruction>(U);
6053 return UI->getParent() != LHeader;
6054 }))
6055 return false;
6056
6057 Instruction *I = Fixup.UserInst;
6058 Type *Ty = I->getType();
6059 return Ty->isIntegerTy() &&
6060 ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
6061 (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
6062}
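// Illustrative example (not from the original source; IR names hypothetical):
//   header:  %p = phi ptr [ %start, %preheader ], [ %p.next, %latch ]
//            %v = load i32, ptr %p
//   latch:   %p.next = getelementptr i8, ptr %p, i64 4
// If every user of %p lives in the header, hoisting the increment next to the
// load lets a target with post-indexed addressing (e.g. AArch64's
// "ldr w0, [x1], #4") fold the +4 into the load, which is what the checks
// above are guarding for.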
6063
6064/// Rewrite all the fixup locations with new values, following the chosen
6065/// solution.
6066void LSRInstance::ImplementSolution(
6067 const SmallVectorImpl<const Formula *> &Solution) {
6068 // Keep track of instructions we may have made dead, so that
6069 // we can remove them after we are done working.
6070 SmallVector<WeakTrackingVH, 16> DeadInsts;
6071
6072 // Mark phi nodes that terminate chains so the expander tries to reuse them.
6073 for (const IVChain &Chain : IVChainVec) {
6074 if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
6075 Rewriter.setChainedPhi(PN);
6076 }
6077
6078 // Expand the new value definitions and update the users.
6079 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
6080 for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
6081 Instruction *InsertPos =
6082 canHoistIVInc(TTI, Fixup, Uses[LUIdx], IVIncInsertPos, L)
6083 ? L->getHeader()->getTerminator()
6084 : IVIncInsertPos;
6085 Rewriter.setIVIncInsertPos(L, InsertPos);
6086 Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
6087 Changed = true;
6088 }
6089
6090 for (const IVChain &Chain : IVChainVec) {
6091 GenerateIVChain(Chain, DeadInsts);
6092 Changed = true;
6093 }
6094
6095 for (const WeakVH &IV : Rewriter.getInsertedIVs())
6096 if (IV && dyn_cast<Instruction>(&*IV)->getParent())
6097 ScalarEvolutionIVs.push_back(IV);
6098
6099 // Clean up after ourselves. This must be done before deleting any
6100 // instructions.
6101 Rewriter.clear();
6102
6103 Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
6104 &TLI, MSSAU);
6105
6106 // In our cost analysis above, we assume that each addrec consumes exactly
6107 // one register, and arrange to have increments inserted just before the
6108 // latch to maximize the chance this is true. However, if we reused
6109 // existing IVs, we now need to move the increments to match our
6110 // expectations. Otherwise, our cost modeling results in us having
6111 // chosen a non-optimal result for the actual schedule. (And yes, this
6112 // scheduling decision does impact later codegen.)
6113 for (PHINode &PN : L->getHeader()->phis()) {
6114 BinaryOperator *BO = nullptr;
6115 Value *Start = nullptr, *Step = nullptr;
6116 if (!matchSimpleRecurrence(&PN, BO, Start, Step))
6117 continue;
6118
6119 switch (BO->getOpcode()) {
6120 case Instruction::Sub:
6121 if (BO->getOperand(0) != &PN)
6122 // sub is non-commutative - match handling elsewhere in LSR
6123 continue;
6124 break;
6125 case Instruction::Add:
6126 break;
6127 default:
6128 continue;
6129 };
6130
6131 if (!isa<Constant>(Step))
6132 // If not a constant step, might increase register pressure
6133 // (We assume constants have been canonicalized to RHS)
6134 continue;
6135
6136 if (BO->getParent() == IVIncInsertPos->getParent())
6137 // Only bother moving across blocks. Isel can handle the block-local case.
6138 continue;
6139
6140 // Can we legally schedule inc at the desired point?
6141 if (!llvm::all_of(BO->uses(),
6142 [&](Use &U) {return DT.dominates(IVIncInsertPos, U);}))
6143 continue;
6144 BO->moveBefore(IVIncInsertPos);
6145 Changed = true;
6146 }
6147
6148
6149}
6150
6151LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
6152 DominatorTree &DT, LoopInfo &LI,
6153 const TargetTransformInfo &TTI, AssumptionCache &AC,
6154 TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
6155 : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
6156 MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
6157 ? PreferredAddresingMode
6158 : TTI.getPreferredAddressingMode(L, &SE)),
6159 Rewriter(SE, L->getHeader()->getDataLayout(), "lsr", false),
6160 BaselineCost(L, SE, TTI, AMK) {
6161 // If LoopSimplify form is not available, stay out of trouble.
6162 if (!L->isLoopSimplifyForm())
6163 return;
6164
6165 // If there's no interesting work to be done, bail early.
6166 if (IU.empty()) return;
6167
6168 // If there's too much analysis to be done, bail early. We won't be able to
6169 // model the problem anyway.
6170 unsigned NumUsers = 0;
6171 for (const IVStrideUse &U : IU) {
6172 if (++NumUsers > MaxIVUsers) {
6173 (void)U;
6174 LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
6175 << "\n");
6176 return;
6177 }
6178 // Bail out if we have a PHI on an EHPad that gets a value from a
6179 // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
6180 // no good place to stick any instructions.
6181 if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
6182 auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
6183 if (isa<FuncletPadInst>(FirstNonPHI) ||
6184 isa<CatchSwitchInst>(FirstNonPHI))
6185 for (BasicBlock *PredBB : PN->blocks())
6186 if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
6187 return;
6188 }
6189 }
6190
6191 LLVM_DEBUG(dbgs() << "\nLSR on loop ";
6192 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
6193 dbgs() << ":\n");
6194
6195 // Configure SCEVExpander already now, so the correct mode is used for
6196 // isSafeToExpand() checks.
6197#ifndef NDEBUG
6198 Rewriter.setDebugType(DEBUG_TYPE);
6199#endif
6200 Rewriter.disableCanonicalMode();
6201 Rewriter.enableLSRMode();
6202
6203 // First, perform some low-level loop optimizations.
6204 OptimizeShadowIV();
6205 OptimizeLoopTermCond();
6206
6207 // If loop preparation eliminates all interesting IV users, bail.
6208 if (IU.empty()) return;
6209
6210 // Skip nested loops until we can model them better with formulae.
6211 if (!L->isInnermost()) {
6212 LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
6213 return;
6214 }
6215
6216 // Start collecting data and preparing for the solver.
6217 // If the number of registers is not the major cost, we cannot benefit from
6218 // the current profitable chain optimization, which is based on the number
6219 // of registers.
6220 // FIXME: add profitable chain optimization for other kinds of major cost,
6221 // for example the number of instructions.
6222 if (TTI.isNumRegsMajorCostOfLSR())
6223 CollectChains();
6224 CollectInterestingTypesAndFactors();
6225 CollectFixupsAndInitialFormulae();
6226 CollectLoopInvariantFixupsAndFormulae();
6227
6228 if (Uses.empty())
6229 return;
6230
6231 LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
6232 print_uses(dbgs()));
6233 LLVM_DEBUG(dbgs() << "The baseline solution requires ";
6234 BaselineCost.print(dbgs()); dbgs() << "\n");
6235
6236 // Now use the reuse data to generate a bunch of interesting ways
6237 // to formulate the values needed for the uses.
6238 GenerateAllReuseFormulae();
6239
6240 FilterOutUndesirableDedicatedRegisters();
6241 NarrowSearchSpaceUsingHeuristics();
6242
6243 SmallVector<const Formula *, 8> Solution;
6244 Solve(Solution);
6245
6246 // Release memory that is no longer needed.
6247 Factors.clear();
6248 Types.clear();
6249 RegUses.clear();
6250
6251 if (Solution.empty())
6252 return;
6253
6254#ifndef NDEBUG
6255 // Formulae should be legal.
6256 for (const LSRUse &LU : Uses) {
6257 for (const Formula &F : LU.Formulae)
6258 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
6259 F) && "Illegal formula generated!");
6260 };
6261#endif
6262
6263 // Now that we've decided what we want, make it so.
6264 ImplementSolution(Solution);
6265}
6266
6267#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
6268void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
6269 if (Factors.empty() && Types.empty()) return;
6270
6271 OS << "LSR has identified the following interesting factors and types: ";
6272 bool First = true;
6273
6274 for (int64_t Factor : Factors) {
6275 if (!First) OS << ", ";
6276 First = false;
6277 OS << '*' << Factor;
6278 }
6279
6280 for (Type *Ty : Types) {
6281 if (!First) OS << ", ";
6282 First = false;
6283 OS << '(' << *Ty << ')';
6284 }
6285 OS << '\n';
6286}
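// For example (illustrative output only, not from a real test): a loop whose
// interesting factors are 4 and 8 over i64 induction variables would print:
//   LSR has identified the following interesting factors and types: *4, *8, (i64)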
6287
6288void LSRInstance::print_fixups(raw_ostream &OS) const {
6289 OS << "LSR is examining the following fixup sites:\n";
6290 for (const LSRUse &LU : Uses)
6291 for (const LSRFixup &LF : LU.Fixups) {
6292 OS << " ";
6293 LF.print(OS);
6294 OS << '\n';
6295 }
6296}
6297
6298void LSRInstance::print_uses(raw_ostream &OS) const {
6299 OS << "LSR is examining the following uses:\n";
6300 for (const LSRUse &LU : Uses) {
6301 OS << " ";
6302 LU.print(OS);
6303 OS << '\n';
6304 for (const Formula &F : LU.Formulae) {
6305 OS << " ";
6306 F.print(OS);
6307 OS << '\n';
6308 }
6309 }
6310}
6311
6312void LSRInstance::print(raw_ostream &OS) const {
6313 print_factors_and_types(OS);
6314 print_fixups(OS);
6315 print_uses(OS);
6316}
6317
6318LLVM_DUMP_METHOD void LSRInstance::dump() const {
6319 print(errs()); errs() << '\n';
6320}
6321#endif
6322
6323namespace {
6324
6325class LoopStrengthReduce : public LoopPass {
6326public:
6327 static char ID; // Pass ID, replacement for typeid
6328
6329 LoopStrengthReduce();
6330
6331private:
6332 bool runOnLoop(Loop *L, LPPassManager &LPM) override;
6333 void getAnalysisUsage(AnalysisUsage &AU) const override;
6334};
6335
6336} // end anonymous namespace
6337
6338LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
6339 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
6340}
6341
6342void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
6343 // We split critical edges, so we change the CFG. However, we do update
6344 // many analyses if they are around.
6346
6356 // Requiring LoopSimplify a second time here prevents IVUsers from running
6357 // twice, since LoopSimplify was invalidated by running ScalarEvolution.
6363}
6364
6365namespace {
6366
6367/// Enables more convenient iteration over a DWARF expression vector.
6369ToDwarfOpIter(SmallVectorImpl<uint64_t> &Expr) {
6374 return {Begin, End};
6375}
6376
6377struct SCEVDbgValueBuilder {
6378 SCEVDbgValueBuilder() = default;
6379 SCEVDbgValueBuilder(const SCEVDbgValueBuilder &Base) { clone(Base); }
6380
6381 void clone(const SCEVDbgValueBuilder &Base) {
6382 LocationOps = Base.LocationOps;
6383 Expr = Base.Expr;
6384 }
6385
6386 void clear() {
6387 LocationOps.clear();
6388 Expr.clear();
6389 }
6390
6391 /// The DIExpression as we translate the SCEV.
6392 SmallVector<uint64_t, 6> Expr;
6393 /// The location ops of the DIExpression.
6394 SmallVector<Value *, 2> LocationOps;
6395
6396 void pushOperator(uint64_t Op) { Expr.push_back(Op); }
6397 void pushUInt(uint64_t Operand) { Expr.push_back(Operand); }
6398
6399 /// Add a DW_OP_LLVM_arg to the expression, followed by the index of the value
6400 /// in the set of values referenced by the expression.
6401 void pushLocation(llvm::Value *V) {
6402 Expr.push_back(llvm::dwarf::DW_OP_LLVM_arg);
6403 auto *It = llvm::find(LocationOps, V);
6404 unsigned ArgIndex = 0;
6405 if (It != LocationOps.end()) {
6406 ArgIndex = std::distance(LocationOps.begin(), It);
6407 } else {
6408 ArgIndex = LocationOps.size();
6409 LocationOps.push_back(V);
6410 }
6411 Expr.push_back(ArgIndex);
6412 }
6413
6414 void pushValue(const SCEVUnknown *U) {
6415 llvm::Value *V = cast<SCEVUnknown>(U)->getValue();
6416 pushLocation(V);
6417 }
6418
6419 bool pushConst(const SCEVConstant *C) {
6420 if (C->getAPInt().getSignificantBits() > 64)
6421 return false;
6422 Expr.push_back(llvm::dwarf::DW_OP_consts);
6423 Expr.push_back(C->getAPInt().getSExtValue());
6424 return true;
6425 }
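 // For example (illustrative): pushConst on a SCEV constant of 4 appends
 // {DW_OP_consts, 4} to the expression; constants needing more than 64
 // significant bits are rejected above.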
6426
6427 // Iterating the expression as DWARF ops is convenient when updating
6428 // DWARF_OP_LLVM_args.
6430 return ToDwarfOpIter(Expr);
6431 }
6432
6433 /// Several SCEV types are sequences of the same arithmetic operator applied
6434 /// to constants and values that may be extended or truncated.
6435 bool pushArithmeticExpr(const llvm::SCEVCommutativeExpr *CommExpr,
6436 uint64_t DwarfOp) {
6437 assert((isa<llvm::SCEVAddExpr>(CommExpr) || isa<SCEVMulExpr>(CommExpr)) &&
6438 "Expected arithmetic SCEV type");
6439 bool Success = true;
6440 unsigned EmitOperator = 0;
6441 for (const auto &Op : CommExpr->operands()) {
6442 Success &= pushSCEV(Op);
6443
6444 if (EmitOperator >= 1)
6445 pushOperator(DwarfOp);
6446 ++EmitOperator;
6447 }
6448 return Success;
6449 }
6450
6451 // TODO: Identify and omit noop casts.
6452 bool pushCast(const llvm::SCEVCastExpr *C, bool IsSigned) {
6453 const llvm::SCEV *Inner = C->getOperand(0);
6454 const llvm::Type *Type = C->getType();
6455 uint64_t ToWidth = Type->getIntegerBitWidth();
6456 bool Success = pushSCEV(Inner);
6457 uint64_t CastOps[] = {dwarf::DW_OP_LLVM_convert, ToWidth,
6458 IsSigned ? llvm::dwarf::DW_ATE_signed
6459 : llvm::dwarf::DW_ATE_unsigned};
6460 for (const auto &Op : CastOps)
6461 pushOperator(Op);
6462 return Success;
6463 }
6464
6465 // TODO: MinMax - although these haven't been encountered in the test suite.
6466 bool pushSCEV(const llvm::SCEV *S) {
6467 bool Success = true;
6468 if (const SCEVConstant *StartInt = dyn_cast<SCEVConstant>(S)) {
6469 Success &= pushConst(StartInt);
6470
6471 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
6472 if (!U->getValue())
6473 return false;
6474 pushLocation(U->getValue());
6475
6476 } else if (const SCEVMulExpr *MulRec = dyn_cast<SCEVMulExpr>(S)) {
6477 Success &= pushArithmeticExpr(MulRec, llvm::dwarf::DW_OP_mul);
6478
6479 } else if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
6480 Success &= pushSCEV(UDiv->getLHS());
6481 Success &= pushSCEV(UDiv->getRHS());
6482 pushOperator(llvm::dwarf::DW_OP_div);
6483
6484 } else if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(S)) {
6485 // Assert if a new and unknown SCEVCastExpr type is encountered.
6486 assert((isa<SCEVZeroExtendExpr>(Cast) || isa<SCEVTruncateExpr>(Cast) ||
6487 isa<SCEVPtrToIntExpr>(Cast) || isa<SCEVSignExtendExpr>(Cast)) &&
6488 "Unexpected cast type in SCEV.");
6489 Success &= pushCast(Cast, (isa<SCEVSignExtendExpr>(Cast)));
6490
6491 } else if (const SCEVAddExpr *AddExpr = dyn_cast<SCEVAddExpr>(S)) {
6492 Success &= pushArithmeticExpr(AddExpr, llvm::dwarf::DW_OP_plus);
6493
6494 } else if (isa<SCEVAddRecExpr>(S)) {
6495 // Nested SCEVAddRecExpr are generated by nested loops and are currently
6496 // unsupported.
6497 return false;
6498
6499 } else {
6500 return false;
6501 }
6502 return Success;
6503 }
6504
6505 /// Return true if the combination of arithmetic operator and underlying
6506 /// SCEV constant value is an identity function.
6507 bool isIdentityFunction(uint64_t Op, const SCEV *S) {
6508 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
6509 if (C->getAPInt().getSignificantBits() > 64)
6510 return false;
6511 int64_t I = C->getAPInt().getSExtValue();
6512 switch (Op) {
6513 case llvm::dwarf::DW_OP_plus:
6514 case llvm::dwarf::DW_OP_minus:
6515 return I == 0;
6516 case llvm::dwarf::DW_OP_mul:
6517 case llvm::dwarf::DW_OP_div:
6518 return I == 1;
6519 }
6520 }
6521 return false;
6522 }
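 // For example (illustrative): an addrec stride of 1 makes DW_OP_mul an
 // identity (x * 1 == x), and a start of 0 makes DW_OP_plus an identity
 // (x + 0 == x), so the callers below skip emitting those operators.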
6523
6524 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6525 /// builder's expression stack. The stack should already contain an
6526 /// expression for the iteration count, so that it can be multiplied by
6527 /// the stride and added to the start.
6528 /// Components of the expression are omitted if they are an identity function.
6529 /// Chain (non-affine) SCEVs are not supported.
6530 bool SCEVToValueExpr(const llvm::SCEVAddRecExpr &SAR, ScalarEvolution &SE) {
6531 assert(SAR.isAffine() && "Expected affine SCEV");
6532 // TODO: Is this check needed?
6533 if (isa<SCEVAddRecExpr>(SAR.getStart()))
6534 return false;
6535
6536 const SCEV *Start = SAR.getStart();
6537 const SCEV *Stride = SAR.getStepRecurrence(SE);
6538
6539 // Skip pushing arithmetic noops.
6540 if (!isIdentityFunction(llvm::dwarf::DW_OP_mul, Stride)) {
6541 if (!pushSCEV(Stride))
6542 return false;
6543 pushOperator(llvm::dwarf::DW_OP_mul);
6544 }
6545 if (!isIdentityFunction(llvm::dwarf::DW_OP_plus, Start)) {
6546 if (!pushSCEV(Start))
6547 return false;
6548 pushOperator(llvm::dwarf::DW_OP_plus);
6549 }
6550 return true;
6551 }
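 // Illustrative example (hypothetical SCEV, not from a test): for the affine
 // addrec {%start,+,4} with the iteration count already on the expression
 // stack, this appends {DW_OP_consts, 4, DW_OP_mul, DW_OP_LLVM_arg, n,
 // DW_OP_plus}, i.e. it recomputes "count * 4 + start".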
6552
6553 /// Create an expression that is an offset from a value (usually the IV).
6554 void createOffsetExpr(int64_t Offset, Value *OffsetValue) {
6555 pushLocation(OffsetValue);
6556 DIExpression::appendOffset(Expr, Offset);
6557 LLVM_DEBUG(
6558 dbgs() << "scev-salvage: Generated IV offset expression. Offset: "
6559 << std::to_string(Offset) << "\n");
6560 }
6561
6562 /// Combine a translation of the SCEV and the IV to create an expression that
6563 /// recovers a location's value.
6564 /// returns true if an expression was created.
6565 bool createIterCountExpr(const SCEV *S,
6566 const SCEVDbgValueBuilder &IterationCount,
6567 ScalarEvolution &SE) {
6568 // SCEVs for SSA values are most frequently of the form
6569 // {start,+,stride}, but sometimes they are ({start,+,stride} + %a + ..).
6570 // This is because %a is a PHI node that is not the IV. However, these
6571 // SCEVs have not been observed to result in debuginfo-lossy optimisations,
6572 // so it's not expected that this point will be reached.
6573 if (!isa<SCEVAddRecExpr>(S))
6574 return false;
6575
6576 LLVM_DEBUG(dbgs() << "scev-salvage: Location to salvage SCEV: " << *S
6577 << '\n');
6578
6579 const auto *Rec = cast<SCEVAddRecExpr>(S);
6580 if (!Rec->isAffine())
6581 return false;
6582
6584 return false;
6585
6586 // Initialise a new builder with the iteration count expression. In
6587 // combination with the value's SCEV this enables recovery.
6588 clone(IterationCount);
6589 if (!SCEVToValueExpr(*Rec, SE))
6590 return false;
6591
6592 return true;
6593 }
6594
6595 /// Convert a SCEV of a value to a DIExpression that is pushed onto the
6596 /// builder's expression stack. The stack should already contain an
6597 /// expression for the iteration count, so that it can be multiplied by
6598 /// the stride and added to the start.
6599 /// Components of the expression are omitted if they are an identity function.
6600 bool SCEVToIterCountExpr(const llvm::SCEVAddRecExpr &SAR,
6601 ScalarEvolution &SE) {
6602 assert(SAR.isAffine() && "Expected affine SCEV");
6603 if (isa<SCEVAddRecExpr>(SAR.getStart())) {
6604 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV. Unsupported nested AddRec: "
6605 << SAR << '\n');
6606 return false;
6607 }
6608 const SCEV *Start = SAR.getStart();
6609 const SCEV *Stride = SAR.getStepRecurrence(SE);
6610
6611 // Skip pushing arithmetic noops.
6612 if (!isIdentityFunction(llvm::dwarf::DW_OP_minus, Start)) {
6613 if (!pushSCEV(Start))
6614 return false;
6615 pushOperator(llvm::dwarf::DW_OP_minus);
6616 }
6617 if (!isIdentityFunction(llvm::dwarf::DW_OP_div, Stride)) {
6618 if (!pushSCEV(Stride))
6619 return false;
6620 pushOperator(llvm::dwarf::DW_OP_div);
6621 }
6622 return true;
6623 }
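 // Illustrative example (hypothetical SCEV, not from a test): with the
 // post-LSR IV already pushed as a location, an IV addrec of {%start,+,4}
 // appends {DW_OP_LLVM_arg, n, DW_OP_minus, DW_OP_consts, 4, DW_OP_div},
 // i.e. the iteration count "(iv - start) / 4".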
6624
6625 // Append the current expression and locations to a location list and an
6626 // expression list. Modify the DW_OP_LLVM_arg indexes to account for
6627 // the locations already present in the destination list.
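 // For instance (illustrative): if this builder's LocationOps[1] already
 // appears at DestLocations[3], every "DW_OP_LLVM_arg, 1" emitted from this
 // expression is rewritten as "DW_OP_LLVM_arg, 3" in DestExpr.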
6628 void appendToVectors(SmallVectorImpl<uint64_t> &DestExpr,
6629 SmallVectorImpl<Value *> &DestLocations) {
6630 assert(!DestLocations.empty() &&
6631 "Expected the locations vector to contain the IV");
6632 // The DWARF_OP_LLVM_arg arguments of the expression being appended must be
6633 // modified to account for the locations already in the destination vector.
6634 // All builders contain the IV as the first location op.
6635 assert(!LocationOps.empty() &&
6636 "Expected the location ops to contain the IV.");
6637 // DestIndexMap[n] contains the index in DestLocations for the nth
6638 // location in this SCEVDbgValueBuilder.
6639 SmallVector<uint64_t, 2> DestIndexMap;
6640 for (const auto &Op : LocationOps) {
6641 auto It = find(DestLocations, Op);
6642 if (It != DestLocations.end()) {
6643 // Location already exists in DestLocations, reuse existing ArgIndex.
6644 DestIndexMap.push_back(std::distance(DestLocations.begin(), It));
6645 continue;
6646 }
6647 // Location is not in DestLocations, add it.
6648 DestIndexMap.push_back(DestLocations.size());
6649 DestLocations.push_back(Op);
6650 }
6651
6652 for (const auto &Op : expr_ops()) {
6653 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6654 Op.appendToVector(DestExpr);
6655 continue;
6656 }
6657
6658 DestExpr.push_back(dwarf::DW_OP_LLVM_arg);
6659 // `DW_OP_LLVM_arg n` represents the nth LocationOp in this SCEV,
6660 // DestIndexMap[n] contains its new index in DestLocations.
6661 uint64_t NewIndex = DestIndexMap[Op.getArg(0)];
6662 DestExpr.push_back(NewIndex);
6663 }
6664 }
6665};
6666
6667/// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs
6668/// and DIExpression.
6669struct DVIRecoveryRec {
6670 DVIRecoveryRec(DbgValueInst *DbgValue)
6671 : DbgRef(DbgValue), Expr(DbgValue->getExpression()),
6672 HadLocationArgList(false) {}
6673 DVIRecoveryRec(DbgVariableRecord *DVR)
6674 : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {}
6675
6676 PointerUnion<DbgValueInst *, DbgVariableRecord *> DbgRef;
6677 DIExpression *Expr;
6678 bool HadLocationArgList;
6679 SmallVector<WeakVH, 2> LocationOps;
6680 SmallVector<const llvm::SCEV *, 2> SCEVs;
6681 SmallVector<std::unique_ptr<SCEVDbgValueBuilder>, 2> RecoveryExprs;
6682
6683 void clear() {
6684 for (auto &RE : RecoveryExprs)
6685 RE.reset();
6686 RecoveryExprs.clear();
6687 }
6688
6689 ~DVIRecoveryRec() { clear(); }
6690};
6691} // namespace
6692
6693/// Returns the total number of DW_OP_llvm_arg operands in the expression.
6694/// This helps in determining if a DIArglist is necessary or can be omitted from
6695/// the dbg.value.
6696static unsigned numLLVMArgOps(SmallVectorImpl<uint64_t> &Expr) {
6697 auto expr_ops = ToDwarfOpIter(Expr);
6698 unsigned Count = 0;
6699 for (auto Op : expr_ops)
6700 if (Op.getOp() == dwarf::DW_OP_LLVM_arg)
6701 Count++;
6702 return Count;
6703}
6704
6705/// Overwrites DVI with the location and Ops as the DIExpression. This will
6706/// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands,
6707/// because a DIArglist is not created for the first argument of the dbg.value.
6708template <typename T>
6709static void updateDVIWithLocation(T &DbgVal, Value *Location,
6710 SmallVectorImpl<uint64_t> &Ops) {
6711 assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
6712 "contain any DW_OP_llvm_arg operands.");
6713 DbgVal.setRawLocation(ValueAsMetadata::get(Location));
6714 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6716}
6717
6718/// Overwrite DVI with locations placed into a DIArglist.
6719template <typename T>
6720static void updateDVIWithLocations(T &DbgVal,
6721 SmallVectorImpl<Value *> &Locations,
6722 SmallVectorImpl<uint64_t> &Ops) {
6723 assert(numLLVMArgOps(Ops) != 0 &&
6724 "Expected expression that references DIArglist locations using "
6725 "DW_OP_llvm_arg operands.");
6726 SmallVector<ValueAsMetadata *> MetadataLocs;
6727 for (Value *V : Locations)
6728 MetadataLocs.push_back(ValueAsMetadata::get(V));
6729 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6730 DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
6731 DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
6732}
6733
6734/// Write the new expression and new location ops for the dbg.value. If
6735/// possible, reduce the size of the dbg.value intrinsic by omitting the
6736/// DIArglist. It can be omitted if:
6737/// 1. There is only a single location, referenced by a single DW_OP_llvm_arg.
6738/// 2. The DW_OP_LLVM_arg is the first operand in the expression.
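/// For example (illustrative): a single location %x with the expression
/// "DW_OP_LLVM_arg 0, DW_OP_plus_uconst 8, DW_OP_stack_value" can be emitted
/// as a plain location %x with "DW_OP_plus_uconst 8, DW_OP_stack_value",
/// with no DIArgList wrapper.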
6739static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
6740 SmallVectorImpl<Value *> &NewLocationOps,
6741 SmallVectorImpl<uint64_t> &NewExpr) {
6742 auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
6743 unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
6744 if (NumLLVMArgs == 0) {
6745 // Location assumed to be on the stack.
6746 updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
6747 } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
6748 // There is only a single DW_OP_llvm_arg at the start of the expression,
6749 // so it can be omitted along with DIArglist.
6750 assert(NewExpr[1] == 0 &&
6751 "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
6752 SmallVector<uint64_t> ShortenedOps(llvm::drop_begin(NewExpr, 2));
6753 updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
6754 } else {
6755 // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
6756 updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
6757 }
6758
6759 // If the DIExpression was previously empty then add the stack terminator.
6760 // Non-empty expressions have only had elements inserted into them and so
6761 // the terminator should already be present e.g. stack_value or fragment.
6762 DIExpression *SalvageExpr = DbgVal->getExpression();
6763 if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
6764 SalvageExpr =
6765 DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
6766 DbgVal->setExpression(SalvageExpr);
6767 }
6768 };
6769 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6770 UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6771 else
6772 UpdateDbgValueInstImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6773}
6774
6775/// Cached location ops may be erased during LSR, in which case a poison is
6776/// required when restoring from the cache. The type of that location is no
6777/// longer available, so just use int8. The poison will be replaced by one or
6778/// more locations later when a SCEVDbgValueBuilder selects alternative
6779/// locations to use for the salvage.
6780static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
6781 return (VH) ? VH : PoisonValue::get(llvm::Type::getInt8Ty(C));
6782}
6783
6784/// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
6785static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
6786 auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
6787 LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
6788 << "scev-salvage: post-LSR: " << *DbgVal << '\n');
6789 assert(DVIRec.Expr && "Expected an expression");
6790 DbgVal->setExpression(DVIRec.Expr);
6791
6792 // Even a single location-op may be inside a DIArgList and referenced with
6793 // DW_OP_LLVM_arg, which is valid only with a DIArgList.
6794 if (!DVIRec.HadLocationArgList) {
6795 assert(DVIRec.LocationOps.size() == 1 &&
6796 "Unexpected number of location ops.");
6797 // LSR's unsuccessful salvage attempt may have added DIArgList, which in
6798 // this case was not present before, so force the location back to a
6799 // single uncontained Value.
6800 Value *CachedValue =
6801 getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
6802 DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
6803 } else {
6804 SmallVector<ValueAsMetadata *> MetadataLocs;
6805 for (WeakVH VH : DVIRec.LocationOps) {
6806 Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
6807 MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
6808 }
6809 auto ValArrayRef = llvm::ArrayRef<llvm::ValueAsMetadata *>(MetadataLocs);
6810 DbgVal->setRawLocation(
6811 llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
6812 }
6813 LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
6814 };
6815 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6816 RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
6817 else
6818 RestorePreTransformStateImpl(cast<DbgVariableRecord *>(DVIRec.DbgRef));
6819}
6820
6821static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
6822 llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
6823 const SCEV *SCEVInductionVar,
6824 SCEVDbgValueBuilder IterCountExpr) {
6825
6826 if (isa<DbgValueInst *>(DVIRec.DbgRef)
6827 ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
6828 : !cast<DbgVariableRecord *>(DVIRec.DbgRef)->isKillLocation())
6829 return false;
6830
6831 // LSR may have caused several changes to the dbg.value in the failed salvage
6832 // attempt. So restore the DIExpression, the location ops and also the
6833 // location ops format, which is always DIArglist for multiple ops, but only
6834 // sometimes for a single op.
6835 restorePreTransformState(DVIRec);
6836
6837 // LocationOpIndexMap[i] will store the post-LSR location index of
6838 // the non-optimised out location at pre-LSR index i.
6839 SmallVector<int64_t, 2> LocationOpIndexMap;
6840 LocationOpIndexMap.assign(DVIRec.LocationOps.size(), -1);
6841 SmallVector<Value *, 2> NewLocationOps;
6842 NewLocationOps.push_back(LSRInductionVar);
6843
6844 for (unsigned i = 0; i < DVIRec.LocationOps.size(); i++) {
6845 WeakVH VH = DVIRec.LocationOps[i];
6846 // Place the locations not optimised out in the list first, avoiding
6847 // inserts later. The map is used to update the DIExpression's
6848 // DW_OP_LLVM_arg arguments as the expression is updated.
6849 if (VH && !isa<UndefValue>(VH)) {
6850 NewLocationOps.push_back(VH);
6851 LocationOpIndexMap[i] = NewLocationOps.size() - 1;
6852 LLVM_DEBUG(dbgs() << "scev-salvage: Location index " << i
6853 << " now at index " << LocationOpIndexMap[i] << "\n");
6854 continue;
6855 }
6856
6857 // It's possible that a value referred to in the SCEV may have been
6858 // optimised out by LSR.
6859 if (SE.containsErasedValue(DVIRec.SCEVs[i]) ||
6860 SE.containsUndefs(DVIRec.SCEVs[i])) {
6861 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV for location at index: " << i
6862 << " refers to a location that is now undef or erased. "
6863 "Salvage abandoned.\n");
6864 return false;
6865 }
6866
6867 LLVM_DEBUG(dbgs() << "scev-salvage: salvaging location at index " << i
6868 << " with SCEV: " << *DVIRec.SCEVs[i] << "\n");
6869
6870 DVIRec.RecoveryExprs[i] = std::make_unique<SCEVDbgValueBuilder>();
6871 SCEVDbgValueBuilder *SalvageExpr = DVIRec.RecoveryExprs[i].get();
6872
6873 // Create an offset-based salvage expression if possible, as it requires
6874 // fewer DWARF ops than an iteration count-based expression.
6875 if (std::optional<APInt> Offset =
6876 SE.computeConstantDifference(DVIRec.SCEVs[i], SCEVInductionVar)) {
6877 if (Offset->getSignificantBits() <= 64)
6878 SalvageExpr->createOffsetExpr(Offset->getSExtValue(), LSRInductionVar);
6879 } else if (!SalvageExpr->createIterCountExpr(DVIRec.SCEVs[i], IterCountExpr,
6880 SE))
6881 return false;
6882 }
6883
6884 // Merge the DbgValueBuilder generated expressions and the original
6885 // DIExpression, and place the result into a new vector.
6886 SmallVector<uint64_t> NewExpr;
6887 if (DVIRec.Expr->getNumElements() == 0) {
6888 assert(DVIRec.RecoveryExprs.size() == 1 &&
6889 "Expected only a single recovery expression for an empty "
6890 "DIExpression.");
6891 assert(DVIRec.RecoveryExprs[0] &&
6892 "Expected a SCEVDbgSalvageBuilder for location 0");
6893 SCEVDbgValueBuilder *B = DVIRec.RecoveryExprs[0].get();
6894 B->appendToVectors(NewExpr, NewLocationOps);
6895 }
6896 for (const auto &Op : DVIRec.Expr->expr_ops()) {
6897 // Most Ops needn't be updated.
6898 if (Op.getOp() != dwarf::DW_OP_LLVM_arg) {
6899 Op.appendToVector(NewExpr);
6900 continue;
6901 }
6902
6903 uint64_t LocationArgIndex = Op.getArg(0);
6904 SCEVDbgValueBuilder *DbgBuilder =
6905 DVIRec.RecoveryExprs[LocationArgIndex].get();
6906 // The location doesn't have a SCEVDbgValueBuilder, so LSR did not
6907 // optimise it away. Just translate the argument to the updated
6908 // location index.
6909 if (!DbgBuilder) {
6910 NewExpr.push_back(dwarf::DW_OP_LLVM_arg);
6911 assert(LocationOpIndexMap[Op.getArg(0)] != -1 &&
6912 "Expected a positive index for the location-op position.");
6913 NewExpr.push_back(LocationOpIndexMap[Op.getArg(0)]);
6914 continue;
6915 }
6916 // The location has a recovery expression.
6917 DbgBuilder->appendToVectors(NewExpr, NewLocationOps);
6918 }
6919
6920 UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
6921 if (isa<DbgValueInst *>(DVIRec.DbgRef))
6922 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6923 << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
6924 else
6925 LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
6926 << *cast<DbgVariableRecord *>(DVIRec.DbgRef) << "\n");
6927 return true;
6928}
6929
6930/// Obtain an expression for the iteration count, then attempt to salvage the
6931/// dbg.value intrinsics.
6932static void DbgRewriteSalvageableDVIs(
6933 llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
6934 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
6935 if (DVIToUpdate.empty())
6936 return;
6937
6938 const llvm::SCEV *SCEVInductionVar = SE.getSCEV(LSRInductionVar);
6939 assert(SCEVInductionVar &&
6940 "Anticipated a SCEV for the post-LSR induction variable");
6941
6942 if (const SCEVAddRecExpr *IVAddRec =
6943 dyn_cast<SCEVAddRecExpr>(SCEVInductionVar)) {
6944 if (!IVAddRec->isAffine())
6945 return;
6946
6947 // Prevent translation using excessive resources.
6948 if (IVAddRec->getExpressionSize() > MaxSCEVSalvageExpressionSize)
6949 return;
6950
6951 // The iteration count is required to recover location values.
6952 SCEVDbgValueBuilder IterCountExpr;
6953 IterCountExpr.pushLocation(LSRInductionVar);
6954 if (!IterCountExpr.SCEVToIterCountExpr(*IVAddRec, SE))
6955 return;
6956
6957 LLVM_DEBUG(dbgs() << "scev-salvage: IV SCEV: " << *SCEVInductionVar
6958 << '\n');
6959
6960 for (auto &DVIRec : DVIToUpdate) {
6961 SalvageDVI(L, SE, LSRInductionVar, *DVIRec, SCEVInductionVar,
6962 IterCountExpr);
6963 }
6964 }
6965}
6966
6967/// Identify and cache salvageable DVI locations and expressions along with the
6968/// corresponding SCEV(s). Also ensure that the DVI is not deleted between
6969/// caching and salvaging.
6970static void DbgGatherSalvagableDVI(
6971 Loop *L, ScalarEvolution &SE,
6972 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &SalvageableDVISCEVs,
6973 SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
6974 for (const auto &B : L->getBlocks()) {
6975 for (auto &I : *B) {
6976 auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
6977 // Ensure that if any location op is undef, the dbg.value is not
6978 // cached.
6979 if (DbgVal->isKillLocation())
6980 return false;
6981
6982 // Check that the location op SCEVs are suitable for translation to
6983 // DIExpression.
6984 const auto &HasTranslatableLocationOps =
6985 [&](const auto *DbgValToTranslate) -> bool {
6986 for (const auto LocOp : DbgValToTranslate->location_ops()) {
6987 if (!LocOp)
6988 return false;
6989
6990 if (!SE.isSCEVable(LocOp->getType()))
6991 return false;
6992
6993 const SCEV *S = SE.getSCEV(LocOp);
6994 if (SE.containsUndefs(S))
6995 return false;
6996 }
6997 return true;
6998 };
6999
7000 if (!HasTranslatableLocationOps(DbgVal))
7001 return false;
7002
7003 std::unique_ptr<DVIRecoveryRec> NewRec =
7004 std::make_unique<DVIRecoveryRec>(DbgVal);
7005 // Each location Op may need a SCEVDbgValueBuilder in order to recover
7006 // it. Pre-allocating a vector will enable quick lookups of the builder
7007 // later during the salvage.
7008 NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
7009 for (const auto LocOp : DbgVal->location_ops()) {
7010 NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
7011 NewRec->LocationOps.push_back(LocOp);
7012 NewRec->HadLocationArgList = DbgVal->hasArgList();
7013 }
7014 SalvageableDVISCEVs.push_back(std::move(NewRec));
7015 return true;
7016 };
7017 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
7018 if (DVR.isDbgValue() || DVR.isDbgAssign())
7019 ProcessDbgValue(&DVR);
7020 }
7021 auto DVI = dyn_cast<DbgValueInst>(&I);
7022 if (!DVI)
7023 continue;
7024 if (ProcessDbgValue(DVI))
7025 DVIHandles.insert(DVI);
7026 }
7027 }
7028}
7029
7030/// Ideally pick the PHI IV inserted by ScalarEvolutionExpander. As a fallback
7031/// any PHI from the loop header is usable, but may have less chance of
7032/// surviving subsequent transforms.
7033static llvm::PHINode *GetInductionVariable(const Loop &L, ScalarEvolution &SE,
7034 const LSRInstance &LSR) {
7035
7036 auto IsSuitableIV = [&](PHINode *P) {
7037 if (!SE.isSCEVable(P->getType()))
7038 return false;
7039 if (const SCEVAddRecExpr *Rec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(P)))
7040 return Rec->isAffine() && !SE.containsUndefs(SE.getSCEV(P));
7041 return false;
7042 };
7043
7044 // For now, just pick the first IV that was generated and inserted by
7045 // ScalarEvolution. Ideally pick an IV that is unlikely to be optimised away
7046 // by subsequent transforms.
7047 for (const WeakVH &IV : LSR.getScalarEvolutionIVs()) {
7048 if (!IV)
7049 continue;
7050
7051 // There should only be PHI node IVs.
7052 PHINode *P = cast<PHINode>(&*IV);
7053
7054 if (IsSuitableIV(P))
7055 return P;
7056 }
7057
7058 for (PHINode &P : L.getHeader()->phis()) {
7059 if (IsSuitableIV(&P))
7060 return &P;
7061 }
7062 return nullptr;
7063}
7064
7065static std::optional<std::tuple<PHINode *, PHINode *, const SCEV *, bool>>
7066canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
7067 const LoopInfo &LI, const TargetTransformInfo &TTI) {
7068 if (!L->isInnermost()) {
7069 LLVM_DEBUG(dbgs() << "Cannot fold on non-innermost loop\n");
7070 return std::nullopt;
7071 }
7072 // Only inspect on simple loop structure
7073 if (!L->isLoopSimplifyForm()) {
7074 LLVM_DEBUG(dbgs() << "Cannot fold on non-simple loop\n");
7075 return std::nullopt;
7076 }
7077
7078 if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
7079 LLVM_DEBUG(dbgs() << "Cannot fold on backedge that is loop variant\n");
7080 return std::nullopt;
7081 }
7082
7083 BasicBlock *LoopLatch = L->getLoopLatch();
7084 BranchInst *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
7085 if (!BI || BI->isUnconditional())
7086 return std::nullopt;
7087 auto *TermCond = dyn_cast<ICmpInst>(BI->getCondition());
7088 if (!TermCond) {
7089 LLVM_DEBUG(
7090 dbgs() << "Cannot fold on branching condition that is not an ICmpInst");
7091 return std::nullopt;
7092 }
7093 if (!TermCond->hasOneUse()) {
7094 LLVM_DEBUG(
7095 dbgs()
7096 << "Cannot replace terminating condition with more than one use\n");
7097 return std::nullopt;
7098 }
7099
7100 BinaryOperator *LHS = dyn_cast<BinaryOperator>(TermCond->getOperand(0));
7101 Value *RHS = TermCond->getOperand(1);
7102 if (!LHS || !L->isLoopInvariant(RHS))
7103 // We could pattern match the inverse form of the icmp, but that is
7104 // non-canonical, and this pass is running *very* late in the pipeline.
7105 return std::nullopt;
7106
7107 // Find the IV used by the current exit condition.
7108 PHINode *ToFold;
7109 Value *ToFoldStart, *ToFoldStep;
7110 if (!matchSimpleRecurrence(LHS, ToFold, ToFoldStart, ToFoldStep))
7111 return std::nullopt;
7112
7113 // Ensure the simple recurrence is a part of the current loop.
7114 if (ToFold->getParent() != L->getHeader())
7115 return std::nullopt;
7116
7117 // If that IV isn't dead after we rewrite the exit condition in terms of
7118 // another IV, there's no point in doing the transform.
7119 if (!isAlmostDeadIV(ToFold, LoopLatch, TermCond))
7120 return std::nullopt;
7121
7122 // Inserting instructions in the preheader has a runtime cost, scale
7123 // the allowed cost with the loop's trip count as best we can.
7124 const unsigned ExpansionBudget = [&]() {
7125 unsigned Budget = 2 * SCEVCheapExpansionBudget;
7126 if (unsigned SmallTC = SE.getSmallConstantMaxTripCount(L))
7127 return std::min(Budget, SmallTC);
7128 if (std::optional<unsigned> SmallTC = getLoopEstimatedTripCount(L))
7129 return std::min(Budget, *SmallTC);
7130 // Unknown trip count, assume long running by default.
7131 return Budget;
7132 }();
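 // For instance (illustrative, assuming SCEVCheapExpansionBudget keeps its
 // default of 4): a loop with an estimated trip count of 3 gets a budget of
 // min(8, 3) = 3, while a loop with an unknown trip count keeps the full
 // budget of 8.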
7133
7134 const SCEV *BECount = SE.getBackedgeTakenCount(L);
7135 const DataLayout &DL = L->getHeader()->getDataLayout();
7136 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7137
7138 PHINode *ToHelpFold = nullptr;
7139 const SCEV *TermValueS = nullptr;
7140 bool MustDropPoison = false;
7141 auto InsertPt = L->getLoopPreheader()->getTerminator();
7142 for (PHINode &PN : L->getHeader()->phis()) {
7143 if (ToFold == &PN)
7144 continue;
7145
7146 if (!SE.isSCEVable(PN.getType())) {
7147 LLVM_DEBUG(dbgs() << "IV of phi '" << PN
7148 << "' is not SCEV-able, not qualified for the "
7149 "terminating condition folding.\n");
7150 continue;
7151 }
7152 const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
7153 // Only speculate on affine AddRec
7154 if (!AddRec || !AddRec->isAffine()) {
7155 LLVM_DEBUG(dbgs() << "SCEV of phi '" << PN
7156 << "' is not an affine add recurrence, not qualified "
7157 "for the terminating condition folding.\n");
7158 continue;
7159 }
7160
7161 // Check that we can compute the value of AddRec on the exiting iteration
7162 // without soundness problems. evaluateAtIteration internally needs
7163 // to multiply the stride of the iteration number - which may wrap around.
7164 // The issue here is subtle because computing the result accounting for
7165 // wrap is insufficient. In order to use the result in an exit test, we
7166 // must also know that AddRec doesn't take the same value on any previous
7167 // iteration. The simplest case to consider is a candidate IV which is
7168 // narrower than the trip count (and thus original IV), but this can
7169 // also happen due to non-unit strides on the candidate IVs.
7170 if (!AddRec->hasNoSelfWrap() ||
7171 !SE.isKnownNonZero(AddRec->getStepRecurrence(SE)))
7172 continue;
7173
7174 const SCEVAddRecExpr *PostInc = AddRec->getPostIncExpr(SE);
7175 const SCEV *TermValueSLocal = PostInc->evaluateAtIteration(BECount, SE);
7176 if (!Expander.isSafeToExpand(TermValueSLocal)) {
7177 LLVM_DEBUG(
7178 dbgs() << "Is not safe to expand terminating value for phi node" << PN
7179 << "\n");
7180 continue;
7181 }
7182
7183 if (Expander.isHighCostExpansion(TermValueSLocal, L, ExpansionBudget,
7184 &TTI, InsertPt)) {
7185 LLVM_DEBUG(
7186 dbgs() << "Is too expensive to expand terminating value for phi node"
7187 << PN << "\n");
7188 continue;
7189 }
7190
7191 // The candidate IV may have been otherwise dead and poison from the
7192 // very first iteration. If we can't disprove that, we can't use the IV.
7193 if (!mustExecuteUBIfPoisonOnPathTo(&PN, LoopLatch->getTerminator(), &DT)) {
7194 LLVM_DEBUG(dbgs() << "Can not prove poison safety for IV "
7195 << PN << "\n");
7196 continue;
7197 }
7198
7199 // The candidate IV may become poison on the last iteration. If this
7200 // value is not branched on, this is a well defined program. We're
7201 // about to add a new use to this IV, and we have to ensure we don't
7202 // insert UB which didn't previously exist.
7203 bool MustDropPoisonLocal = false;
7204 Instruction *PostIncV =
7205 cast<Instruction>(PN.getIncomingValueForBlock(LoopLatch));
7206 if (!mustExecuteUBIfPoisonOnPathTo(PostIncV, LoopLatch->getTerminator(),
7207 &DT)) {
7208 LLVM_DEBUG(dbgs() << "Can not prove poison safety to insert use"
7209 << PN << "\n");
7210
7211 // If this is a complex recurrence with multiple instructions computing
7212 // the backedge value, we might need to strip poison flags from all of
7213 // them.
7214 if (PostIncV->getOperand(0) != &PN)
7215 continue;
7216
7217 // In order to perform the transform, we need to drop the poison generating
7218 // flags on this instruction (if any).
7219 MustDropPoisonLocal = PostIncV->hasPoisonGeneratingFlags();
7220 }
7221
7222 // We pick the last legal alternate IV. We could explore choosing an optimal
7223 // alternate IV if we had a decent heuristic to do so.
7224 ToHelpFold = &PN;
7225 TermValueS = TermValueSLocal;
7226 MustDropPoison = MustDropPoisonLocal;
7227 }
7228
7229 LLVM_DEBUG(if (ToFold && !ToHelpFold) dbgs()
7230 << "Cannot find other AddRec IV to help folding\n";);
7231
7232 LLVM_DEBUG(if (ToFold && ToHelpFold) dbgs()
7233 << "\nFound loop that can fold terminating condition\n"
7234 << " BECount (SCEV): " << *SE.getBackedgeTakenCount(L) << "\n"
7235 << " TermCond: " << *TermCond << "\n"
7236 << " BranchInst: " << *BI << "\n"
7237 << " ToFold: " << *ToFold << "\n"
7238 << " ToHelpFold: " << *ToHelpFold << "\n");
7239
7240 if (!ToFold || !ToHelpFold)
7241 return std::nullopt;
7242 return std::make_tuple(ToFold, ToHelpFold, TermValueS, MustDropPoison);
7243}
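// Illustrative sketch (hypothetical IR) of the fold this enables: a latch
// exit such as
//   %i.next = add i64 %i, 1
//   %c = icmp ne i64 %i.next, %n
// can instead test an alternate IV, e.g.
//   %p.next = getelementptr i8, ptr %p, i64 4
//   %c = icmp eq ptr %p.next, %p.end
// where %p.end is the alternate IV's value on the exiting iteration,
// expanded once in the preheader; the now-unused primary IV is left for
// dead-code cleanup.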
7244
7245static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
7246 DominatorTree &DT, LoopInfo &LI,
7247 const TargetTransformInfo &TTI,
7248 AssumptionCache &AC, TargetLibraryInfo &TLI,
7249 MemorySSA *MSSA) {
7250
7251 // Debug preservation - before we start removing anything identify which DVI
7252 // meet the salvageable criteria and store their DIExpression and SCEVs.
7253 SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> SalvageableDVIRecords;
7254 SmallSet<AssertingVH<DbgValueInst>, 2> DVIHandles;
7255 DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles);
7256
7257 bool Changed = false;
7258 std::unique_ptr<MemorySSAUpdater> MSSAU;
7259 if (MSSA)
7260 MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
7261
7262 // Run the main LSR transformation.
7263 const LSRInstance &Reducer =
7264 LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get());
7265 Changed |= Reducer.getChanged();
7266
7267 // Remove any extra phis created by processing inner loops.
7268 Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7269 if (EnablePhiElim && L->isLoopSimplifyForm()) {
7270 SmallVector<WeakTrackingVH, 16> DeadInsts;
7271 const DataLayout &DL = L->getHeader()->getDataLayout();
7272 SCEVExpander Rewriter(SE, DL, "lsr", false);
7273#ifndef NDEBUG
7274 Rewriter.setDebugType(DEBUG_TYPE);
7275#endif
7276 unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
7277 Rewriter.clear();
7278 if (numFolded) {
7279 Changed = true;
7280 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7281 MSSAU.get());
7282 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7283 }
7284 }
7285 // LSR may at times remove all uses of an induction variable from a loop.
7286 // The only remaining use is the PHI in the exit block.
7287 // When this is the case, if the exit value of the IV can be calculated using
7288 // SCEV, we can replace the exit block PHI with the final value of the IV and
7289 // skip the updates in each loop iteration.
7290 if (L->isRecursivelyLCSSAForm(DT, LI) && L->getExitBlock()) {
7291 SmallVector<WeakTrackingVH, 16> DeadInsts;
7292 const DataLayout &DL = L->getHeader()->getDataLayout();
7293 SCEVExpander Rewriter(SE, DL, "lsr", true);
7294 int Rewrites = rewriteLoopExitValues(L, &LI, &TLI, &SE, &TTI, Rewriter, &DT,
7295 UnusedIndVarInLoop, DeadInsts);
7296 Rewriter.clear();
7297 if (Rewrites) {
7298 Changed = true;
7299 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
7300 MSSAU.get());
7301 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7302 }
7303 }
7304
7305 const bool EnableFormTerm = [&] {
7306 switch (AllowTerminatingConditionFoldingAfterLSR) {
7307 case cl::BOU_TRUE:
7308 return true;
7309 case cl::BOU_FALSE:
7310 return false;
7311 case cl::BOU_UNSET:
7313 }
7314 llvm_unreachable("Unhandled cl::boolOrDefault enum");
7315 }();
7316
7317 if (EnableFormTerm) {
7318 if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI, TTI)) {
7319 auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
7320
7321 Changed = true;
7322 NumTermFold++;
7323
7324 BasicBlock *LoopPreheader = L->getLoopPreheader();
7325 BasicBlock *LoopLatch = L->getLoopLatch();
7326
7327 (void)ToFold;
7328 LLVM_DEBUG(dbgs() << "To fold phi-node:\n"
7329 << *ToFold << "\n"
7330 << "New term-cond phi-node:\n"
7331 << *ToHelpFold << "\n");
7332
7333 Value *StartValue = ToHelpFold->getIncomingValueForBlock(LoopPreheader);
7334 (void)StartValue;
7335 Value *LoopValue = ToHelpFold->getIncomingValueForBlock(LoopLatch);
7336
7337 // See comment in canFoldTermCondOfLoop on why this is sufficient.
7338 if (MustDrop)
7339 cast<Instruction>(LoopValue)->dropPoisonGeneratingFlags();
7340
7341 // SCEVExpander for both use in preheader and latch
7342 const DataLayout &DL = L->getHeader()->getDataLayout();
7343 SCEVExpander Expander(SE, DL, "lsr_fold_term_cond");
7344
7345 assert(Expander.isSafeToExpand(TermValueS) &&
7346 "Terminating value was checked safe in canFoldTermCondOfLoop");
7347
7348 // Create new terminating value at loop preheader
7349 Value *TermValue = Expander.expandCodeFor(TermValueS, ToHelpFold->getType(),
7350 LoopPreheader->getTerminator());
7351
7352 LLVM_DEBUG(dbgs() << "Start value of new term-cond phi-node:\n"
7353 << *StartValue << "\n"
7354 << "Terminating value of new term-cond phi-node:\n"
7355 << *TermValue << "\n");
7356
7357 // Create new terminating condition at loop latch
7358 BranchInst *BI = cast<BranchInst>(LoopLatch->getTerminator());
7359 ICmpInst *OldTermCond = cast<ICmpInst>(BI->getCondition());
7360 IRBuilder<> LatchBuilder(LoopLatch->getTerminator());
7361 Value *NewTermCond =
7362 LatchBuilder.CreateICmp(CmpInst::ICMP_EQ, LoopValue, TermValue,
7363 "lsr_fold_term_cond.replaced_term_cond");
7364 // Swap successors to exit the loop body if the IV equals the new TermValue
7365 if (BI->getSuccessor(0) == L->getHeader())
7366 BI->swapSuccessors();
7367
7368 LLVM_DEBUG(dbgs() << "Old term-cond:\n"
7369 << *OldTermCond << "\n"
7370 << "New term-cond:\n" << *NewTermCond << "\n");
7371
7372 BI->setCondition(NewTermCond);
7373
7374 Expander.clear();
7375 OldTermCond->eraseFromParent();
7376 DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
7377 }
7378 }
7379
7380 if (SalvageableDVIRecords.empty())
7381 return Changed;
7382
7383 // Obtain relevant IVs and attempt to rewrite the salvageable DVIs with
7384 // expressions composed using the derived iteration count.
7385 // TODO: Allow for multiple IV references for nested AddRecSCEVs
7386 for (const auto &L : LI) {
7387 if (llvm::PHINode *IV = GetInductionVariable(*L, SE, Reducer))
7388 DbgRewriteSalvageableDVIs(L, SE, IV, SalvageableDVIRecords);
7389 else {
7390 LLVM_DEBUG(dbgs() << "scev-salvage: SCEV salvaging not possible. An IV "
7391 "could not be identified.\n");
7392 }
7393 }
7394
7395 for (auto &Rec : SalvageableDVIRecords)
7396 Rec->clear();
7397 SalvageableDVIRecords.clear();
7398 DVIHandles.clear();
7399 return Changed;
7400}
7401
7402bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
7403 if (skipLoop(L))
7404 return false;
7405
7406 auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
7407 auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
7408 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
7409 auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
7410 const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
7411 *L->getHeader()->getParent());
7412 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
7413 *L->getHeader()->getParent());
7414 auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
7415 *L->getHeader()->getParent());
7416 auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
7417 MemorySSA *MSSA = nullptr;
7418 if (MSSAAnalysis)
7419 MSSA = &MSSAAnalysis->getMSSA();
7420 return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
7421}
7422
7423PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
7424 LoopStandardAnalysisResults &AR,
7425 LPMUpdater &) {
7426 if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
7427 AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
7428 return PreservedAnalyses::all();
7429
7430 auto PA = getLoopPassPreservedAnalyses();
7431 if (AR.MSSA)
7432 PA.preserve<MemorySSAAnalysis>();
7433 return PA;
7434}
7435
7436char LoopStrengthReduce::ID = 0;
7437
7438INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
7439 "Loop Strength Reduction", false, false)
7445INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
7446INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
7447 "Loop Strength Reduction", false, false)
7448
7449Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
#define Success
for(const MachineOperand &MO :llvm::drop_begin(OldMI.operands(), Desc.getNumOperands()))
This file implements a class to represent arbitrary precision integral constant values and operations...
@ PostInc
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:148
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isCanonical(const MDString *S)
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file contains constants used for implementing Dwarf debug support.
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1294
bool End
Definition: ELF_riscv.cpp:480
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:512
This header provides classes for managing per-loop analyses.
static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr)
static cl::opt< bool > DropScaledForVScale("lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true), cl::desc("Avoid using scaled registers with vscale-relative addressing"))
static std::optional< std::tuple< PHINode *, PHINode *, const SCEV *, bool > > canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT, const LoopInfo &LI, const TargetTransformInfo &TTI)
static Value * getWideOperand(Value *Oper)
IVChain logic must consistently peek base TruncInst operands, so wrap it in a convenient helper.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE)
Return true if the given add can be sign-extended without changing its value.
static bool mayUsePostIncMode(const TargetTransformInfo &TTI, LSRUse &LU, const SCEV *S, const Loop *L, ScalarEvolution &SE)
Return true if the SCEV represents a value that may end up as a post-increment operation.
static void restorePreTransformState(DVIRecoveryRec &DVIRec)
Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a constant integer value, return that integer value,...
static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L)
static User::op_iterator findIVOperand(User::op_iterator OI, User::op_iterator OE, Loop *L, ScalarEvolution &SE)
Helper for CollectChains that finds an IV operand (computed by an AddRec in this loop) within [OI,...
static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale)
Test whether we know how to expand the current formula.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE)
Return true if the given mul can be sign-extended without changing its value.
static const unsigned MaxSCEVSalvageExpressionSize
Limit the size of expression that SCEV-based salvaging will attempt to translate into a DIExpression.
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if this AddRec is already a phi in its loop.
static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F, const Loop &L)
static cl::opt< bool > InsnsCost("lsr-insns-cost", cl::Hidden, cl::init(true), cl::desc("Add instruction count to a LSR cost model"))
static cl::opt< bool > StressIVChain("stress-ivchain", cl::Hidden, cl::init(false), cl::desc("Stress test LSR IV chains"))
static bool isAddressUse(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Returns true if the specified instruction is using the specified value as an address.
static GlobalValue * ExtractSymbol(const SCEV *&S, ScalarEvolution &SE)
If S involves the addition of a GlobalValue address, return that symbol, and mutate S to point to a n...
static void updateDVIWithLocation(T &DbgVal, Value *Location, SmallVectorImpl< uint64_t > &Ops)
Overwrites DVI with the location and Ops as the DIExpression.
static bool isLegalAddImmediate(const TargetTransformInfo &TTI, Immediate Offset)
static cl::opt< cl::boolOrDefault > AllowDropSolutionIfLessProfitable("lsr-drop-solution", cl::Hidden, cl::desc("Attempt to drop solution if it is less profitable"))
static cl::opt< bool > EnableVScaleImmediates("lsr-enable-vscale-immediates", cl::Hidden, cl::init(true), cl::desc("Enable analysis of vscale-relative immediates in LSR"))
static cl::opt< TTI::AddressingModeKind > PreferredAddresingMode("lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None), cl::desc("A flag that overrides the target's preferred addressing mode."), cl::values(clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"), clEnumValN(TTI::AMK_PreIndexed, "preindexed", "Prefer pre-indexed addressing mode"), clEnumValN(TTI::AMK_PostIndexed, "postindexed", "Prefer post-indexed addressing mode")))
static const SCEV * getExprBase(const SCEV *S)
Return an approximation of this SCEV expression's "base", or NULL for any constant.
static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg)
static llvm::PHINode * GetInductionVariable(const Loop &L, ScalarEvolution &SE, const LSRInstance &LSR)
Ideally pick the PHI IV inserted by ScalarEvolutionExpander.
static cl::opt< cl::boolOrDefault > AllowTerminatingConditionFoldingAfterLSR("lsr-term-fold", cl::Hidden, cl::desc("Attempt to replace primary IV with other IV."))
static bool IsSimplerBaseSCEVForTarget(const TargetTransformInfo &TTI, ScalarEvolution &SE, const SCEV *Best, const SCEV *Reg, MemAccessTy AccessType)
loop reduce
static const unsigned MaxIVUsers
MaxIVUsers is an arbitrary threshold that provides an early opportunity for bail out.
static bool isHighCostExpansion(const SCEV *S, SmallPtrSetImpl< const SCEV * > &Processed, ScalarEvolution &SE)
Check if expanding this expression is likely to incur significant cost.
static Value * getValueOrPoison(WeakVH &VH, LLVMContext &C)
Cached location ops may be erased during LSR, in which case a poison is required when restoring from ...
static MemAccessTy getAccessType(const TargetTransformInfo &TTI, Instruction *Inst, Value *OperandVal)
Return the type of the memory being accessed.
static unsigned numLLVMArgOps(SmallVectorImpl< uint64_t > &Expr)
Returns the total number of DW_OP_llvm_arg operands in the expression.
static void DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &DVIToUpdate)
Obtain an expression for the iteration count, then attempt to salvage the dbg.value intrinsics.
static cl::opt< bool > EnablePhiElim("enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination"))
static void DbgGatherSalvagableDVI(Loop *L, ScalarEvolution &SE, SmallVector< std::unique_ptr< DVIRecoveryRec >, 2 > &SalvageableDVISCEVs, SmallSet< AssertingVH< DbgValueInst >, 2 > &DVIHandles)
Identify and cache salvageable DVI locations and expressions along with the corresponding SCEV(s).
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE)
Return true if the given addrec can be sign-extended without changing its value.
static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup, const LSRUse &LU, Instruction *IVIncInsertPos, Loop *L)
static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl< const SCEV * > &Good, SmallVectorImpl< const SCEV * > &Bad, ScalarEvolution &SE)
Recursion helper for initialMatch.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F)
Check if the addressing mode defined by F is completely folded in LU at isel time.
static cl::opt< bool > LSRExpNarrow("lsr-exp-narrow", cl::Hidden, cl::init(false), cl::desc("Narrow LSR complex solution using" " expectation of registers number"))
static cl::opt< bool > FilterSameScaledReg("lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), cl::desc("Narrow LSR search space by filtering non-optimal formulae" " with the same ScaledReg and Scale"))
static void updateDVIWithLocations(T &DbgVal, SmallVectorImpl< Value * > &Locations, SmallVectorImpl< uint64_t > &Ops)
Overwrite DVI with locations placed into a DIArglist.
static cl::opt< unsigned > ComplexityLimit("lsr-complexity-limit", cl::Hidden, cl::init(std::numeric_limits< uint16_t >::max()), cl::desc("LSR search space complexity limit"))
static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, SmallVectorImpl< Value * > &NewLocationOps, SmallVectorImpl< uint64_t > &NewExpr)
Write the new expression and new location ops for the dbg.value.
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, TargetLibraryInfo &TLI, MemorySSA *MSSA)
static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl< Instruction * > &Users, ScalarEvolution &SE, const TargetTransformInfo &TTI)
Return true if the number of registers needed for the chain is estimated to be less than the number r...
static const SCEV * CollectSubexprs(const SCEV *S, const SCEVConstant *C, SmallVectorImpl< const SCEV * > &Ops, const Loop *L, ScalarEvolution &SE, unsigned Depth=0)
Split S into subexpressions which can be pulled out into separate registers.
static const SCEV * getExactSDiv(const SCEV *LHS, const SCEV *RHS, ScalarEvolution &SE, bool IgnoreSignificantBits=false)
Return an expression for LHS /s RHS, if it can be determined and if the remainder is known to be zero...
static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, Value *Operand, const TargetTransformInfo &TTI)
Return true if the IVInc can be folded into an addressing mode.
#define DEBUG_TYPE
static const SCEV * getAnyExtendConsideringPostIncUses(ArrayRef< PostIncLoopSet > Loops, const SCEV *Expr, Type *ToTy, ScalarEvolution &SE)
Extend/Truncate Expr to ToTy considering post-inc uses in Loops.
static unsigned getSetupCost(const SCEV *Reg, unsigned Depth)
static cl::opt< unsigned > SetupCostDepthLimit("lsr-setupcost-depth-limit", cl::Hidden, cl::init(7), cl::desc("The limit on recursion depth for LSRs setup cost"))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned Reg
This file exposes an interface to building/using memory SSA to walk memory instructions using a use/d...
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
#define P(N)
PowerPC TLS Dynamic Call Fixup
This header defines various interfaces for pass management in LLVM.
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file defines the PointerIntPair class.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
BinaryOperator * Mul
Class recording the (high level) value of a variable.
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1500
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:309
APInt sdiv(const APInt &RHS) const
Signed division function for APInt.
Definition: APInt.cpp:1614
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1491
APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition: APInt.cpp:1706
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
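As an illustration only (not code from this pass), the sketch below uses the APInt helpers listed above to perform an exact signed division; the helper name and parameters are hypothetical.
  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  // Hypothetical helper: signed divide Num by Den, succeeding only when the
  // signed remainder is zero.
  static bool divideExactly(int64_t Num, int64_t Den, int64_t &Quot) {
    if (Den == 0)
      return false;                               // srem/sdiv require a non-zero divisor
    APInt A(/*numBits=*/64, Num, /*isSigned=*/true);
    APInt B(/*numBits=*/64, Den, /*isSigned=*/true);
    if (!A.srem(B).isZero())                      // remainder must be known zero
      return false;
    Quot = A.sdiv(B).getSExtValue();              // signed division, sign-extended back
    return true;
  }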
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:405
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:283
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: Any.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Value handle that asserts if the Value is deleted.
Definition: ValueHandle.h:264
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:507
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:167
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:366
bool isLandingPad() const
Return true if this basic block is a landing pad.
Definition: BasicBlock.cpp:677
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:229
BinaryOps getOpcode() const
Definition: InstrTypes.h:442
static BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
Conditional or Unconditional Branch instruction.
void setCondition(Value *V)
void swapSuccessors()
Swap the successors of this branch instruction.
BasicBlock * getSuccessor(unsigned i) const
bool isUnconditional() const
Value * getCondition() const
static Instruction::CastOps getCastOpcode(const Value *Val, bool SrcIsSigned, Type *Ty, bool DstIsSigned)
Returns the opcode necessary to cast Val into Ty using usual casting rules.
static CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
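A minimal sketch of the two CastInst helpers above, shown for illustration only; the function name and signedness choices are assumptions, not taken from this pass.
  #include "llvm/IR/InstrTypes.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical: convert V to DestTy using whatever cast the usual casting
  // rules pick (trunc, sext, bitcast, ...), inserted before InsertPt.
  static Value *castTo(Value *V, Type *DestTy, Instruction *InsertPt) {
    if (V->getType() == DestTy)
      return V;                                    // nothing to do
    Instruction::CastOps Op =
        CastInst::getCastOpcode(V, /*SrcIsSigned=*/true, DestTy,
                                /*DstIsSigned=*/true);
    return CastInst::Create(Op, V, DestTy, V->getName() + ".cast", InsertPt);
  }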
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_EQ
equal
Definition: InstrTypes.h:778
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:871
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1575
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:155
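A small illustrative sketch (hypothetical helper, not from this pass) combining the ConstantInt members listed above: range-check a signed immediate before materializing it.
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;

  // Hypothetical: build an i32 constant for V only if V fits in 32 bits.
  static ConstantInt *makeImm32(LLVMContext &Ctx, int64_t V) {
    IntegerType *I32 = IntegerType::get(Ctx, 32);
    if (!ConstantInt::isValueValidForType(I32, V))  // signed range check
      return nullptr;
    return ConstantInt::getSigned(I32, V);
  }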
This is an important base class in LLVM.
Definition: Constant.h:42
static DIArgList * get(LLVMContext &Context, ArrayRef< ValueAsMetadata * > Args)
An iterator for expression operands.
DWARF expression.
static DIExpression * append(const DIExpression *Expr, ArrayRef< uint64_t > Ops)
Append the opcodes Ops to DIExpr.
static void appendOffset(SmallVectorImpl< uint64_t > &Ops, int64_t Offset)
Append Ops with operations to apply the Offset.
bool isComplex() const
Return whether the location is computed on the expression stack, meaning it cannot be a simple regist...
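For illustration only, a sketch of how the DIExpression::append and DIExpression::appendOffset helpers above compose; the helper name is hypothetical and the offset handling is an assumption, not this pass's salvage logic.
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/DebugInfoMetadata.h"
  using namespace llvm;

  // Hypothetical: rebase an existing debug expression by a constant byte offset.
  static DIExpression *addConstantOffset(const DIExpression *Old, int64_t Off) {
    SmallVector<uint64_t, 4> Ops;
    DIExpression::appendOffset(Ops, Off);   // emit the offset opcodes into Ops
    return DIExpression::append(Old, Ops);  // returns a new expression node
  }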
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
This represents the llvm.dbg.value instruction.
Record of a variable value-assignment, aka a non instruction representation of the dbg....
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
NodeT * getBlock() const
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
This instruction compares its operands according to the predicate given to the constructor.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2356
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
IVStrideUse - Keep track of one use of a strided induction variable.
Definition: IVUsers.h:35
void transformToPostInc(const Loop *L)
transformToPostInc - Transform the expression to post-inc form for the given loop.
Definition: IVUsers.cpp:367
Value * getOperandValToReplace() const
getOperandValToReplace - Return the Value of the operand in the user instruction that this IVStrideUs...
Definition: IVUsers.h:54
void setUser(Instruction *NewUser)
setUser - Assign a new user instruction for this use.
Definition: IVUsers.h:48
Analysis pass that exposes the IVUsers for a loop.
Definition: IVUsers.h:184
ilist< IVStrideUse >::const_iterator const_iterator
Definition: IVUsers.h:142
bool empty() const
Definition: IVUsers.h:147
void print(raw_ostream &OS) const
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
bool isEHPad() const
Return true if the instruction is a variety of EH-block.
Definition: Instruction.h:824
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
Type * getAccessType() const LLVM_READONLY
Return the type this instruction accesses in memory, if any.
bool hasPoisonGeneratingFlags() const LLVM_READONLY
Return true if this operator has flags which may cause this instruction to evaluate to poison despite...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
An instruction for reading from memory.
Definition: Instructions.h:174
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:598
virtual bool runOnLoop(Loop *L, LPPassManager &LPM)=0
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
An analysis that produces MemorySSA for a function.
Definition: MemorySSA.h:928
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:985
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:701
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
iterator_range< const_block_iterator > blocks() const
op_range incoming_values()
void setIncomingValue(unsigned i, Value *V)
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
Value * getIncomingValue(unsigned i) const
Return incoming value number i.
static unsigned getIncomingValueNumForOperand(unsigned i)
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
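As a minimal sketch (not code from this pass; block and value names are hypothetical), the PHINode members above can be combined to build a two-operand induction phi:
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical: create "iv = phi [Start, Preheader], [Next, Latch]" at the
  // top of Header.
  static PHINode *buildIVPhi(Type *Ty, Value *Start, Value *Next,
                             BasicBlock *Preheader, BasicBlock *Latch,
                             BasicBlock *Header) {
    PHINode *PN = PHINode::Create(Ty, /*NumReservedValues=*/2, "iv",
                                  Header->begin());
    PN->addIncoming(Start, Preheader);
    PN->addIncoming(Next, Latch);
    return PN;
  }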
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98
PointerIntPair - This class implements a pair of a pointer and small integer.
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
This node represents an addition of some number of SCEVs.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
const SCEVAddRecExpr * getPostIncExpr(ScalarEvolution &SE) const
Return an expression representing the value of this expression one iteration of the loop ahead.
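An illustrative sketch (hypothetical helper, assuming SE is a ScalarEvolution in scope) of the SCEVAddRecExpr queries listed above:
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  // Hypothetical: for an affine addrec {Start,+,Step}<L>, return its
  // post-increment form {Start+Step,+,Step}<L>; otherwise return null.
  static const SCEV *getPostIncIfAffine(const SCEV *S, ScalarEvolution &SE) {
    const auto *AR = dyn_cast<SCEVAddRecExpr>(S);
    if (!AR || !AR->isAffine())
      return nullptr;
    const SCEV *Step = AR->getStepRecurrence(SE); // loop-invariant stride
    (void)Step;
    return AR->getPostIncExpr(SE);
  }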
This is the base class for unary cast operator classes.
This node is the base class for n'ary commutative operators.
This class represents a constant integer value.
ConstantInt * getValue() const
const APInt & getAPInt() const
This class uses information from the scalar evolution analysis to rewrite expressions in canonical form.
bool isSafeToExpand(const SCEV *S) const
Return true if the given expression is safe to expand in the sense that all materialized values are s...
bool isHighCostExpansion(ArrayRef< const SCEV * > Exprs, Loop *L, unsigned Budget, const TargetTransformInfo *TTI, const Instruction *At)
Return true for expressions that can't be evaluated at runtime within given Budget.
void clear()
Erase the contents of the InsertedExpressions map so that users trying to expand the same expression ...
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
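A minimal sketch, for illustration only, of driving the SCEVExpander members above; the expander name string and the guard-then-expand pattern are assumptions, not this pass's rewriting logic.
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  using namespace llvm;

  // Hypothetical: expand S into IR of type Ty immediately before InsertPt.
  static Value *expandAt(const SCEV *S, Type *Ty, Instruction *InsertPt,
                         ScalarEvolution &SE, const DataLayout &DL) {
    SCEVExpander Rewriter(SE, DL, "example");
    if (!Rewriter.isSafeToExpand(S))       // refuse unsafe materialization
      return nullptr;
    return Rewriter.expandCodeFor(S, Ty, InsertPt->getIterator());
  }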
This is the base class for unary integral cast operator classes.
This node represents multiplication of some number of SCEVs.
This node is a base class providing common functionality for n'ary operators.
ArrayRef< const SCEV * > operands() const
This class represents a signed maximum selection.
This class represents a binary unsigned division operation.
This class represents an unsigned maximum selection.
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
ArrayRef< const SCEV * > operands() const
Return operands of this SCEV expression.
unsigned short getExpressionSize() const
bool isZero() const
Return true if the expression is a constant zero.
SCEVTypes getSCEVType() const
Type * getType() const
Return the LLVM type of this SCEV expression.
This class represents a cast from signed integer to floating point.
The main scalar evolution driver.
bool isKnownNonZero(const SCEV *S)
Test if the given expression is known to be non-zero.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getZero(Type *Ty)
Return a SCEV for the constant 0 of a specific type.
uint64_t getTypeSizeInBits(Type *Ty) const
Return the size in bits of the specified type, for which isSCEVable must return true.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getNoopOrSignExtend(const SCEV *V, Type *Ty)
Return a SCEV corresponding to a conversion of the input value to the specified type.
unsigned getSmallConstantMaxTripCount(const Loop *L)
Returns the upper bound of the loop trip count as a normal unsigned value.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getAddRecExpr(const SCEV *Start, const SCEV *Step, const Loop *L, SCEV::NoWrapFlags Flags)
Get an add recurrence expression for the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
Type * getEffectiveSCEVType(Type *Ty) const
Return a type with the same bitwidth as the given type and which represents how SCEV will treat the g...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
const SCEV * getAnyExtendExpr(const SCEV *Op, Type *Ty)
getAnyExtendExpr - Return a SCEV for the given operand extended with unspecified bits out to the give...
bool containsUndefs(const SCEV *S) const
Return true if the SCEV expression contains an undef value.
const SCEV * getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth=0)
const SCEV * getVScale(Type *Ty)
bool hasComputableLoopEvolution(const SCEV *S, const Loop *L)
Return true if the given SCEV changes value in a known way in the specified loop.
const SCEV * getPointerBase(const SCEV *V)
Transitively follow the chain of pointer-type operands until reaching a SCEV that does not have a sin...
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUnknown(Value *V)
std::optional< APInt > computeConstantDifference(const SCEV *LHS, const SCEV *RHS)
Compute LHS - RHS and returns the result as an APInt if it is a constant, and std::nullopt if it isn'...
bool properlyDominates(const SCEV *S, const BasicBlock *BB)
Return true if elements that makes up the given SCEV properly dominate the specified basic block.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool containsErasedValue(const SCEV *S) const
Return true if the SCEV expression contains a Value that has been optimised out and is now a nullptr.
LLVMContext & getContext() const
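As a sketch only (hypothetical helper; Ty, L, and Other are assumed inputs), several of the ScalarEvolution factory methods above can be combined to build the canonical affine recurrence and compare against it:
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  // Hypothetical: build {0,+,1}<L> for the given integer type and return
  // Other - {0,+,1}<L>.
  static const SCEV *diffFromCanonicalIV(ScalarEvolution &SE, IntegerType *Ty,
                                         const Loop *L, const SCEV *Other) {
    const SCEV *Start = SE.getZero(Ty);
    const SCEV *Step = SE.getConstant(ConstantInt::get(Ty, 1));
    const SCEV *IV = SE.getAddRecExpr(Start, Step, L, SCEV::FlagAnyWrap);
    return SE.getMinusSCEV(Other, IV);
  }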
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:113
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
iterator_range< const_set_bits_iterator > set_bits() const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
size_type size() const
Returns the number of bits in this bitvector.
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
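A short illustrative sketch (hypothetical function) showing the two equivalent ways, listed above, of walking the set bits of a SmallBitVector:
  #include "llvm/ADT/SmallBitVector.h"
  using namespace llvm;

  // Hypothetical: count set bits with both iteration styles; both walks visit
  // the same bits, so the counts match BV.count().
  static bool walksAgree(const SmallBitVector &BV) {
    unsigned A = 0, B = 0;
    for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
      ++A;                                 // explicit find_first/find_next walk
    for (unsigned I : BV.set_bits()) {
      (void)I;
      ++B;                                 // range-based walk over set bits
    }
    return A == B && A == BV.count();
  }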
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:323
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:412
iterator end() const
Definition: SmallPtrSet.h:437
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
iterator begin() const
Definition: SmallPtrSet.h:432
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
void clear()
Definition: SmallSet.h:218
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
typename SuperClass::iterator iterator
Definition: SmallVector.h:590
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
static StackOffset get(int64_t Fixed, int64_t Scalable)
Definition: TypeSize.h:44
An instruction for storing to memory.
Definition: Instructions.h:290
Provides information about what library functions are available for the current target.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
bool shouldDropLSRSolutionIfLessProfitable() const
Return true if LSR should drop a found solution if it's calculated to be less profitable than the bas...
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const
Return true if LSR cost of C1 is lower than C2.
bool shouldFoldTerminatingConditionAfterLSR() const
Return true if LSR should attempt to replace a use of an otherwise dead primary IV in the latch cond...
bool isProfitableLSRChainElement(Instruction *I) const
bool LSRWithInstrQueries() const
Return true if the loop strength reduce pass should make Instruction* based TTI queries to isLegalAdd...
bool isIndexedStoreLegal(enum MemIndexedMode Mode, Type *Ty) const
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0, Instruction *I=nullptr, int64_t ScalableOffset=0) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isIndexedLoadLegal(enum MemIndexedMode Mode, Type *Ty) const
bool isLegalICmpImmediate(int64_t Imm) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
bool isLegalAddImmediate(int64_t Imm) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const
Return true if the target can save a compare for loop count, for example hardware loop saves a compar...
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAddScalableImmediate(int64_t Imm) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isNumRegsMajorCostOfLSR() const
Return true if LSR major cost is number of registers.
@ MIM_PostInc
Post-incrementing.
bool canMacroFuseCmp() const
Return true if the target can fuse a compare and branch.
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace=0) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
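As a sketch of the TargetTransformInfo query surface above (not how this pass costs formulae), the following hypothetical helper asks whether a scaled addressing mode folds for a given access type; the scale of 4 and the helper name are assumptions.
  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // Hypothetical: is "base + 4*index + Offset" a legal addressing mode for a
  // load/store of AccessTy in the given address space?
  static bool scaledModeIsLegal(const TargetTransformInfo &TTI, Type *AccessTy,
                                int64_t Offset, unsigned AddrSpace) {
    return TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/nullptr,
                                     /*BaseOffset=*/Offset,
                                     /*HasBaseReg=*/true,
                                     /*Scale=*/4, AddrSpace);
  }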
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
int getFPMantissaWidth() const
Return the width of the mantissa of this type.
static IntegerType * getInt8Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
op_iterator op_end()
Definition: User.h:236
static ValueAsMetadata * get(Value *V)
Definition: Metadata.cpp:495
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
iterator_range< use_iterator > uses()
Definition: Value.h:376
A Value handle that may be null; it nulls itself out when the value it points to is deleted.
Definition: ValueHandle.h:144
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
@ DW_OP_LLVM_arg
Only used in LLVM metadata.
Definition: Dwarf.h:147
@ DW_OP_LLVM_convert
Only used in LLVM metadata.
Definition: Dwarf.h:143
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root, Instruction *OnPathTo, DominatorTree *DT)
Return true if undefined behavior would provably be executed on the path to OnPathTo if Root produced...
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition: DWP.cpp:480
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
Definition: LoopUtils.cpp:849
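A tiny illustrative sketch of the getLoopEstimatedTripCount helper above; the "short-running" threshold of 8 and the helper name are assumptions.
  #include "llvm/Transforms/Utils/LoopUtils.h"
  #include <optional>
  using namespace llvm;

  // Hypothetical: treat a loop as short-running if branch weights suggest at
  // most 8 iterations; with no metadata, conservatively say no.
  static bool looksShortRunning(Loop *L) {
    if (std::optional<unsigned> TC = getLoopEstimatedTripCount(L))
      return *TC <= 8;
    return false;
  }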
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool operator!=(uint64_t V1, const APInt &V2)
Definition: APInt.h:2062
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
char & LoopSimplifyID
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
bool matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO, Value *&Start, Value *&Step)
Attempt to match a simple first order recurrence cycle of the form: iv = phi Ty [Start,...
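A minimal sketch of calling matchSimpleRecurrence, for illustration only; restricting the match to additive recurrences is an assumption of the example, not of the API.
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical: recognize "iv = phi [Start, ...], [iv + Step, ...]" and pull
  // out the start and step values.
  static bool matchAdditiveIV(const PHINode *PN, Value *&Start, Value *&Step) {
    BinaryOperator *BO = nullptr;
    if (!matchSimpleRecurrence(PN, BO, Start, Step))
      return false;
    return BO->getOpcode() == Instruction::Add;  // accept only additive IVs
  }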
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr)
Examine each PHI in the given block and delete it if it is dead.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
const SCEV * denormalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE)
Denormalize S to be post-increment for all loops present in Loops.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
cl::opt< unsigned > SCEVCheapExpansionBudget
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
void SplitLandingPadPredecessors(BasicBlock *OrigBB, ArrayRef< BasicBlock * > Preds, const char *Suffix, const char *Suffix2, SmallVectorImpl< BasicBlock * > &NewBBs, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, bool PreserveLCSSA=false)
This method transforms the landing pad, OrigBB, by introducing two new basic blocks into the function...
const SCEV * normalizeForPostIncUse(const SCEV *S, const PostIncLoopSet &Loops, ScalarEvolution &SE, bool CheckInvertible=true)
Normalize S to be post-increment for all loops present in Loops.
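As a sketch (hypothetical helper) of the normalize/denormalize pair documented above, round-tripping an expression through post-increment normalization for one loop; the null check reflects that normalization can fail when the transform is not invertible.
  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionNormalization.h"
  using namespace llvm;

  // Hypothetical: normalize S for loop L and then map it back.
  static const SCEV *roundTripPostInc(const SCEV *S, const Loop *L,
                                      ScalarEvolution &SE) {
    PostIncLoopSet Loops;
    Loops.insert(L);
    const SCEV *Norm = normalizeForPostIncUse(S, Loops, SE);
    if (!Norm)
      return nullptr;                    // not invertible for this loop set
    return denormalizeForPostIncUse(Norm, Loops, SE);
  }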
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Add
Sum of integers.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
Pass * createLoopStrengthReducePass()
BasicBlock * SplitCriticalEdge(Instruction *TI, unsigned SuccNum, const CriticalEdgeSplittingOptions &Options=CriticalEdgeSplittingOptions(), const Twine &BBName="")
If this edge is a critical edge, insert a new node to split the critical edge.
bool RecursivelyDeleteTriviallyDeadInstructionsPermissive(SmallVectorImpl< WeakTrackingVH > &DeadInsts, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
Same functionality as RecursivelyDeleteTriviallyDeadInstructions, but allow instructions that are not...
Definition: Local.cpp:555
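For illustration only, a hypothetical cleanup helper built on the permissive deletion routine above: queue values that may have become dead and let the utility delete whatever is actually trivially dead.
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/ValueHandle.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;

  // Hypothetical: returns true if anything was deleted.
  static bool cleanUpDead(ArrayRef<Value *> MaybeDead) {
    SmallVector<WeakTrackingVH, 16> DeadInsts(MaybeDead.begin(),
                                              MaybeDead.end());
    return RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts);
  }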
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:77
void initializeLoopStrengthReducePass(PassRegistry &)
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isAlmostDeadIV(PHINode *IV, BasicBlock *LatchBlock, Value *Cond)
Return true if the induction variable IV in a Loop whose latch is LatchBlock would become dead if the...
Definition: LoopUtils.cpp:469
int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, ScalarEvolution *SE, const TargetTransformInfo *TTI, SCEVExpander &Rewriter, DominatorTree *DT, ReplaceExitVal ReplaceExitValue, SmallVector< WeakTrackingVH, 16 > &DeadInsts)
If the final value of any expressions that are recurrent in the loop can be computed,...
Definition: LoopUtils.cpp:1489
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
@ UnusedIndVarInLoop
Definition: LoopUtils.h:472
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:471
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
Option class for critical edge splitting.
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Information about a load/store intrinsic defined by the target.
Value * PtrVal
This is the pointer that the intrinsic is loading from or storing to.